From 69e1eed85dcadb7094fa125f8c493db41b5381f0 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 24 May 2024 12:12:34 +0200
Subject: [PATCH 001/448] add Windows GitLab job

---
 .gitlab-ci.yml | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f3cecee4b71..09ec11dab68 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -622,6 +622,41 @@ build/icpx/igpu/release/static:
     ONEAPI_DEVICE_SELECTOR: "*:gpu"
     BUILD_HWLOC: "OFF"
 
+# windows jobs: Release shared
+# Note that this is using Powershell, not bash
+build/windows/release/shared:
+  stage: build
+  script:
+    - if (Test-Path build) { rm -r -fo build }
+    - if (Test-Path install) { rm -r -fo install }
+    - mkdir build
+    - mkdir install
+    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
+    - cmake --build build --config Release -j16
+    - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16
+    - $env:PATH+=";$pwd/install/bin"
+    - cmake --install build --config Release
+    - cmake --build build --target test_install --config Release
+  tags:
+    - windows
+
+# CUDA
+build/windows-cuda/release/shared:
+  stage: build
+  script:
+    - if (Test-Path build) { rm -r -fo build }
+    - if (Test-Path install) { rm -r -fo install }
+    - mkdir build
+    - mkdir install
+    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
+    - cmake --build build --config Release -j16
+    - ctest --test-dir build -C Release --no-tests=error --output-on-failure
+    - $env:PATH+=";$pwd/install/bin"
+    - cmake --install build --config Release
+    - cmake --build build --target test_install --config Release
+  tags:
+    - windows-cuda
+
 # Job with important warnings as error
 warnings:
   stage: code_quality

From 68048dad25690edde3e490f1917b1c9d6ca9dcd3 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 26 Jun 2024 18:21:11 +0200
Subject: [PATCH 002/448] disable GitHub Actions for Windows

---
 .github/workflows/windows-msvc-cuda.yml | 1 +
 .github/workflows/windows-msvc-ref.yml  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml
index b1df1aaf4ed..efa637b2bf9 100644
--- a/.github/workflows/windows-msvc-cuda.yml
+++ b/.github/workflows/windows-msvc-cuda.yml
@@ -23,6 +23,7 @@ concurrency:
 
 jobs:
   windows_cuda:
+    if: ${{ false }}
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml
index 117262b2016..60a811bb99b 100644
--- a/.github/workflows/windows-msvc-ref.yml
+++ b/.github/workflows/windows-msvc-ref.yml
@@ -23,6 +23,7 @@ concurrency:
 
 jobs:
   windows_ref:
+    if: ${{ false }}
     strategy:
       fail-fast: false
       matrix:

From 84f34e630c3ad7f07ff220eb4e83542c47cdbdfb Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 26 Jun 2024 18:37:51 +0200
Subject: [PATCH 003/448] remove duplicate job runs

---
 .gitlab-ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 09ec11dab68..4a7860d263a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -625,6 +625,8 @@ build/icpx/igpu/release/static:
 # windows jobs: Release shared
 # Note that this is using Powershell, not bash
 build/windows/release/shared:
+  extends:
+    - .quick_test_condition
   stage: build
   script:
     - if (Test-Path build) { rm -r -fo build }
@@ -642,6 +644,8 @@ build/windows/release/shared:
 
 # CUDA
 build/windows-cuda/release/shared:
+  extends:
+    - .quick_test_condition
   stage: build
   script:
     - if (Test-Path build) { rm -r -fo build }

From 1a2ee540e707ebd9a2c6e8678f4f29fe7d4b0080 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 26 Jun 2024 18:53:52 +0200
Subject: [PATCH 004/448] work around Intel timer issues

---
 test/base/timer.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/base/timer.cpp b/test/base/timer.cpp
index 551e8b4b955..a817ddeef96 100644
--- a/test/base/timer.cpp
+++ b/test/base/timer.cpp
@@ -41,9 +41,15 @@ TEST_F(Timer, WorksAsync)
     auto timer = gko::Timer::create_for_executor(this->exec);
     auto start = timer->create_time_point();
     auto stop = timer->create_time_point();
+    gko::array<int> dummy{this->exec, {0}};
+    auto dummy2 = dummy;
+    this->exec->synchronize();
+    // we do some minimal work to work around Intel GPU timers running backwards
 
     timer->record(start);
+    dummy = dummy2;
     std::this_thread::sleep_for(std::chrono::seconds{5});
+    dummy = dummy2;
     timer->record(stop);
     timer->wait(stop);
 
@@ -56,9 +62,15 @@ TEST_F(Timer, Works)
     auto timer = gko::Timer::create_for_executor(this->exec);
     auto start = timer->create_time_point();
     auto stop = timer->create_time_point();
+    gko::array<int> dummy{this->exec, {0}};
+    auto dummy2 = dummy;
+    this->exec->synchronize();
+    // we do some minimal work to work around Intel GPU timers running backwards
 
     timer->record(start);
+    dummy = dummy2;
     std::this_thread::sleep_for(std::chrono::seconds{5});
+    dummy = dummy2;
     timer->record(stop);
 
     ASSERT_GT(timer->difference(start, stop), std::chrono::seconds{1});

From afaaf9b3d41109e44ac928e64a72849e26ea141b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 27 Jun 2024 16:10:46 +0200
Subject: [PATCH 005/448] disable tests, run CUDA first

---
 .gitlab-ci.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4a7860d263a..1866f16406a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -622,9 +622,9 @@ build/icpx/igpu/release/static:
     ONEAPI_DEVICE_SELECTOR: "*:gpu"
     BUILD_HWLOC: "OFF"
 
-# windows jobs: Release shared
+# windows jobs
 # Note that this is using Powershell, not bash
-build/windows/release/shared:
+build/windows-cuda/release/shared:
   extends:
     - .quick_test_condition
   stage: build
@@ -633,17 +633,17 @@ build/windows/release/shared:
     - if (Test-Path install) { rm -r -fo install }
     - mkdir build
     - mkdir install
-    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
+    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
     - cmake --build build --config Release -j16
-    - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16
+# we disable these tests until the triangular solver issues are resolved
+#   - ctest --test-dir build -C Release --no-tests=error --output-on-failure
     - $env:PATH+=";$pwd/install/bin"
     - cmake --install build --config Release
     - cmake --build build --target test_install --config Release
   tags:
-    - windows
+    - windows-cuda
 
-# CUDA
-build/windows-cuda/release/shared:
+build/windows/release/shared:
   extends:
     - .quick_test_condition
   stage: build
@@ -652,14 +652,14 @@ build/windows-cuda/release/shared:
     - if (Test-Path install) { rm -r -fo install }
     - mkdir build
     - mkdir install
-    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
+    - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
     - cmake --build build --config Release -j16
-    - ctest --test-dir build -C Release --no-tests=error --output-on-failure
+    - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16
     - $env:PATH+=";$pwd/install/bin"
     - cmake --install build --config Release
     - cmake --build build --target test_install --config Release
   tags:
-    - windows-cuda
+    - windows
 
 # Job with important warnings as error
 warnings:

From 412756ae5e6ebd76aa7dd803c6a5aaa01e280c9e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 21 May 2024 17:01:37 +0200
Subject: [PATCH 006/448] replace EXEC_NAMESPACE by GKO_DEVICE_NAMESPACE

---
 test/base/batch_multi_vector_kernels.cpp      |  8 ++--
 test/base/executor.cpp                        |  4 +-
 test/base/index_range.cpp                     |  2 +-
 test/base/kernel_launch_generic.cpp           | 36 ++++++++---------
 test/components/absolute_array_kernels.cpp    |  8 ++--
 test/components/fill_array_kernels.cpp        |  4 +-
 test/components/format_conversion_kernels.cpp | 10 ++---
 test/components/prefix_sum_kernels.cpp        |  8 ++--
 test/components/reduce_array_kernels.cpp      |  2 +-
 test/distributed/index_map_kernels.cpp        | 16 ++++----
 test/distributed/matrix_kernels.cpp           |  2 +-
 test/distributed/partition_helper_kernels.cpp | 22 +++++-----
 test/distributed/vector_kernels.cpp           |  2 +-
 test/factorization/cholesky_kernels.cpp       | 16 ++++----
 test/factorization/lu_kernels.cpp             |  8 ++--
 test/factorization/par_ic_kernels.cpp         |  4 +-
 test/factorization/par_ict_kernels.cpp        |  8 ++--
 test/factorization/par_ilu_kernels.cpp        | 14 +++----
 test/factorization/par_ilut_kernels.cpp       | 23 ++++++-----
 test/matrix/csr_kernels.cpp                   |  6 +--
 test/matrix/csr_kernels2.cpp                  | 15 +++----
 test/matrix/dense_kernels.cpp                 | 10 ++---
 test/matrix/ell_kernels.cpp                   |  2 +-
 test/matrix/sparsity_csr_kernels.cpp          |  6 +--
 test/multigrid/pgm_kernels.cpp                | 21 +++++-----
 test/preconditioner/batch_jacobi_kernels.cpp  |  2 +-
 test/preconditioner/isai_kernels.cpp          | 40 +++++++++----------
 test/solver/batch_bicgstab_kernels.cpp        |  2 +-
 test/solver/batch_cg_kernels.cpp              |  2 +-
 test/solver/bicg_kernels.cpp                  |  6 +--
 test/solver/bicgstab_kernels.cpp              |  8 ++--
 test/solver/cb_gmres_kernels.cpp              |  8 ++--
 test/solver/cg_kernels.cpp                    | 14 +++----
 test/solver/cgs_kernels.cpp                   |  8 ++--
 test/solver/fcg_kernels.cpp                   | 10 ++---
 test/solver/gcr_kernels.cpp                   |  6 +--
 test/solver/gmres_kernels.cpp                 | 12 +++---
 test/solver/idr_kernels.cpp                   | 14 +++----
 test/solver/ir_kernels.cpp                    |  2 +-
 test/solver/multigrid_kernels.cpp             |  8 ++--
 40 files changed, 201 insertions(+), 198 deletions(-)

diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp
index ab15e1a99a3..07749d9bed2 100644
--- a/test/base/batch_multi_vector_kernels.cpp
+++ b/test/base/batch_multi_vector_kernels.cpp
@@ -312,8 +312,8 @@ TEST_F(MultiVector, CopySingleIsEquivalentToRef)
 
     gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
                                                       y.get());
-    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
-                                                           dy.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy(
+        this->exec, dx.get(), dy.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
 }
@@ -325,8 +325,8 @@ TEST_F(MultiVector, CopyIsEquivalentToRef)
 
     gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
                                                       y.get());
-    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
-                                                           dy.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy(
+        this->exec, dx.get(), dy.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
 }
diff --git a/test/base/executor.cpp b/test/base/executor.cpp
index 8ea3b01fb24..541360d01d4 100644
--- a/test/base/executor.cpp
+++ b/test/base/executor.cpp
@@ -72,7 +72,7 @@ TEST_F(Executor, RunsCorrectOperation)
 
     exec->run(ExampleOperation(value));
 
-    ASSERT_EQ(EXEC_NAMESPACE::value, value);
+    ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value);
 }
 
 
@@ -104,7 +104,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation)
 
     exec->run(omp_lambda, cuda_lambda, hip_lambda, dpcpp_lambda);
 
-    ASSERT_EQ(EXEC_NAMESPACE::value, value);
+    ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value);
 }
 
 
diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp
index 044202fd8e2..b16b5fb9046 100644
--- a/test/base/index_range.cpp
+++ b/test/base/index_range.cpp
@@ -30,7 +30,7 @@ class IndexRange : public CommonTestFixture {
 void run_range_for(std::shared_ptr<gko::EXEC_TYPE> exec,
                    gko::array<int>& result_array)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto result, auto size) {
             for (auto i : gko::irange<int>{size}) {
diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp
index 55e1268a77a..c746a5b3461 100644
--- a/test/base/kernel_launch_generic.cpp
+++ b/test/base/kernel_launch_generic.cpp
@@ -46,7 +46,7 @@ move_only_type move_only_val{};
 
 namespace gko {
 namespace kernels {
-namespace EXEC_NAMESPACE {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 template <>
@@ -57,7 +57,7 @@ struct to_device_type_impl<move_only_type&> {
 };
 
 
-}  // namespace EXEC_NAMESPACE
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
@@ -108,7 +108,7 @@ class KernelLaunch : public CommonTestFixture {
 // nvcc doesn't like device lambdas declared in complex classes, move it out
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, size_type dim, int* data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -129,7 +129,7 @@ TEST_F(KernelLaunch, Runs1D)
 
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, gko::array<int>& data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -155,7 +155,7 @@ TEST_F(KernelLaunch, Runs1DArray)
 
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -193,7 +193,7 @@ TEST_F(KernelLaunch, Runs1DDense)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, int* data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -215,7 +215,7 @@ TEST_F(KernelLaunch, Runs2D)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, gko::array<int>& data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -242,7 +242,7 @@ TEST_F(KernelLaunch, Runs2DArray)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m1, Mtx* m2, Mtx* m3)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel_solver(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_solver(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr, auto dummy) {
@@ -280,8 +280,8 @@ void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m1, Mtx* m2, Mtx* m3)
         },
         dim<2>{4, 4}, m2->get_stride(), m1, static_cast<const Mtx*>(m1),
         m1->get_const_values(),
-        gko::kernels::EXEC_NAMESPACE::default_stride(m2),
-        gko::kernels::EXEC_NAMESPACE::row_vector(m3), m2->get_values(),
+        gko::kernels::GKO_DEVICE_NAMESPACE::default_stride(m2),
+        gko::kernels::GKO_DEVICE_NAMESPACE::row_vector(m3), m2->get_values(),
         m3->get_values(), move_only_val);
 }
 
@@ -297,7 +297,7 @@ void run1d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     gko::array<int64> output{exec, {-1l}};
     auto run_reduction = [&](int64 init, size_type size) {
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction(
             exec,
             [] GKO_KERNEL(auto i, auto a, auto dummy) {
                 static_assert(is_same<decltype(i), int64>::value, "index");
@@ -343,7 +343,7 @@ void run1d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
     gko::array<char> temp(exec);
     for (const auto& size : sizes) {
         temp.clear();
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached(
             exec, [] GKO_KERNEL(auto i) { return i + 1; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -366,7 +366,7 @@ void run2d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     gko::array<int64> output{exec, {-1l}};
     auto run_reduction = [&](int64 init, gko::dim<2> size) {
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction(
             exec,
             [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                 static_assert(is_same<decltype(i), int64>::value, "index");
@@ -435,7 +435,7 @@ void run2d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
     gko::array<char> temp(exec);
     for (const auto& dim : dims) {
         temp.clear();
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -482,7 +482,7 @@ void run2d_row_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
                     static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
             }
 
-            gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction(
+            gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction(
                 exec,
                 [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                     static_assert(is_same<decltype(i), int64>::value, "index");
@@ -527,7 +527,7 @@ void run2d_row_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
             host_ref.get_data()[i] = dim[1] + i + 1;
         }
 
-        gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -576,7 +576,7 @@ void run2d_col_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
                     static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1);
             }
 
-            gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction(
+            gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction(
                 exec,
                 [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                     static_assert(is_same<decltype(i), int64>::value, "index");
@@ -620,7 +620,7 @@ void run2d_col_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
             host_ref.get_data()[i] = dim[0] + i + 1;
         }
 
-        gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp
index 6e00ad6e185..08dd52f35e3 100644
--- a/test/components/absolute_array_kernels.cpp
+++ b/test/components/absolute_array_kernels.cpp
@@ -46,7 +46,7 @@ class AbsoluteArray : public CommonTestFixture {
 
 TEST_F(AbsoluteArray, InplaceEqualsReference)
 {
-    gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array(
         exec, dvals.get_data(), total_size);
     gko::kernels::reference::components::inplace_absolute_array(
         ref, vals.get_data(), total_size);
@@ -57,7 +57,7 @@ TEST_F(AbsoluteArray, InplaceEqualsReference)
 
 TEST_F(AbsoluteArray, InplaceComplexEqualsReference)
 {
-    gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array(
         exec, dcomplex_vals.get_data(), total_size);
     gko::kernels::reference::components::inplace_absolute_array(
         ref, complex_vals.get_data(), total_size);
@@ -71,7 +71,7 @@ TEST_F(AbsoluteArray, OutplaceEqualsReference)
     gko::array<value_type> abs_vals(ref, total_size);
     gko::array<value_type> dabs_vals(exec, total_size);
 
-    gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array(
         exec, dvals.get_const_data(), total_size, dabs_vals.get_data());
     gko::kernels::reference::components::outplace_absolute_array(
         ref, vals.get_const_data(), total_size, abs_vals.get_data());
@@ -85,7 +85,7 @@ TEST_F(AbsoluteArray, OutplaceComplexEqualsReference)
     gko::array<value_type> abs_vals(ref, total_size);
     gko::array<value_type> dabs_vals(exec, total_size);
 
-    gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array(
         exec, dcomplex_vals.get_const_data(), total_size, dabs_vals.get_data());
     gko::kernels::reference::components::outplace_absolute_array(
         ref, complex_vals.get_const_data(), total_size, abs_vals.get_data());
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 9ccf63e5c88..3997c5830ea 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -47,7 +47,7 @@ TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes,
 TYPED_TEST(FillArray, EqualsReference)
 {
     using T = typename TestFixture::value_type;
-    gko::kernels::EXEC_NAMESPACE::components::fill_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
         this->exec, this->dvals.get_data(), this->total_size, T(1523));
 
     GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals);
@@ -57,7 +57,7 @@ TYPED_TEST(FillArray, EqualsReference)
 TYPED_TEST(FillArray, FillSeqEqualsReference)
 {
     using T = typename TestFixture::value_type;
-    gko::kernels::EXEC_NAMESPACE::components::fill_seq_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_seq_array(
         this->exec, this->dvals.get_data(), this->total_size);
 
     GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals);
diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp
index fee77ea5986..053171ffbe2 100644
--- a/test/components/format_conversion_kernels.cpp
+++ b/test/components/format_conversion_kernels.cpp
@@ -63,7 +63,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs)
     ptrs.fill(0);
     TypeParam* output = nullptr;
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs(
         this->exec, ptrs.get_const_data(), this->size, output);
 
     // mustn't segfault
@@ -75,7 +75,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToIdxs)
     auto ref_idxs = this->idxs;
     this->idxs.fill(-1);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs(
         this->exec, this->ptrs.get_const_data(), this->size,
         this->idxs.get_data());
 
@@ -90,7 +90,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs)
     this->ptrs.fill(-1);
     TypeParam* input = nullptr;
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs(
         this->exec, input, 0, this->size, this->ptrs.get_data());
 
     GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs);
@@ -102,7 +102,7 @@ TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef)
     auto ref_ptrs = this->ptrs;
     this->ptrs.fill(-1);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs(
         this->exec, this->idxs.get_const_data(), this->idxs.get_size(),
         this->size, this->ptrs.get_data());
 
@@ -115,7 +115,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef)
     auto ref_sizes = this->sizes;
     this->sizes.fill(12345);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_sizes(
         this->exec, this->ptrs.get_const_data(), this->size,
         this->sizes.get_data());
 
diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp
index cf1777bb6ae..73cb0c7874e 100644
--- a/test/components/prefix_sum_kernels.cpp
+++ b/test/components/prefix_sum_kernels.cpp
@@ -57,7 +57,7 @@ TYPED_TEST(PrefixSum, EqualsReference)
         SCOPED_TRACE(size);
         gko::kernels::reference::components::prefix_sum_nonnegative(
             this->ref, this->vals.get_data(), size);
-        gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
             this->exec, this->dvals.get_data(), size);
 
         GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals);
@@ -74,7 +74,7 @@ TYPED_TEST(PrefixSum, WorksCloseToOverflow)
                      std::is_unsigned<TypeParam>::value;
     gko::array<TypeParam> data{this->exec, I<TypeParam>({max - 1, 1, 0})};
 
-    gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
         this->exec, data.get_data(), data.get_size());
 
     GKO_ASSERT_ARRAY_EQ(data, I<TypeParam>({0, max - 1, max}));
@@ -86,7 +86,7 @@ TYPED_TEST(PrefixSum, DoesntOverflowFromLastElement)
     const auto max = std::numeric_limits<TypeParam>::max();
     gko::array<TypeParam> data{this->exec, I<TypeParam>({2, max - 1})};
 
-    gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
         this->exec, data.get_data(), data.get_size());
 
     GKO_ASSERT_ARRAY_EQ(data, I<TypeParam>({0, 2}));
@@ -103,7 +103,7 @@ TYPED_TEST(PrefixSum, ThrowsOnOverflow)
                                {max / 3, max / 2, max / 4, max / 3, max / 4}};
 
     ASSERT_THROW(
-        gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
             this->exec, data.get_data(), data.get_size()),
         gko::OverflowError);
 }
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index cd6c2a8d7bf..dfc2e046c84 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -50,7 +50,7 @@ TYPED_TEST(ReduceArray, EqualsReference)
 {
     gko::kernels::reference::components::reduce_add_array(this->ref, this->vals,
                                                           this->out);
-    gko::kernels::EXEC_NAMESPACE::components::reduce_add_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::reduce_add_array(
         this->exec, this->dvals, this->dout);
 
     GKO_ASSERT_ARRAY_EQ(this->out, this->dout);
diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp
index 458ca594a56..cafd7b4da35 100644
--- a/test/distributed/index_map_kernels.cpp
+++ b/test/distributed/index_map_kernels.cpp
@@ -97,7 +97,7 @@ TEST_F(IndexMapBuildMapping, BuildMappingSameAsRef)
     gko::kernels::reference::index_map::build_mapping(
         ref, part.get(), query, target_ids, remote_local_idxs,
         remote_global_idxs, remote_sizes);
-    gko::kernels::EXEC_NAMESPACE::index_map::build_mapping(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping(
         exec, dpart.get(), dquery, dtarget_ids, dremote_local_idxs,
         dremote_global_idxs, dremote_sizes);
 
@@ -136,7 +136,7 @@ class IndexMap : public CommonTestFixture {
         gko::kernels::reference::index_map::build_mapping(
             ref, part.get(), connections, target_ids, flat_remote_local_idxs,
             flat_remote_global_idxs, remote_sizes);
-        gko::kernels::EXEC_NAMESPACE::index_map::build_mapping(
+        gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping(
             exec, dpart.get(), dconnections, dtarget_ids,
             dflat_remote_local_idxs, dflat_remote_global_idxs, dremote_sizes);
 
@@ -247,7 +247,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::local,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery, gko::experimental::distributed::index_space::local,
         dresult);
@@ -275,7 +275,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::local,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery, gko::experimental::distributed::index_space::local,
         dresult);
@@ -304,7 +304,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
@@ -330,7 +330,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
@@ -355,7 +355,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::combined,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::combined, dresult);
@@ -385,7 +385,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 5e3677db2f4..8445aee6a0e 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -72,7 +72,7 @@ class Matrix : public CommonTestFixture {
                     ref, input, row_partition.get(), col_partition.get(), part,
                     local_row_idxs, local_col_idxs, local_values,
                     non_local_row_idxs, non_local_col_idxs, non_local_values);
-            gko::kernels::EXEC_NAMESPACE::distributed_matrix::
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
                 separate_local_nonlocal(
                     exec, d_input, d_row_partition.get(), d_col_partition.get(),
                     part, d_local_row_idxs, d_local_col_idxs, d_local_values,
diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp
index 8121a720908..9e985ffec9e 100644
--- a/test/distributed/partition_helper_kernels.cpp
+++ b/test/distributed/partition_helper_kernels.cpp
@@ -147,8 +147,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges)
     auto offsets = make_array(this->exec, create_ranges<index_type>(100));
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, offsets, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, offsets, result);
 
     ASSERT_TRUE(result);
 }
@@ -163,8 +163,8 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges)
         make_array(this->exec, remove_indices(full_range_ends, removal_idxs));
     bool result = true;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_FALSE(result);
 }
@@ -176,8 +176,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange)
     auto start_ends = make_array(this->ref, create_ranges<index_type>(1));
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_TRUE(result);
 }
@@ -189,8 +189,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement)
     auto start_ends = gko::array<index_type>(this->exec, {1});
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_TRUE(result);
 }
@@ -206,7 +206,7 @@ TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges)
     auto expected_start_ends = start_ends;
     auto expected_part_ids = part_ids_arr;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start(
         this->exec, start_ends, part_ids_arr);
 
     GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
@@ -227,7 +227,7 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges)
     auto part_ids_arr = gko::array<comm_index_type>(
         this->exec, shuffled.second.begin(), shuffled.second.end());
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start(
         this->exec, start_ends, part_ids_arr);
 
     GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
@@ -242,7 +242,7 @@ TYPED_TEST(PartitionHelpers, CanCompressRanges)
     auto ranges = make_array(this->exec, create_ranges(expected_offsets));
     gko::array<index_type> offsets{this->exec, expected_offsets.size()};
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::compress_ranges(
         this->exec, ranges, offsets);
 
     GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets));
diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp
index e8e3d6a7e7b..86faca6b2b2 100644
--- a/test/distributed/vector_kernels.cpp
+++ b/test/distributed/vector_kernels.cpp
@@ -61,7 +61,7 @@ class Vector : public CommonTestFixture {
 
             gko::kernels::reference::distributed_vector::build_local(
                 ref, input, partition.get(), part, output.get());
-            gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local(
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_vector::build_local(
                 exec, d_input, d_partition.get(), part, d_output.get());
 
             GKO_ASSERT_MTX_NEAR(output, d_output, 0);
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index 82c59477fd8..c1d0a6c7336 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -150,7 +150,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount)
 
         gko::kernels::reference::cholesky::symbolic_count(
             this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count(
             this->exec, dmtx.get(), *dforest, drow_nnz.get_data(), this->dtmp);
 
         GKO_ASSERT_ARRAY_EQ(drow_nnz, row_nnz);
@@ -189,12 +189,12 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize)
         std::unique_ptr<elimination_forest> dforest;
         gko::factorization::compute_elim_forest(dmtx.get(), dforest);
         gko::array<index_type> dtmp_ptrs{this->exec, num_rows + 1};
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count(
             this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp);
 
         gko::kernels::reference::cholesky::symbolic_factorize(
             this->ref, mtx.get(), *forest, l_factor.get(), this->tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_factorize(
             this->exec, dmtx.get(), *dforest, dl_factor.get(), this->dtmp);
 
         GKO_ASSERT_MTX_EQ_SPARSITY(dl_factor, l_factor);
@@ -239,7 +239,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks)
         elimination_forest dforest{this->exec,
                                    static_cast<index_type>(mtx->get_size()[0])};
 
-        gko::kernels::EXEC_NAMESPACE::cholesky::forest_from_factor(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor(
             this->exec, dfactors.get(), dforest);
 
         this->assert_equal_forests(*forest, dforest);
@@ -367,7 +367,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef)
     this->forall_matrices([this] {
         const auto nnz = this->mtx_chol->get_num_stored_elements();
         std::fill_n(this->mtx_chol->get_values(), nnz, gko::zero<value_type>());
-        gko::kernels::EXEC_NAMESPACE::components::fill_array(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
             this->exec, this->dmtx_chol->get_values(), nnz,
             gko::zero<value_type>());
         gko::array<index_type> diag_idxs{this->ref, this->num_rows};
@@ -380,7 +380,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), transpose_idxs.get_data(),
             this->mtx_chol.get());
-        gko::kernels::EXEC_NAMESPACE::cholesky::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -410,7 +410,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), transpose_idxs.get_data(),
             this->mtx_chol.get());
-        gko::kernels::EXEC_NAMESPACE::cholesky::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -422,7 +422,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_const_data(), transpose_idxs.get_const_data(),
             *this->forest, this->mtx_chol.get(), tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
             ddiag_idxs.get_const_data(), dtranspose_idxs.get_const_data(),
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index fdcaa0cfad0..0ea06bed506 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -156,7 +156,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef)
         std::fill_n(this->mtx_lu->get_values(),
                     this->mtx_lu->get_num_stored_elements(),
                     gko::zero<value_type>());
-        gko::kernels::EXEC_NAMESPACE::components::fill_array(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
             this->exec, this->dmtx_lu->get_values(),
             this->dmtx_lu->get_num_stored_elements(), gko::zero<value_type>());
         gko::array<index_type> diag_idxs{this->ref, this->num_rows};
@@ -166,7 +166,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef)
             this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), this->mtx_lu.get());
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -191,7 +191,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
             this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), this->mtx_lu.get());
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -201,7 +201,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_const_data(), this->mtx_lu.get(), tmp);
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
             ddiag_idxs.get_const_data(), this->dmtx_lu.get(), dtmp);
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index 57086a1550d..40a40b5acf5 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -100,7 +100,7 @@ TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef)
 
     gko::kernels::reference::par_ic_factorization::init_factor(
         this->ref, this->mtx_l.get());
-    gko::kernels::EXEC_NAMESPACE::par_ic_factorization::init_factor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::init_factor(
         this->exec, this->dmtx_l.get());
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l, this->dmtx_l, r<value_type>::value);
@@ -118,7 +118,7 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef)
 
     gko::kernels::reference::par_ic_factorization::compute_factor(
         this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get());
-    gko::kernels::EXEC_NAMESPACE::par_ic_factorization::compute_factor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor(
         this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get());
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4);
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 254c2e4a40e..81d1dd83ffb 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -118,7 +118,7 @@ TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
     gko::kernels::reference::par_ict_factorization::add_candidates(
         this->ref, mtx_llh.get(), this->mtx.get(), this->mtx_l.get(),
         res_mtx_l.get());
-    gko::kernels::EXEC_NAMESPACE::par_ict_factorization::add_candidates(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::add_candidates(
         this->exec, dmtx_llh.get(), this->dmtx.get(), this->dmtx_l.get(),
         dres_mtx_l.get());
 
@@ -140,9 +140,9 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef)
     gko::kernels::reference::par_ict_factorization::compute_factor(
         this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get());
     for (int i = 0; i < 20; ++i) {
-        gko::kernels::EXEC_NAMESPACE::par_ict_factorization::compute_factor(
-            this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(),
-            dmtx_l_coo.get());
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::
+            compute_factor(this->exec, this->dmtx_ani.get(),
+                           this->dmtx_l_ani.get(), dmtx_l_coo.get());
     }
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2);
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 94e2eb6512f..0d853af0745 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -89,8 +89,8 @@ class ParIlu : public CommonTestFixture {
     {
         gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
             ref, mtx.get(), l_row_ptrs, u_row_ptrs);
-        gko::kernels::EXEC_NAMESPACE::factorization::initialize_row_ptrs_l_u(
-            exec, dmtx.get(), dl_row_ptrs, du_row_ptrs);
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::
+            initialize_row_ptrs_l_u(exec, dmtx.get(), dl_row_ptrs, du_row_ptrs);
     }
 
     void initialize_lu(std::unique_ptr<Csr>& l, std::unique_ptr<Csr>& u,
@@ -121,7 +121,7 @@ class ParIlu : public CommonTestFixture {
 
         gko::kernels::reference::factorization::initialize_l_u(
             ref, mtx.get(), l.get(), u.get());
-        gko::kernels::EXEC_NAMESPACE::factorization::initialize_l_u(
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l_u(
             exec, dmtx.get(), dl.get(), du.get());
     }
 
@@ -139,7 +139,7 @@ class ParIlu : public CommonTestFixture {
 
         gko::kernels::reference::par_ilu_factorization::compute_l_u_factors(
             ref, iterations, coo.get(), l.get(), u_transpose_mtx.get());
-        gko::kernels::EXEC_NAMESPACE::par_ilu_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilu_factorization::
             compute_l_u_factors(exec, iterations, dcoo.get(), dl.get(),
                                 u_transpose_dmtx.get());
         auto u_lin_op = u_transpose_mtx->transpose();
@@ -160,7 +160,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), true);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), true);
 
     ASSERT_TRUE(mtx->is_sorted_by_column_index());
@@ -176,7 +176,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsortedEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), false);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), false);
 
     ASSERT_FALSE(mtx->is_sorted_by_column_index());
@@ -193,7 +193,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquareEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), true);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), true);
 
     ASSERT_TRUE(mtx->is_sorted_by_column_index());
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index c4ad7fe412a..7d46f7979ac 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -151,8 +151,8 @@ class ParIlut : public CommonTestFixture {
 
         gko::kernels::reference::par_ilut_factorization::threshold_select(
             ref, mtx.get(), rank, tmp, tmp2, res);
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_select(
-            exec, dmtx.get(), rank, dtmp, dtmp2, dres);
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+            threshold_select(exec, dmtx.get(), rank, dtmp, dtmp2, dres);
 
         ASSERT_NEAR(res, dres, tolerance);
     }
@@ -174,9 +174,9 @@ class ParIlut : public CommonTestFixture {
 
         gko::kernels::reference::par_ilut_factorization::threshold_filter(
             ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower);
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter(
-            exec, local_dmtx.get(), threshold, dres.get(), dres_coo.get(),
-            lower);
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+            threshold_filter(exec, local_dmtx.get(), threshold, dres.get(),
+                             dres_coo.get(), lower);
 
         GKO_ASSERT_MTX_NEAR(res, dres, 0);
         GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
@@ -208,7 +208,7 @@ class ParIlut : public CommonTestFixture {
         gko::kernels::reference::par_ilut_factorization::
             threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
                                     res.get(), res_coo.get());
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
             threshold_filter_approx(exec, dmtx.get(), rank, dtmp, dthreshold,
                                     dres.get(), dres_coo.get());
 
@@ -283,8 +283,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef)
 
     gko::kernels::reference::par_ilut_factorization::threshold_filter(
         this->ref, this->mtx_l.get(), 0.5, res.get(), null_coo, true);
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter(
-        this->exec, this->dmtx_l.get(), 0.5, dres.get(), null_coo, true);
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+        threshold_filter(this->exec, this->dmtx_l.get(), 0.5, dres.get(),
+                         null_coo, true);
 
     GKO_ASSERT_MTX_NEAR(res, dres, 0);
     GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
@@ -346,7 +347,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
     gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
         this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(),
         null_coo);
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
         threshold_filter_approx(this->exec, this->dmtx_l.get(), rank, dtmp,
                                 dthreshold, dres.get(), null_coo);
 
@@ -393,7 +394,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
     gko::kernels::reference::par_ilut_factorization::add_candidates(
         this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(),
         this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get());
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::add_candidates(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::add_candidates(
         this->exec, dmtx_lu.get(), this->dmtx_square.get(), this->dmtx_l2.get(),
         this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get());
 
@@ -422,7 +423,7 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef)
         this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get(),
         this->mtx_u_ani.get(), mtx_u_coo.get(), this->mtx_ut_ani.get());
     for (int i = 0; i < 20; ++i) {
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
             compute_l_u_factors(this->exec, this->dmtx_ani.get(),
                                 this->dmtx_l_ani.get(), dmtx_l_coo.get(),
                                 this->dmtx_u_ani.get(), dmtx_u_coo.get(),
diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp
index 347425175bb..d3a7bb8f8e5 100644
--- a/test/matrix/csr_kernels.cpp
+++ b/test/matrix/csr_kernels.cpp
@@ -149,7 +149,7 @@ void assert_lookup_correct(std::shared_ptr<const gko::EXEC_TYPE> exec,
     const auto row_ptrs = mtx->get_const_row_ptrs();
     const auto col_idxs = mtx->get_const_col_idxs();
     gko::array<bool> correct{exec, {true}};
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto num_cols, auto row_ptrs, auto col_idxs,
                       auto storage_offsets, auto storage, auto row_descs,
@@ -215,7 +215,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks)
         // otherwise things might crash
         gko::kernels::reference::csr::build_lookup_offsets(
             this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
-        gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets(
+        gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup_offsets(
             this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
             dstorage_offsets);
 
@@ -238,7 +238,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks)
         gko::kernels::reference::csr::build_lookup(
             this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets,
             row_descs, storage);
-        gko::kernels::EXEC_NAMESPACE::csr::build_lookup(
+        gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup(
             this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
             dstorage_offsets, drow_descs, dstorage);
 
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 713593b4ae5..4ff8e749766 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -1346,7 +1346,7 @@ TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
         this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
-    gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_span(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::calculate_nonzeros_per_row_in_span(
         this->exec, this->dmtx2.get(), rspan, cspan, &drow_nnz);
 
     GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
@@ -1382,7 +1382,7 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
 
     gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
                                                     rspan, cspan, smat1.get());
-    gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix(
         this->exec, this->dmtx2.get(), rspan, cspan, sdmat1.get());
 
     GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
@@ -1408,8 +1408,9 @@ TEST_F(Csr, CalculateNnzPerRowInIndexSetIsEquivalentToRef)
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_index_set(
         this->ref, this->mtx2.get(), rset, cset, row_nnz.get_data());
-    gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_index_set(
-        this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::
+        calculate_nonzeros_per_row_in_index_set(
+            this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data());
 
     GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
 }
@@ -1446,7 +1447,7 @@ TEST_F(Csr, ComputeSubmatrixFromIndexSetIsEquivalentToRef)
 
     gko::kernels::reference::csr::compute_submatrix_from_index_set(
         this->ref, this->mtx2.get(), rset, cset, smat1.get());
-    gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix_from_index_set(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix_from_index_set(
         this->exec, this->dmtx2.get(), drset, dcset, sdmat1.get());
 
     GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
@@ -1501,7 +1502,7 @@ TEST_F(Csr, CanDetectMissingDiagonalEntry)
     auto mtx = gko::clone(exec, ref_mtx);
     bool has_diags = true;
 
-    gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist(
         exec, mtx.get(), has_diags);
 
     ASSERT_FALSE(has_diags);
@@ -1516,7 +1517,7 @@ TEST_F(Csr, CanDetectWhenAllDiagonalEntriesArePresent)
     auto mtx = gko::clone(exec, ref_mtx);
     bool has_diags = true;
 
-    gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist(
         exec, mtx.get(), has_diags);
 
     ASSERT_TRUE(has_diags);
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index 25b82215dcd..56ca536187e 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -603,7 +603,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::dense::count_nonzeros_per_row(
         ref, x.get(), nnz_per_row.get_data());
-    gko::kernels::EXEC_NAMESPACE::dense::count_nonzeros_per_row(
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::count_nonzeros_per_row(
         exec, dx.get(), dnnz_per_row.get_data());
 
     auto tmp = gko::array<gko::size_type>(ref, dnnz_per_row);
@@ -621,8 +621,8 @@ TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(),
                                                             max_nnz);
-    gko::kernels::EXEC_NAMESPACE::dense::compute_max_nnz_per_row(exec, dx.get(),
-                                                                 dmax_nnz);
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_max_nnz_per_row(
+        exec, dx.get(), dmax_nnz);
 
     ASSERT_EQ(max_nnz, dmax_nnz);
 }
@@ -2017,7 +2017,7 @@ TEST_F(Dense, ComputeNorm2SquaredIsEquivalentToRef)
 
     gko::kernels::reference::dense::compute_squared_norm2(
         ref, x.get(), norm_expected.get(), tmp);
-    gko::kernels::EXEC_NAMESPACE::dense::compute_squared_norm2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_squared_norm2(
         exec, dx.get(), dnorm.get(), dtmp);
 
     GKO_ASSERT_MTX_NEAR(dnorm, norm_expected, r<value_type>::value);
@@ -2033,7 +2033,7 @@ TEST_F(Dense, ComputesSqrt)
     auto dmtx = gko::clone(exec, mtx);
 
     gko::kernels::reference::dense::compute_sqrt(ref, mtx.get());
-    gko::kernels::EXEC_NAMESPACE::dense::compute_sqrt(exec, dmtx.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_sqrt(exec, dmtx.get());
 
     GKO_ASSERT_MTX_NEAR(mtx, dmtx, r<value_type>::value);
 }
diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp
index f6b9a9d1edb..b61d97a0a7a 100644
--- a/test/matrix/ell_kernels.cpp
+++ b/test/matrix/ell_kernels.cpp
@@ -533,7 +533,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::ell::count_nonzeros_per_row(
         ref, mtx.get(), nnz_per_row.get_data());
-    gko::kernels::EXEC_NAMESPACE::ell::count_nonzeros_per_row(
+    gko::kernels::GKO_DEVICE_NAMESPACE::ell::count_nonzeros_per_row(
         exec, dmtx.get(), dnnz_per_row.get_data());
 
     GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row);
diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp
index 6fc3caf60ad..010bd7faa86 100644
--- a/test/matrix/sparsity_csr_kernels.cpp
+++ b/test/matrix/sparsity_csr_kernels.cpp
@@ -64,8 +64,8 @@ TEST_F(SparsityCsr, KernelDiagonalElementPrefixSumIsEquivalentToRef)
 
     gko::kernels::reference::sparsity_csr::diagonal_element_prefix_sum(
         ref, mtx.get(), prefix_sum.get_data());
-    gko::kernels::EXEC_NAMESPACE::sparsity_csr::diagonal_element_prefix_sum(
-        exec, dmtx.get(), dprefix_sum.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::
+        diagonal_element_prefix_sum(exec, dmtx.get(), dprefix_sum.get_data());
 
     GKO_ASSERT_ARRAY_EQ(prefix_sum, dprefix_sum);
 }
@@ -88,7 +88,7 @@ TEST_F(SparsityCsr, KernelRemoveDiagonalElementsIsEquivalentToRef)
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
         ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
         prefix_sum.get_const_data(), out_mtx.get());
-    gko::kernels::EXEC_NAMESPACE::sparsity_csr::remove_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::remove_diagonal_elements(
         exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(),
         dprefix_sum.get_const_data(), dout_mtx.get());
 
diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp
index a5f2d32fe32..10e5cf01a7a 100644
--- a/test/multigrid/pgm_kernels.cpp
+++ b/test/multigrid/pgm_kernels.cpp
@@ -159,8 +159,8 @@ TEST_F(Pgm, MatchEdgeIsEquivalentToRef)
     auto d_x = d_unfinished_agg;
 
     gko::kernels::reference::pgm::match_edge(ref, strongest_neighbor, x);
-    gko::kernels::EXEC_NAMESPACE::pgm::match_edge(exec, d_strongest_neighbor,
-                                                  d_x);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::match_edge(
+        exec, d_strongest_neighbor, d_x);
 
     GKO_ASSERT_ARRAY_EQ(d_x, x);
 }
@@ -173,8 +173,8 @@ TEST_F(Pgm, CountUnaggIsEquivalentToRef)
     index_type d_num_unagg;
 
     gko::kernels::reference::pgm::count_unagg(ref, unfinished_agg, &num_unagg);
-    gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg,
-                                                   &d_num_unagg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg,
+                                                         &d_num_unagg);
 
     ASSERT_EQ(d_num_unagg, num_unagg);
 }
@@ -187,7 +187,7 @@ TEST_F(Pgm, RenumberIsEquivalentToRef)
     index_type d_num_agg;
 
     gko::kernels::reference::pgm::renumber(ref, agg, &num_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg);
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
@@ -203,7 +203,7 @@ TEST_F(Pgm, FindStrongestNeighborIsEquivalentToRef)
 
     gko::kernels::reference::pgm::find_strongest_neighbor(
         ref, weight_csr.get(), weight_diag.get(), agg, snb);
-    gko::kernels::EXEC_NAMESPACE::pgm::find_strongest_neighbor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::find_strongest_neighbor(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb);
 
     GKO_ASSERT_ARRAY_EQ(d_snb, snb);
@@ -220,7 +220,7 @@ TEST_F(Pgm, AssignToExistAggIsEquivalentToRef)
 
     gko::kernels::reference::pgm::assign_to_exist_agg(
         ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
 
     GKO_ASSERT_ARRAY_EQ(d_x, x);
@@ -234,9 +234,10 @@ TEST_F(Pgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
     auto d_intermediate_agg = gko::array<index_type>(exec, 0);
     index_type d_num_unagg;
 
-    gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_agg, &d_num_unagg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_agg,
+                                                         &d_num_unagg);
 
     // only test whether all elements are aggregated.
     GKO_ASSERT_EQ(d_num_unagg, 0);
@@ -257,7 +258,7 @@ TEST_F(Pgm, GatherIndexIsEquivalentToRef)
     gko::kernels::reference::pgm::gather_index(ref, num, orig.get_const_data(),
                                                map.get_const_data(),
                                                result.get_data());
-    gko::kernels::EXEC_NAMESPACE::pgm::gather_index(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::gather_index(
         exec, num, d_orig.get_const_data(), d_map.get_const_data(),
         d_result.get_data());
 
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index 30dbfa271ee..f8a1bd015ef 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -117,7 +117,7 @@ class BatchJacobi : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp
index 57f8c14ac27..6e737d31790 100644
--- a/test/preconditioner/isai_kernels.cpp
+++ b/test/preconditioner/isai_kernels.cpp
@@ -122,7 +122,7 @@ TEST_F(Isai, IsaiGenerateLinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -145,7 +145,7 @@ TEST_F(Isai, IsaiGenerateUinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -168,7 +168,7 @@ TEST_F(Isai, IsaiGenerateAinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -191,7 +191,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -214,7 +214,7 @@ TEST_F(Isai, IsaiGenerateLinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -237,7 +237,7 @@ TEST_F(Isai, IsaiGenerateUinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -260,7 +260,7 @@ TEST_F(Isai, IsaiGenerateAinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -283,7 +283,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -315,7 +315,7 @@ TEST_F(Isai, IsaiGenerateExcessLinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -346,7 +346,7 @@ TEST_F(Isai, IsaiGenerateExcessUinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -377,7 +377,7 @@ TEST_F(Isai, IsaiGenerateExcessAinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -408,7 +408,7 @@ TEST_F(Isai, IsaiGenerateExcessSpdinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -439,7 +439,7 @@ TEST_F(Isai, IsaiGeneratePartialExcessIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u);
 
@@ -467,7 +467,7 @@ TEST_F(Isai, IsaiScaleExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scale_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
@@ -490,7 +490,7 @@ TEST_F(Isai, IsaiScalePartialExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scale_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), 5u, 10u);
 
     GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
@@ -514,7 +514,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionLIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -540,7 +540,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionUIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -566,7 +566,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionAIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -592,7 +592,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionSpdIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -618,7 +618,7 @@ TEST_F(Isai, IsaiScatterPartialExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 5u, 10u);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index 821a8a6d29c..14bca65e41f 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -52,7 +52,7 @@ class BatchBicgstab : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 49f0db2a09b..7c013020686 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -50,7 +50,7 @@ class BatchCg : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_cg::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp
index 616f7eff096..ab63b01f9cc 100644
--- a/test/solver/bicg_kernels.cpp
+++ b/test/solver/bicg_kernels.cpp
@@ -139,7 +139,7 @@ TEST_F(Bicg, BicgInitializeIsEquivalentToRef)
     gko::kernels::reference::bicg::initialize(
         ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(),
         rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
         d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(),
         d_q2.get(), d_stop_status.get());
@@ -165,7 +165,7 @@ TEST_F(Bicg, BicgStep1IsEquivalentToRef)
     gko::kernels::reference::bicg::step_1(ref, p.get(), z.get(), p2.get(),
                                           z2.get(), rho.get(), prev_rho.get(),
                                           stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_1(
         exec, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(), d_rho.get(),
         d_prev_rho.get(), d_stop_status.get());
 
@@ -183,7 +183,7 @@ TEST_F(Bicg, BicgStep2IsEquivalentToRef)
     gko::kernels::reference::bicg::step_2(
         ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(),
         rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_2(
         exec, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(),
         d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get());
 
diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index a63ff7f39f4..4f68edd6a8e 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -176,7 +176,7 @@ TEST_F(Bicgstab, BicgstabInitializeIsEquivalentToRef)
         ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(),
         v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(),
         gamma.get(), omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::initialize(
         exec, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(),
         d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(),
         d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(),
@@ -207,7 +207,7 @@ TEST_F(Bicgstab, BicgstabStep1IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_1(
         ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(),
         omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_1(
         exec, d_r.get(), d_p.get(), d_v.get(), d_rho.get(), d_prev_rho.get(),
         d_alpha.get(), d_omega.get(), d_stop_status.get());
 
@@ -222,7 +222,7 @@ TEST_F(Bicgstab, BicgstabStep2IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_2(ref, r.get(), s.get(), v.get(),
                                               rho.get(), alpha.get(),
                                               beta.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_2(
         exec, d_r.get(), d_s.get(), d_v.get(), d_rho.get(), d_alpha.get(),
         d_beta.get(), d_stop_status.get());
 
@@ -238,7 +238,7 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_3(
         ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
         beta.get(), gamma.get(), omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_3(
         exec, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(),
         d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(),
         d_stop_status.get());
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp
index 4f854a26180..3b5f5956c2e 100644
--- a/test/solver/cb_gmres_kernels.cpp
+++ b/test/solver/cb_gmres_kernels.cpp
@@ -209,7 +209,7 @@ TEST_F(CbGmres, CbGmresInitialize1IsEquivalentToRef)
     gko::kernels::reference::cb_gmres::initialize(
         ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
         stop_status.get(), default_krylov_dim_mixed);
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::initialize(
         exec, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get(), default_krylov_dim_mixed);
 
@@ -230,7 +230,7 @@ TEST_F(CbGmres, CbGmresInitialize2IsEquivalentToRef)
         residual_norm_collection.get(), arnoldi_norm.get(),
         range_helper.get_range(), next_krylov_basis.get(),
         final_iter_nums.get(), tmp, default_krylov_dim_mixed);
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::restart(
         exec, d_residual.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_arnoldi_norm.get(),
         d_range_helper.get_range(), d_next_krylov_basis.get(),
@@ -255,7 +255,7 @@ TEST_F(CbGmres, CbGmresStep1IsEquivalentToRef)
         range_helper.get_range(), hessenberg_iter.get(), buffer_iter.get(),
         arnoldi_norm.get(), iter, final_iter_nums.get(), stop_status.get(),
         reorth_status.get(), num_reorth.get());
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::arnoldi(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::arnoldi(
         exec, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(),
         d_residual_norm.get(), d_residual_norm_collection.get(),
         d_range_helper.get_range(), d_hessenberg_iter.get(),
@@ -285,7 +285,7 @@ TEST_F(CbGmres, CbGmresStep2IsEquivalentToRef)
         ref, residual_norm_collection.get(),
         range_helper.get_range().get_accessor().to_const(), hessenberg.get(),
         y.get(), before_preconditioner.get(), final_iter_nums.get());
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::solve_krylov(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::solve_krylov(
         exec, d_residual_norm_collection.get(),
         d_range_helper.get_range().get_accessor().to_const(),
         d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(),
diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp
index 41af16489a2..be9dc052314 100644
--- a/test/solver/cg_kernels.cpp
+++ b/test/solver/cg_kernels.cpp
@@ -114,7 +114,7 @@ TEST_F(Cg, CgInitializeIsEquivalentToRef)
     gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(),
                                             p.get(), q.get(), prev_rho.get(),
                                             rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
         d_prev_rho.get(), d_rho.get(), d_stop_status.get());
 
@@ -134,9 +134,9 @@ TEST_F(Cg, CgStep1IsEquivalentToRef)
 
     gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(),
                                         prev_rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::step_1(exec, d_p.get(), d_z.get(),
-                                             d_rho.get(), d_prev_rho.get(),
-                                             d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_1(
+        exec, d_p.get(), d_z.get(), d_rho.get(), d_prev_rho.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_p, p, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_z, z, ::r<value_type>::value);
@@ -149,9 +149,9 @@ TEST_F(Cg, CgStep2IsEquivalentToRef)
     gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(),
                                         beta.get(), rho.get(),
                                         stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::step_2(exec, d_x.get(), d_r.get(),
-                                             d_p.get(), d_q.get(), d_beta.get(),
-                                             d_rho.get(), d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_2(
+        exec, d_x.get(), d_r.get(), d_p.get(), d_q.get(), d_beta.get(),
+        d_rho.get(), d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_x, x, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_r, r, ::r<value_type>::value);
diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp
index 123f76727b5..6c2bab293e3 100644
--- a/test/solver/cgs_kernels.cpp
+++ b/test/solver/cgs_kernels.cpp
@@ -167,7 +167,7 @@ TEST_F(Cgs, CgsInitializeIsEquivalentToRef)
         ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(),
         u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(),
         rho_prev.get(), rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::initialize(
         exec, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(),
         d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(),
         d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(),
@@ -197,7 +197,7 @@ TEST_F(Cgs, CgsStep1IsEquivalentToRef)
     gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(),
                                          q.get(), beta.get(), rho.get(),
                                          rho_prev.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_1(
         exec, d_r.get(), d_u.get(), d_p.get(), d_q.get(), d_beta.get(),
         d_rho.get(), d_rho_prev.get(), d_stop_status.get());
 
@@ -214,7 +214,7 @@ TEST_F(Cgs, CgsStep2IsEquivalentToRef)
     gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(),
                                          t.get(), alpha.get(), rho.get(),
                                          gamma.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_2(
         exec, d_u.get(), d_v_hat.get(), d_q.get(), d_t.get(), d_alpha.get(),
         d_rho.get(), d_gamma.get(), d_stop_status.get());
 
@@ -231,7 +231,7 @@ TEST_F(Cgs, CgsStep3IsEquivalentToRef)
     gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(),
                                          x.get(), alpha.get(),
                                          stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_3(
         exec, d_t.get(), d_u_hat.get(), d_r.get(), d_x.get(), d_alpha.get(),
         d_stop_status.get());
 
diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp
index faf7225c883..f1f09f759bc 100644
--- a/test/solver/fcg_kernels.cpp
+++ b/test/solver/fcg_kernels.cpp
@@ -122,7 +122,7 @@ TEST_F(Fcg, FcgInitializeIsEquivalentToRef)
     gko::kernels::reference::fcg::initialize(
         ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(),
         prev_rho.get(), rho.get(), rho_t.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(),
         d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get());
 
@@ -144,9 +144,9 @@ TEST_F(Fcg, FcgStep1IsEquivalentToRef)
 
     gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(),
                                          prev_rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::step_1(exec, d_p.get(), d_z.get(),
-                                              d_rho_t.get(), d_prev_rho.get(),
-                                              d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_1(
+        exec, d_p.get(), d_z.get(), d_rho_t.get(), d_prev_rho.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_p, p, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_z, z, ::r<value_type>::value);
@@ -159,7 +159,7 @@ TEST_F(Fcg, FcgStep2IsEquivalentToRef)
     gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(),
                                          p.get(), q.get(), beta.get(),
                                          rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_2(
         exec, d_x.get(), d_r.get(), d_t.get(), d_p.get(), d_q.get(),
         d_beta.get(), d_rho.get(), d_stop_status.get());
 
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index 575d55ded87..7a00b3fed30 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -153,7 +153,7 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef)
 
     gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(),
                                              stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize(
         exec, d_b.get(), d_residual.get(), d_stop_status.get_data());
 
     GKO_ASSERT_MTX_NEAR(d_residual, residual, r<value_type>::value);
@@ -168,7 +168,7 @@ TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef)
     gko::kernels::reference::gcr::restart(ref, residual.get(), A_residual.get(),
                                           p_bases.get(), Ap_bases.get(),
                                           final_iter_nums.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::restart(
         exec, d_residual.get(), d_A_residual.get(), d_p_bases.get(),
         d_Ap_bases.get(), d_final_iter_nums.get_data());
 
@@ -186,7 +186,7 @@ TEST_F(Gcr, GcrStep1IsEquivalentToRef)
     gko::kernels::reference::gcr::step_1(ref, x.get(), residual.get(), p.get(),
                                          Ap.get(), Ap_norm.get(), rAp.get(),
                                          stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::step_1(
         exec, d_x.get(), d_residual.get(), d_p.get(), d_Ap.get(),
         d_Ap_norm.get(), d_rAp.get(), d_stop_status.get_data());
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index ac9139d81aa..08259c91ce0 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -159,7 +159,7 @@ TEST_F(Gmres, GmresKernelInitializeIsEquivalentToRef)
     gko::kernels::reference::common_gmres::initialize(
         ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
         stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::initialize(
         exec, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get_data());
 
@@ -180,7 +180,7 @@ TEST_F(Gmres, GmresKernelRestartIsEquivalentToRef)
         ref, residual.get(), residual_norm.get(),
         residual_norm_collection.get(), krylov_bases.get(),
         final_iter_nums.get_data());
-    gko::kernels::EXEC_NAMESPACE::gmres::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gmres::restart(
         exec, d_residual.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_krylov_bases.get(),
         d_final_iter_nums.get_data());
@@ -202,7 +202,7 @@ TEST_F(Gmres, GmresKernelHessenbergQRIsEquivalentToRef)
         ref, givens_sin.get(), givens_cos.get(), residual_norm.get(),
         residual_norm_collection.get(), hessenberg_iter.get(), iter,
         final_iter_nums.get_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr(
         exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get_data(), d_stop_status.get_const_data());
@@ -228,7 +228,7 @@ TEST_F(Gmres, GmresKernelHessenbergQROnSingleRHSIsEquivalentToRef)
         ref, givens_sin.get(), givens_cos.get(), residual_norm.get(),
         residual_norm_collection.get(), hessenberg_iter.get(), iter,
         final_iter_nums.get_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr(
         exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get_data(), d_stop_status.get_const_data());
@@ -252,7 +252,7 @@ TEST_F(Gmres, GmresKernelSolveKrylovIsEquivalentToRef)
     gko::kernels::reference::common_gmres::solve_krylov(
         ref, residual_norm_collection.get(), hessenberg.get(), y.get(),
         final_iter_nums.get_const_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::solve_krylov(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::solve_krylov(
         exec, d_residual_norm_collection.get(), d_hessenberg.get(), d_y.get(),
         d_final_iter_nums.get_const_data(), d_stop_status.get_const_data());
 
@@ -267,7 +267,7 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef)
     gko::kernels::reference::gmres::multi_axpy(
         ref, krylov_bases.get(), y.get(), before_preconditioner.get(),
         final_iter_nums.get_const_data(), stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gmres::multi_axpy(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_axpy(
         exec, d_krylov_bases.get(), d_y.get(), d_before_preconditioner.get(),
         d_final_iter_nums.get_const_data(), d_stop_status.get_data());
 
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index 31c7df99168..b165824dbe0 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -160,7 +160,7 @@ TEST_F(Idr, IdrInitializeIsEquivalentToRef)
 
     gko::kernels::reference::idr::initialize(ref, nrhs, m.get(), p.get(), true,
                                              stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::initialize(
         exec, nrhs, d_m.get(), d_p.get(), true, d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(m, d_m, rr<value_type>::value);
@@ -176,7 +176,7 @@ TEST_F(Idr, IdrStep1IsEquivalentToRef)
     gko::kernels::reference::idr::step_1(ref, nrhs, k, m.get(), f.get(),
                                          r.get(), g.get(), c.get(), v.get(),
                                          stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_1(
         exec, nrhs, k, d_m.get(), d_f.get(), d_r.get(), d_g.get(), d_c.get(),
         d_v.get(), d_stop_status.get());
 
@@ -192,9 +192,9 @@ TEST_F(Idr, IdrStep2IsEquivalentToRef)
     gko::size_type k = 2;
     gko::kernels::reference::idr::step_2(ref, nrhs, k, omega.get(), v.get(),
                                          c.get(), u.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_2(exec, nrhs, k, d_omega.get(),
-                                              d_v.get(), d_c.get(), d_u.get(),
-                                              d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_2(
+        exec, nrhs, k, d_omega.get(), d_v.get(), d_c.get(), d_u.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(u, d_u, rr<value_type>::value);
 }
@@ -208,7 +208,7 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef)
     gko::kernels::reference::idr::step_3(
         ref, nrhs, k, p.get(), g.get(), v.get(), u.get(), m.get(), f.get(),
         alpha.get(), r.get(), x.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_3(
         exec, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(),
         d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get());
 
@@ -230,7 +230,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)
     gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(),
                                                 residual_norm.get(),
                                                 omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::compute_omega(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::compute_omega(
         exec, nrhs, kappa, d_tht.get(), d_residual_norm.get(), d_omega.get(),
         d_stop_status.get());
 
diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp
index 99550dfd99f..7a8e84324bd 100644
--- a/test/solver/ir_kernels.cpp
+++ b/test/solver/ir_kernels.cpp
@@ -55,7 +55,7 @@ TEST_F(Ir, InitializeIsEquivalentToRef)
     auto d_stop_status = gko::array<gko::stopping_status>(exec, stop_status);
 
     gko::kernels::reference::ir::initialize(ref, &stop_status);
-    gko::kernels::EXEC_NAMESPACE::ir::initialize(exec, &d_stop_status);
+    gko::kernels::GKO_DEVICE_NAMESPACE::ir::initialize(exec, &d_stop_status);
 
     auto tmp = gko::array<gko::stopping_status>(ref, d_stop_status);
     for (int i = 0; i < stop_status.get_size(); ++i) {
diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp
index 139cb1a4647..4b4b0157df5 100644
--- a/test/solver/multigrid_kernels.cpp
+++ b/test/solver/multigrid_kernels.cpp
@@ -144,7 +144,7 @@ TEST_F(Multigrid, MultigridKCycleStep1IsEquivalentToRef)
 
     gko::kernels::reference::multigrid::kcycle_step_1(
         ref, alpha.get(), rho.get(), v.get(), g.get(), d.get(), e.get());
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_1(
         exec, d_alpha.get(), d_rho.get(), d_v.get(), d_g.get(), d_d.get(),
         d_e.get());
 
@@ -161,7 +161,7 @@ TEST_F(Multigrid, MultigridKCycleStep2IsEquivalentToRef)
     gko::kernels::reference::multigrid::kcycle_step_2(
         ref, alpha.get(), rho.get(), gamma.get(), beta.get(), zeta.get(),
         d.get(), e.get());
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_2(
         exec, d_alpha.get(), d_rho.get(), d_gamma.get(), d_beta.get(),
         d_zeta.get(), d_d.get(), d_e.get());
 
@@ -179,11 +179,11 @@ TEST_F(Multigrid, MultigridKCycleCheckStopIsEquivalentToRef)
 
     gko::kernels::reference::multigrid::kcycle_check_stop(
         ref, old_norm.get(), new_norm.get(), 1.0, is_stop_10);
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop(
         exec, d_old_norm.get(), d_new_norm.get(), 1.0, d_is_stop_10);
     gko::kernels::reference::multigrid::kcycle_check_stop(
         ref, old_norm.get(), new_norm.get(), 0.5, is_stop_5);
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop(
         exec, d_old_norm.get(), d_new_norm.get(), 0.5, d_is_stop_5);
 
     GKO_ASSERT_EQ(d_is_stop_10, is_stop_10);

From ae72a92ebfee0556313645ffca64bea74f5bf4a1 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 21 Feb 2024 17:37:42 +0100
Subject: [PATCH 007/448] prepare for unification

- Add necessary switching headers
- Provide device namespace macro via compiler definitions
- Add necessary (namespace) aliases
- Adapt math library includes and namespaces
- Make files more uniform between CUDA and HIP
---
 accessor/cuda_hip_helper.hpp                  |  38 ++
 cmake/create_test.cmake                       |  12 +-
 common/cuda_hip/base/blas_bindings.hpp        |  16 +
 common/cuda_hip/base/config.hpp               |  16 +
 common/cuda_hip/base/pointer_mode_guard.hpp   |  16 +
 common/cuda_hip/base/randlib_bindings.hpp     |  16 +
 common/cuda_hip/base/runtime.hpp              |  14 +
 common/cuda_hip/base/sparselib_bindings.hpp   |  16 +
 common/cuda_hip/base/thrust.hpp               |  51 +++
 common/cuda_hip/base/types.hpp                |   9 +
 common/cuda_hip/components/atomic.hpp.inc     |  32 ++
 .../components/cooperative_groups.hpp         |  16 +
 .../cuda_hip/components/format_conversion.hpp |  16 +
 common/cuda_hip/components/memory.hpp         |  16 +
 ...ernels.hpp.inc => par_ict_kernels.hpp.inc} |  68 ++++
 .../par_ict_sweep_kernels.hpp.inc             |  76 ----
 ... => jacobi_advanced_apply_kernels.hpp.inc} |   0
 ...pp.inc => jacobi_generate_kernels.hpp.inc} |   0
 ...nc => jacobi_simple_apply_kernels.hpp.inc} |   0
 common/unified/base/kernel_launch.hpp         |   4 +-
 core/test/gtest/CMakeLists.txt                |   8 +-
 cuda/CMakeLists.txt                           |   4 +-
 cuda/base/batch_multi_vector_kernels.cu       |  10 +-
 cuda/base/batch_struct.hpp                    |   4 +-
 cuda/base/cublas_bindings.hpp                 |  16 +-
 cuda/base/curand_bindings.hpp                 |  14 +-
 cuda/base/cusparse_bindings.hpp               |  20 +-
 cuda/base/cusparse_block_bindings.hpp         |   2 +-
 cuda/base/device_matrix_data_kernels.cu       |   2 +-
 cuda/base/executor.cpp                        |   2 +-
 cuda/base/kernel_launch.cuh                   |  13 +-
 cuda/base/kernel_launch_reduction.cuh         |   4 +-
 cuda/base/kernel_launch_solver.cuh            |   3 +
 cuda/base/types.hpp                           |   4 +
 cuda/components/atomic.cuh                    |  34 +-
 cuda/components/cooperative_groups.cuh        |   2 +-
 .../diagonal_block_manipulation.cuh           |   6 +-
 cuda/components/format_conversion.cuh         |   2 +-
 cuda/components/memory.cuh                    |   2 +-
 cuda/components/prefix_sum.cuh                |   4 +-
 cuda/components/reduction.cuh                 |   7 +-
 cuda/components/searching.cuh                 |   2 +-
 cuda/components/segment_scan.cuh              |   2 +-
 cuda/components/sorting.cuh                   |   4 +-
 cuda/components/syncfree.cuh                  |   6 +-
 cuda/components/thread_ids.cuh                |   7 +-
 cuda/distributed/vector_kernels.cu            |   3 +
 cuda/factorization/cholesky_kernels.cu        |  18 +-
 cuda/factorization/factorization_kernels.cu   |   7 +-
 cuda/factorization/ic_kernels.cu              |  32 +-
 cuda/factorization/ilu_kernels.cu             |  32 +-
 cuda/factorization/lu_kernels.cu              |   4 +-
 cuda/factorization/par_ic_kernels.cu          |   4 +-
 cuda/factorization/par_ict_kernels.cu         |   6 +-
 cuda/factorization/par_ilu_kernels.cu         |   6 +-
 ...l.cu => par_ilut_approx_filter_kernels.cu} |   6 +-
 ...r_kernel.cu => par_ilut_filter_kernels.cu} |   7 +-
 ...t_kernel.cu => par_ilut_select_kernels.cu} |   3 +-
 ...m_kernel.cu => par_ilut_spgeam_kernels.cu} |   7 +-
 ...ep_kernel.cu => par_ilut_sweep_kernels.cu} |   3 +-
 cuda/log/batch_logger.cuh                     |   1 +
 cuda/matrix/batch_csr_kernels.cu              |   5 +-
 cuda/matrix/batch_dense_kernels.cu            |   6 +-
 cuda/matrix/batch_ell_kernels.cu              |   5 +-
 cuda/matrix/batch_struct.hpp                  |   2 +-
 cuda/matrix/coo_kernels.cu                    |  16 +-
 cuda/matrix/csr_kernels.template.cu           | 374 +++++++++---------
 cuda/matrix/dense_kernels.cu                  |  99 ++---
 cuda/matrix/diagonal_kernels.cu               |   7 +-
 cuda/matrix/ell_kernels.cu                    |  38 +-
 cuda/matrix/fbcsr_kernels.template.cu         |  92 +++--
 cuda/matrix/sellp_kernels.cu                  |   7 +-
 cuda/matrix/sparsity_csr_kernels.cu           |  37 +-
 cuda/multigrid/pgm_kernels.cu                 |   4 +-
 cuda/preconditioner/batch_preconditioners.cuh |   2 +-
 cuda/preconditioner/isai_kernels.cu           |   7 +-
 ...el.cu => jacobi_advanced_apply_kernels.cu} |   0
 ...obi_advanced_apply_kernels.instantiate.cu} |   8 +-
 cuda/preconditioner/jacobi_common.hpp.in      |   2 +-
 ...e_kernel.cu => jacobi_generate_kernels.cu} |   0
 ...=> jacobi_generate_kernels.instantiate.cu} |   8 +-
 cuda/preconditioner/jacobi_kernels.cu         |  13 +-
 ...rnel.cu => jacobi_simple_apply_kernels.cu} |   0
 ...acobi_simple_apply_kernels.instantiate.cu} |   8 +-
 cuda/reorder/rcm_kernels.cu                   |   2 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   7 +-
 cuda/solver/batch_cg_kernels.cu               |   6 +-
 cuda/solver/cb_gmres_kernels.cu               |  42 +-
 cuda/solver/common_trs_kernels.cuh            |  88 ++---
 cuda/solver/idr_kernels.cu                    |  30 +-
 cuda/solver/lower_trs_kernels.cu              |   4 +-
 cuda/solver/multigrid_kernels.cu              |   3 +-
 cuda/solver/upper_trs_kernels.cu              |   4 +-
 cuda/stop/criterion_kernels.cu                |   2 +-
 cuda/stop/residual_norm_kernels.cu            |   2 +-
 cuda/test/base/math.cu                        |   2 +-
 cuda/test/components/cooperative_groups.cu    |   6 +-
 cuda/test/components/merging.cu               |   2 +-
 cuda/test/components/searching.cu             |   2 +-
 dpcpp/CMakeLists.txt                          |   4 +-
 dpcpp/test/base/CMakeLists.txt                |   2 +-
 hip/CMakeLists.txt                            |   4 +-
 hip/base/batch_multi_vector_kernels.hip.cpp   |  10 +-
 hip/base/batch_struct.hip.hpp                 |   4 +-
 hip/base/config.hip.hpp                       |   4 +-
 hip/base/device.hip.cpp                       |   4 +-
 hip/base/device_matrix_data_kernels.hip.cpp   |   2 +-
 hip/base/exception.hip.cpp                    |   2 +-
 hip/base/executor.hip.cpp                     |   6 +-
 hip/base/hipblas_bindings.hip.hpp             |  18 +-
 hip/base/hiprand_bindings.hip.hpp             |  16 +-
 hip/base/hipsparse_bindings.hip.hpp           |  18 +-
 hip/base/hipsparse_block_bindings.hip.hpp     |   4 +-
 hip/base/kernel_launch.hip.hpp                |  14 +-
 hip/base/kernel_launch_reduction.hip.hpp      |   4 +-
 hip/base/kernel_launch_solver.hip.hpp         |   2 +-
 hip/base/memory.hip.cpp                       |   4 +-
 hip/base/pointer_mode_guard.hip.hpp           |   2 +-
 hip/base/roctx.hip.cpp                        |   4 +-
 hip/base/scoped_device_id.hip.cpp             |   4 +-
 hip/base/stream.hip.cpp                       |   4 +-
 hip/base/timer.hip.cpp                        |   4 +-
 hip/base/types.hip.hpp                        |   8 +-
 hip/components/atomic.hip.hpp                 |  34 +-
 hip/components/cooperative_groups.hip.hpp     |   4 +-
 .../diagonal_block_manipulation.hip.hpp       |   6 +-
 hip/components/format_conversion.hip.hpp      |   6 +-
 hip/components/memory.hip.hpp                 |   2 +-
 hip/components/prefix_sum.hip.hpp             |   4 +-
 hip/components/reduction.hip.hpp              |  10 +-
 hip/components/searching.hip.hpp              |   2 +-
 hip/components/segment_scan.hip.hpp           |   2 +-
 hip/components/sorting.hip.hpp                |   4 +-
 hip/components/syncfree.hip.hpp               |   6 +-
 hip/components/thread_ids.hip.hpp             |   7 +-
 hip/factorization/cholesky_kernels.hip.cpp    |  14 +-
 .../factorization_kernels.hip.cpp             |   9 +-
 hip/factorization/ic_kernels.hip.cpp          |  26 +-
 hip/factorization/ilu_kernels.hip.cpp         |  26 +-
 hip/factorization/lu_kernels.hip.cpp          |   4 +-
 hip/factorization/par_ic_kernels.hip.cpp      |   4 +-
 hip/factorization/par_ict_kernels.hip.cpp     |   9 +-
 hip/factorization/par_ilu_kernels.hip.cpp     |   8 +-
 ...=> par_ilut_approx_filter_kernels.hip.cpp} |  10 +-
 ...ip.cpp => par_ilut_filter_kernels.hip.cpp} |  10 +-
 .../par_ilut_select_common.hip.cpp            |   2 +-
 ...ip.cpp => par_ilut_select_kernels.hip.cpp} |   4 +-
 ...ip.cpp => par_ilut_spgeam_kernels.hip.cpp} |   6 +-
 ...hip.cpp => par_ilut_sweep_kernels.hip.cpp} |   7 +-
 hip/matrix/batch_csr_kernels.hip.cpp          |   6 +-
 hip/matrix/batch_dense_kernels.hip.cpp        |   8 +-
 hip/matrix/batch_ell_kernels.hip.cpp          |   6 +-
 hip/matrix/batch_struct.hip.hpp               |   2 +-
 hip/matrix/coo_kernels.hip.cpp                |  19 +-
 hip/matrix/csr_kernels.template.hip.cpp       | 153 +++----
 hip/matrix/dense_kernels.hip.cpp              | 103 +++--
 hip/matrix/diagonal_kernels.hip.cpp           |  10 +-
 hip/matrix/ell_kernels.hip.cpp                |  35 +-
 hip/matrix/fbcsr_kernels.template.hip.cpp     | 119 ++++--
 hip/matrix/fft_kernels.hip.cpp                |   2 +-
 hip/matrix/sellp_kernels.hip.cpp              |  10 +-
 hip/matrix/sparsity_csr_kernels.hip.cpp       |  34 +-
 hip/multigrid/pgm_kernels.hip.cpp             |   2 +-
 .../batch_preconditioners.hip.hpp             |   2 +-
 hip/preconditioner/isai_kernels.hip.cpp       |  11 +-
 ...obi_advanced_apply_instantiate.inc.hip.cpp |  12 +-
 hip/preconditioner/jacobi_common.hip.hpp.in   |   2 +-
 .../jacobi_generate_instantiate.inc.hip.cpp   |   8 +-
 .../jacobi_generate_kernel.hip.cpp            |  12 +-
 hip/preconditioner/jacobi_kernels.hip.cpp     |  14 +-
 ...acobi_simple_apply_instantiate.inc.hip.cpp |   8 +-
 .../jacobi_simple_apply_kernel.hip.cpp        |  12 +-
 hip/reorder/rcm_kernels.hip.cpp               |   2 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   8 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   8 +-
 hip/solver/cb_gmres_kernels.hip.cpp           |  36 +-
 hip/solver/common_trs_kernels.hip.hpp         |  30 +-
 hip/solver/idr_kernels.hip.cpp                |  30 +-
 hip/solver/lower_trs_kernels.hip.cpp          |   6 +-
 hip/solver/multigrid_kernels.hip.cpp          |   6 +-
 hip/solver/upper_trs_kernels.hip.cpp          |   6 +-
 hip/stop/criterion_kernels.hip.cpp            |   2 +-
 hip/stop/residual_norm_kernels.hip.cpp        |   6 +-
 hip/test/base/math.hip.cpp                    |   2 +-
 .../components/cooperative_groups.hip.cpp     |   6 +-
 hip/test/components/merging.hip.cpp           |   2 +-
 hip/test/components/searching.hip.cpp         |   2 +-
 include/ginkgo/core/base/executor.hpp         |  26 ++
 omp/CMakeLists.txt                            |   4 +-
 189 files changed, 1592 insertions(+), 1251 deletions(-)
 create mode 100644 accessor/cuda_hip_helper.hpp
 create mode 100644 common/cuda_hip/base/blas_bindings.hpp
 create mode 100644 common/cuda_hip/base/config.hpp
 create mode 100644 common/cuda_hip/base/pointer_mode_guard.hpp
 create mode 100644 common/cuda_hip/base/randlib_bindings.hpp
 create mode 100644 common/cuda_hip/base/runtime.hpp
 create mode 100644 common/cuda_hip/base/sparselib_bindings.hpp
 create mode 100644 common/cuda_hip/base/thrust.hpp
 create mode 100644 common/cuda_hip/base/types.hpp
 create mode 100644 common/cuda_hip/components/cooperative_groups.hpp
 create mode 100644 common/cuda_hip/components/format_conversion.hpp
 create mode 100644 common/cuda_hip/components/memory.hpp
 rename common/cuda_hip/factorization/{par_ict_spgeam_kernels.hpp.inc => par_ict_kernels.hpp.inc} (75%)
 delete mode 100644 common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
 rename common/cuda_hip/preconditioner/{jacobi_advanced_apply_kernel.hpp.inc => jacobi_advanced_apply_kernels.hpp.inc} (100%)
 rename common/cuda_hip/preconditioner/{jacobi_generate_kernel.hpp.inc => jacobi_generate_kernels.hpp.inc} (100%)
 rename common/cuda_hip/preconditioner/{jacobi_simple_apply_kernel.hpp.inc => jacobi_simple_apply_kernels.hpp.inc} (100%)
 rename cuda/factorization/{par_ilut_approx_filter_kernel.cu => par_ilut_approx_filter_kernels.cu} (97%)
 rename cuda/factorization/{par_ilut_filter_kernel.cu => par_ilut_filter_kernels.cu} (96%)
 rename cuda/factorization/{par_ilut_select_kernel.cu => par_ilut_select_kernels.cu} (98%)
 rename cuda/factorization/{par_ilut_spgeam_kernel.cu => par_ilut_spgeam_kernels.cu} (97%)
 rename cuda/factorization/{par_ilut_sweep_kernel.cu => par_ilut_sweep_kernels.cu} (97%)
 rename cuda/preconditioner/{jacobi_advanced_apply_kernel.cu => jacobi_advanced_apply_kernels.cu} (100%)
 rename cuda/preconditioner/{jacobi_advanced_apply_instantiate.inc.cu => jacobi_advanced_apply_kernels.instantiate.cu} (94%)
 rename cuda/preconditioner/{jacobi_generate_kernel.cu => jacobi_generate_kernels.cu} (100%)
 rename cuda/preconditioner/{jacobi_generate_instantiate.inc.cu => jacobi_generate_kernels.instantiate.cu} (94%)
 rename cuda/preconditioner/{jacobi_simple_apply_kernel.cu => jacobi_simple_apply_kernels.cu} (100%)
 rename cuda/preconditioner/{jacobi_simple_apply_instantiate.inc.cu => jacobi_simple_apply_kernels.instantiate.cu} (93%)
 rename hip/factorization/{par_ilut_approx_filter_kernel.hip.cpp => par_ilut_approx_filter_kernels.hip.cpp} (97%)
 rename hip/factorization/{par_ilut_filter_kernel.hip.cpp => par_ilut_filter_kernels.hip.cpp} (96%)
 rename hip/factorization/{par_ilut_select_kernel.hip.cpp => par_ilut_select_kernels.hip.cpp} (99%)
 rename hip/factorization/{par_ilut_spgeam_kernel.hip.cpp => par_ilut_spgeam_kernels.hip.cpp} (98%)
 rename hip/factorization/{par_ilut_sweep_kernel.hip.cpp => par_ilut_sweep_kernels.hip.cpp} (97%)

diff --git a/accessor/cuda_hip_helper.hpp b/accessor/cuda_hip_helper.hpp
new file mode 100644
index 00000000000..225fdfe1b15
--- /dev/null
+++ b/accessor/cuda_hip_helper.hpp
@@ -0,0 +1,38 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
+#define GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
+
+
+#include <utility>
+
+
+#ifdef GKO_COMPILING_HIP
+#include "accessor/hip_helper.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "accessor/cuda_helper.hpp"
+#endif
+
+
+namespace gko {
+namespace acc {
+
+
+template <typename AccType>
+GKO_ACC_INLINE auto as_device_range(AccType&& acc)
+{
+#ifdef GKO_COMPILING_HIP
+    return as_hip_range(std::forward<AccType>(acc));
+#else  // GKO_COMPILING_CUDA
+    return as_cuda_range(std::forward<AccType>(acc));
+#endif
+}
+
+
+}  // namespace acc
+}  // namespace gko
+
+
+#endif  // GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 0aa93a3b141..9f7079f60a3 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -160,7 +160,7 @@ endfunction(ginkgo_create_cuda_test)
 ## Internal function allowing separate test name, filename and target name
 function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
     if(MSVC)
         target_compile_options(${test_target_name}
             PRIVATE
@@ -188,7 +188,7 @@ endfunction(ginkgo_create_hip_test)
 function(ginkgo_create_hip_test_internal test_name filename test_target_name)
     set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
@@ -203,7 +203,7 @@ endfunction()
 function(ginkgo_create_omp_test_internal test_name filename test_target_name)
     ginkgo_build_test_name(${test_name} test_target_name)
     add_executable(${test_target_name} ${test_name}.cpp)
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp)
     target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
     ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu)
@@ -253,7 +253,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec)
         target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
     endif ()
 
-    target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
+    target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} GKO_DEVICE_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
     target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
     # use float for DPC++ if necessary
     if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
@@ -285,13 +285,13 @@ function(ginkgo_create_common_device_test test_name)
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.cu COPYONLY)
         ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN})
-        target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda)
+        target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda)
     endif()
     if(GINKGO_BUILD_HIP)
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY)
         ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN})
-        target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip)
+        target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor GKO_DEVICE_NAMESPACE=hip)
     endif()
 endfunction(ginkgo_create_common_device_test)
 
diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp
new file mode 100644
index 00000000000..1708fb88ce1
--- /dev/null
+++ b/common/cuda_hip/base/blas_bindings.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/hipblas_bindings.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/cublas_bindings.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp
new file mode 100644
index 00000000000..d2085ae946b
--- /dev/null
+++ b/common/cuda_hip/base/config.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/config.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/config.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp
new file mode 100644
index 00000000000..41ff6242e49
--- /dev/null
+++ b/common/cuda_hip/base/pointer_mode_guard.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/pointer_mode_guard.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp
new file mode 100644
index 00000000000..249489b0e68
--- /dev/null
+++ b/common/cuda_hip/base/randlib_bindings.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/hiprand_bindings.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/curand_bindings.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp
new file mode 100644
index 00000000000..ccddfdd2661
--- /dev/null
+++ b/common/cuda_hip/base/runtime.hpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp
new file mode 100644
index 00000000000..bc565f9190a
--- /dev/null
+++ b/common/cuda_hip/base/sparselib_bindings.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/cusparse_bindings.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp
new file mode 100644
index 00000000000..f2015d6d544
--- /dev/null
+++ b/common/cuda_hip/base/thrust.hpp
@@ -0,0 +1,51 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
+
+
+#include <thrust/execution_policy.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#if defined(GKO_COMPILING_CUDA) || \
+    (defined(GKO_COMPILING_HIP) && !GINKGO_HIP_PLATFORM_HCC)
+#include <thrust/system/cuda/detail/execution_policy.h>
+#else
+#include <thrust/system/hip/detail/execution_policy.h>
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+#ifdef GKO_COMPILING_CUDA
+inline auto thrust_policy(std::shared_ptr<const CudaExecutor> exec)
+{
+    return thrust::cuda::par.on(exec->get_stream());
+}
+#else
+inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
+{
+#if GINKGO_HIP_PLATFORM_HCC
+    return thrust::hip::par.on(exec->get_stream());
+#else
+    return thrust::cuda::par.on(exec->get_stream());
+#endif
+}
+#endif
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
new file mode 100644
index 00000000000..213664d3a4d
--- /dev/null
+++ b/common/cuda_hip/base/types.hpp
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifdef GKO_COMPILING_CUDA
+#include "cuda/base/types.hpp"
+#else
+#include "hip/base/types.hip.hpp"
+#endif
diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc
index 3d76cfdcb79..60eaf5a9dd9 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp.inc
@@ -196,3 +196,35 @@ GKO_BIND_ATOMIC_MAX(unsigned long long int);
 
 
 #undef GKO_BIND_ATOMIC_MAX
+
+
+/**
+ * @internal Adds val to *address, viewed as two consecutive floats.
+ *
+ * @note Not a 'real' complex<float> atomic add: two scalar atomic_add calls
+ */
+__forceinline__ __device__ thrust::complex<float> atomic_add(
+    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
+{
+    auto addr = reinterpret_cast<float*>(address);
+    // real at addr[0], imaginary at addr[1]; each part is added separately
+    auto real = atomic_add(addr, val.real());
+    auto imag = atomic_add(addr + 1, val.imag());
+    return {real, imag};
+}
+
+
+/**
+ * @internal Adds val to *address, viewed as two consecutive doubles.
+ *
+ * @note Not a 'real' complex<double> atomic add: two scalar atomic_add calls
+ */
+__forceinline__ __device__ thrust::complex<double> atomic_add(
+    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
+{
+    auto addr = reinterpret_cast<double*>(address);
+    // real at addr[0], imaginary at addr[1]; each part is added separately
+    auto real = atomic_add(addr, val.real());
+    auto imag = atomic_add(addr + 1, val.imag());
+    return {real, imag};
+}
diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp
new file mode 100644
index 00000000000..b1f17842302
--- /dev/null
+++ b/common/cuda_hip/components/cooperative_groups.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/components/cooperative_groups.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/components/cooperative_groups.cuh"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp
new file mode 100644
index 00000000000..a16d09b2e3a
--- /dev/null
+++ b/common/cuda_hip/components/format_conversion.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/components/format_conversion.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/components/format_conversion.cuh"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp
new file mode 100644
index 00000000000..974431e2fb8
--- /dev/null
+++ b/common/cuda_hip/components/memory.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/components/memory.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/components/memory.cuh"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
diff --git a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
similarity index 75%
rename from common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc
index 93a49e56d21..87aa8297345 100644
--- a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
@@ -206,4 +206,72 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
 }
 
 
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void ict_sweep(
+    const IndexType* __restrict__ a_row_ptrs,
+    const IndexType* __restrict__ a_col_idxs,
+    const ValueType* __restrict__ a_vals,
+    const IndexType* __restrict__ l_row_ptrs,
+    const IndexType* __restrict__ l_row_idxs,
+    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
+    IndexType l_nnz)
+{
+    auto l_nz = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (l_nz >= l_nnz) {
+        return;
+    }
+    auto row = l_row_idxs[l_nz];
+    auto col = l_col_idxs[l_nz];
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // look up A(row, col); a_val falls back to zero if it is not stored
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto lh_col_begin = l_row_ptrs[col];
+    auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin;
+    ValueType sum{};
+    IndexType lh_nz{};
+    auto last_entry = col;
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin,
+        lh_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType lh_idx,
+            IndexType lh_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == lh_row && l_col < last_entry) {
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
+            }
+            // transposed element index (NOTE(review): lh_nz is unused below)
+            auto found_transp = subwarp.ballot(lh_row == row);
+            if (found_transp) {
+                lh_nz =
+                    subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // add up the per-thread partial sums across the subwarp
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        auto to_write =
+            row == col ? sqrt(a_val - sum)
+                       : (a_val - sum) /
+                             load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
+        if (is_finite(to_write)) {
+            store_relaxed(l_vals + l_nz, to_write);
+        }
+    }
+}
+
+
 }  // namespace kernel
diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
deleted file mode 100644
index bc58f0a9799..00000000000
--- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void ict_sweep(
-    const IndexType* __restrict__ a_row_ptrs,
-    const IndexType* __restrict__ a_col_idxs,
-    const ValueType* __restrict__ a_vals,
-    const IndexType* __restrict__ l_row_ptrs,
-    const IndexType* __restrict__ l_row_idxs,
-    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
-    IndexType l_nnz)
-{
-    auto l_nz = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
-    if (l_nz >= l_nnz) {
-        return;
-    }
-    auto row = l_row_idxs[l_nz];
-    auto col = l_col_idxs[l_nz];
-    auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    // find entry of A at (row, col)
-    auto a_row_begin = a_row_ptrs[row];
-    auto a_row_end = a_row_ptrs[row + 1];
-    auto a_row_size = a_row_end - a_row_begin;
-    auto a_idx =
-        group_wide_search(a_row_begin, a_row_size, subwarp,
-                          [&](IndexType i) { return a_col_idxs[i] >= col; });
-    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
-    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
-    auto l_row_begin = l_row_ptrs[row];
-    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
-    auto lh_col_begin = l_row_ptrs[col];
-    auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin;
-    ValueType sum{};
-    IndexType lh_nz{};
-    auto last_entry = col;
-    group_merge<subwarp_size>(
-        l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin,
-        lh_col_size, subwarp,
-        [&](IndexType l_idx, IndexType l_col, IndexType lh_idx,
-            IndexType lh_row, IndexType, bool) {
-            // we don't need to use the `bool valid` because last_entry is
-            // already a smaller sentinel value than the one used in group_merge
-            if (l_col == lh_row && l_col < last_entry) {
-                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
-                       conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
-            }
-            // remember the transposed element
-            auto found_transp = subwarp.ballot(lh_row == row);
-            if (found_transp) {
-                lh_nz =
-                    subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1);
-            }
-            return true;
-        });
-    // accumulate result from all threads
-    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
-
-    if (subwarp.thread_rank() == 0) {
-        auto to_write =
-            row == col ? sqrt(a_val - sum)
-                       : (a_val - sum) /
-                             load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
-        if (is_finite(to_write)) {
-            store_relaxed(l_vals + l_nz, to_write);
-        }
-    }
-}
-
-
-}  // namespace kernel
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index b32572546f0..5ca25ecb1e3 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -19,7 +19,7 @@
 
 #define GKO_DEVICE_NAMESPACE cuda
 #define GKO_KERNEL __device__
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -46,7 +46,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #define GKO_DEVICE_NAMESPACE hip
 #define GKO_KERNEL __device__
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt
index 56f83181375..f500ddb6ae5 100644
--- a/core/test/gtest/CMakeLists.txt
+++ b/core/test/gtest/CMakeLists.txt
@@ -25,14 +25,14 @@ if (GINKGO_BUILD_MPI)
     add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi)
 endif()
 if (GINKGO_BUILD_OMP)
-    add_gtest_main("_omp" "GKO_COMPILING_OMP")
+    add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 if (GINKGO_BUILD_CUDA)
-    add_gtest_main("_cuda" "GKO_COMPILING_CUDA")
+    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
 endif()
 if (GINKGO_BUILD_HIP)
-    add_gtest_main("_hip" "GKO_COMPILING_HIP")
+    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 if (GINKGO_BUILD_SYCL)
-    add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP")
+    add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
 endif()
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bd214691a2e..88ae83e9005 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -120,7 +120,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
 endif()
 
 ginkgo_compile_features(ginkgo_cuda)
-target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA)
+target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
@@ -133,7 +133,7 @@ ginkgo_default_includes(ginkgo_cuda)
 ginkgo_install_library(ginkgo_cuda)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA)
+    ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
index 5bc899c11ed..dcaafd5a46c 100644
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -13,13 +13,14 @@
 #include <ginkgo/core/base/range_accessors.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -39,6 +40,7 @@ namespace batch_multi_vector {
 constexpr auto default_block_size = 256;
 constexpr int sm_oversubscription = 4;
 
+
 // clang-format off
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp
index 7c968ec2c6e..5251c594d42 100644
--- a/cuda/base/batch_struct.hpp
+++ b/cuda/base/batch_struct.hpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index 485249b7665..c1cdf1f996e 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -12,8 +12,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -249,6 +249,20 @@ inline void destroy(cublasHandle_t handle)
 
 
 }  // namespace cublas
+
+
+namespace blas {
+
+
+using namespace cublas;
+
+
+#define BLAS_OP_N CUBLAS_OP_N
+#define BLAS_OP_T CUBLAS_OP_T
+#define BLAS_OP_C CUBLAS_OP_C
+
+
+}  // namespace blas
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index b0ae52c5f00..10e09f4a356 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -12,8 +12,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -85,6 +85,18 @@ GKO_BIND_CURAND_RANDOM_VECTOR(std::complex<double>, curandGenerateNormalDouble);
 
 
 }  // namespace curand
+
+
+namespace randlib {
+
+
+using namespace curand;
+
+
+#define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT
+
+
+}  // namespace randlib
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index 87737e8865e..06aaf0c6f1d 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -948,7 +948,7 @@ inline csrilu02Info_t create_ilu0_info()
 }
 
 
-inline void destroy(csrilu02Info_t info)
+inline void destroy_ilu0_info(csrilu02Info_t info)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info));
 }
@@ -962,7 +962,7 @@ inline csric02Info_t create_ic0_info()
 }
 
 
-inline void destroy(csric02Info_t info)
+inline void destroy_ic0_info(csric02Info_t info)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info));
 }
@@ -1463,6 +1463,20 @@ GKO_BIND_CUSPARSE_IC0(std::complex<double>, cusparseZcsric02);
 
 
 }  // namespace cusparse
+
+
+namespace sparselib {
+
+
+using namespace cusparse;
+
+
+#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp
index eddf249a22b..fc64c19796c 100644
--- a/cuda/base/cusparse_block_bindings.hpp
+++ b/cuda/base/cusparse_block_bindings.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu
index ed5601f57a5..554abe8bc37 100644
--- a/cuda/base/device_matrix_data_kernels.cu
+++ b/cuda/base/device_matrix_data_kernels.cu
@@ -14,8 +14,8 @@
 #include <thrust/tuple.h>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 52a92132689..3d1dbf7c92c 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -20,7 +20,7 @@
 #include <ginkgo/core/base/memory.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/cusparse_handle.hpp"
 #include "cuda/base/scoped_device_id.hpp"
diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
index ec8d31ba747..0d4bc4eebd5 100644
--- a/cuda/base/kernel_launch.cuh
+++ b/cuda/base/kernel_launch.cuh
@@ -11,8 +11,9 @@
 #include <thrust/tuple.h>
 
 
-#include "accessor/cuda_helper.hpp"
-#include "cuda/base/types.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
@@ -23,21 +24,21 @@ namespace cuda {
 
 template <typename AccessorType>
 struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_cuda_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
 template <typename AccessorType>
 struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(const gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_cuda_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
index 6146d7248d0..817d19006bc 100644
--- a/cuda/base/kernel_launch_reduction.cuh
+++ b/cuda/base/kernel_launch_reduction.cuh
@@ -8,9 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
index 17988755517..0d9eaeb2653 100644
--- a/cuda/base/kernel_launch_solver.cuh
+++ b/cuda/base/kernel_launch_solver.cuh
@@ -8,6 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace cuda {
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 88e9eb17a35..510d7cef889 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -394,6 +394,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
+using gpuComplex = cuComplex;
+using gpuDoubleComplex = cuDoubleComplex;
+
+
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
index 6dbed0b0d25..1964f0ae196 100644
--- a/cuda/components/atomic.cuh
+++ b/cuda/components/atomic.cuh
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -21,38 +21,6 @@ namespace cuda {
 #include "common/cuda_hip/components/atomic.hpp.inc"
 
 
-/**
- * @internal
- *
- * @note It is not 'real' complex<float> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<float> atomic_add(
-    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
-{
-    cuComplex* addr = reinterpret_cast<cuComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(&(addr->x), val.real());
-    auto imag = atomic_add(&(addr->y), val.imag());
-    return {real, imag};
-}
-
-
-/**
- * @internal
- *
- * @note It is not 'real' complex<double> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<double> atomic_add(
-    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
-{
-    cuDoubleComplex* addr = reinterpret_cast<cuDoubleComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(&(addr->x), val.real());
-    auto imag = atomic_add(&(addr->y), val.imag());
-    return {real, imag};
-}
-
-
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index eae0c957f21..70643a3b16a 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -13,7 +13,7 @@
 #include <cuda.h>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
index d748fcab2e5..a8f27d3a81f 100644
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ b/cuda/components/diagonal_block_manipulation.cuh
@@ -9,9 +9,9 @@
 #include <type_traits>
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
index bccc927c9cd..f0ef007c53c 100644
--- a/cuda/components/format_conversion.cuh
+++ b/cuda/components/format_conversion.cuh
@@ -10,7 +10,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh
index 22bedca9699..97e5d67c23a 100644
--- a/cuda/components/memory.cuh
+++ b/cuda/components/memory.cuh
@@ -12,7 +12,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
index 653de4e9e15..2f6f145e304 100644
--- a/cuda/components/prefix_sum.cuh
+++ b/cuda/components/prefix_sum.cuh
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index ded80fae40a..250c560d44b 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -13,10 +13,11 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh
index 1dc1304a82a..5472ac46ed1 100644
--- a/cuda/components/searching.cuh
+++ b/cuda/components/searching.cuh
@@ -6,7 +6,7 @@
 #define GKO_CUDA_COMPONENTS_SEARCHING_CUH_
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "cuda/components/intrinsics.cuh"
 
 
diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh
index 842f1e06760..6ffb8028334 100644
--- a/cuda/components/segment_scan.cuh
+++ b/cuda/components/segment_scan.cuh
@@ -6,7 +6,7 @@
 #define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh
index e6eb17ec8e4..59e44d1bb82 100644
--- a/cuda/components/sorting.cuh
+++ b/cuda/components/sorting.cuh
@@ -6,8 +6,8 @@
 #define GKO_CUDA_COMPONENTS_SORTING_CUH_
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
index 0d45c8db516..0d5c0d11f43 100644
--- a/cuda/components/syncfree.cuh
+++ b/cuda/components/syncfree.cuh
@@ -9,11 +9,11 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/memory.cuh"
 
 
 namespace gko {
diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
index c3e517e0f9d..1113ea75fc6 100644
--- a/cuda/components/thread_ids.cuh
+++ b/cuda/components/thread_ids.cuh
@@ -6,17 +6,12 @@
 #define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace cuda {
-/**
- * @brief The CUDA thread namespace.
- *
- * @ingroup cuda_thread
- */
 namespace thread {
 
 
diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu
index ca9c419239b..7b06ada9f0e 100644
--- a/cuda/distributed/vector_kernels.cu
+++ b/cuda/distributed/vector_kernels.cu
@@ -5,6 +5,9 @@
 #include "core/distributed/vector_kernels.hpp"
 
 
+#include <functional>
+
+
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu
index 79779f2f54b..e05b0803dc2 100644
--- a/cuda/factorization/cholesky_kernels.cu
+++ b/cuda/factorization/cholesky_kernels.cu
@@ -20,15 +20,15 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/lu_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/syncfree.cuh"
@@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
     // sort postorder_cols inside rows
     {
-        const auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, mtx_nnz);
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation, mtx_nnz);
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                      row_ptrs, postorder_cols, buffer_size);
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+                                       row_ptrs, postorder_cols, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        cusparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                          postorder_cols, permutation, buffer);
-        cusparse::destroy(descr);
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+                           postorder_cols, permutation, buffer);
+        sparselib::destroy(descr);
     }
     // count nonzeros per row of L
     {
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
index 4ea03981a15..309ded37d34 100644
--- a/cuda/factorization/factorization_kernels.cu
+++ b/cuda/factorization/factorization_kernels.cu
@@ -8,12 +8,13 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/searching.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu
index 1afb10ce57a..9d55856f139 100644
--- a/cuda/factorization/ic_kernels.cu
+++ b/cuda/factorization/ic_kernels.cu
@@ -8,7 +8,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -27,37 +27,37 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
              matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_cusparse_handle();
-    auto desc = cusparse::create_mat_descr();
-    auto info = cusparse::create_ic0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ic0_info();
 
     // get buffer size for IC
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    cusparse::ic0_buffer_size(handle, num_rows, nnz, desc,
-                              m->get_const_values(), m->get_const_row_ptrs(),
-                              m->get_const_col_idxs(), info, buffer_size);
+    sparselib::ic0_buffer_size(handle, num_rows, nnz, desc,
+                               m->get_const_values(), m->get_const_row_ptrs(),
+                               m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up IC(0)
-    cusparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                           m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                           info, CUSPARSE_SOLVE_POLICY_USE_LEVEL,
-                           buffer.get_data());
+    sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+                            m->get_const_row_ptrs(), m->get_const_col_idxs(),
+                            info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
+                            buffer.get_data());
 
-    cusparse::ic0(handle, num_rows, nnz, desc, m->get_values(),
-                  m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                  CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+    sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(),
+                   m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
+                   SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
 #if (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
-    cusparse::destroy(info);
-    cusparse::destroy(desc);
+    sparselib::destroy_ic0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu
index 33e59bb54c9..acebec6e94c 100644
--- a/cuda/factorization/ilu_kernels.cu
+++ b/cuda/factorization/ilu_kernels.cu
@@ -8,7 +8,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -27,37 +27,37 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                 matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_cusparse_handle();
-    auto desc = cusparse::create_mat_descr();
-    auto info = cusparse::create_ilu0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ilu0_info();
 
     // get buffer size for ILU
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc,
-                               m->get_const_values(), m->get_const_row_ptrs(),
-                               m->get_const_col_idxs(), info, buffer_size);
+    sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc,
+                                m->get_const_values(), m->get_const_row_ptrs(),
+                                m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up ILU(0)
-    cusparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                            m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                            info, CUSPARSE_SOLVE_POLICY_USE_LEVEL,
-                            buffer.get_data());
+    sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+                             m->get_const_row_ptrs(), m->get_const_col_idxs(),
+                             info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
+                             buffer.get_data());
 
-    cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(),
-                   m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                   CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+    sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(),
+                    m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
+                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
 #if (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
-    cusparse::destroy(info);
-    cusparse::destroy(desc);
+    sparselib::destroy_ilu0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
index 583bf51fb67..9c3069f62cf 100644
--- a/cuda/factorization/lu_kernels.cu
+++ b/cuda/factorization/lu_kernels.cu
@@ -17,11 +17,11 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/syncfree.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
index a9de634f1f9..f493cb11fd1 100644
--- a/cuda/factorization/par_ic_kernels.cu
+++ b/cuda/factorization/par_ic_kernels.cu
@@ -10,9 +10,9 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
index 5f48ceef2f8..d958f81d2f4 100644
--- a/cuda/factorization/par_ict_kernels.cu
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -12,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -19,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
@@ -46,8 +47,7 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc"
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
 
 
 namespace {
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 7a770a39353..cd48dd2a9db 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -5,12 +5,14 @@
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
+#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_approx_filter_kernel.cu
rename to cuda/factorization/par_ilut_approx_filter_kernels.cu
index 853519cd36b..ae544939e17 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu
@@ -15,16 +15,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/sorting.cuh"
diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernels.cu
similarity index 96%
rename from cuda/factorization/par_ilut_filter_kernel.cu
rename to cuda/factorization/par_ilut_filter_kernels.cu
index 0e63f102b72..4a24c5f305b 100644
--- a/cuda/factorization/par_ilut_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_filter_kernels.cu
@@ -12,15 +12,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernels.cu
similarity index 98%
rename from cuda/factorization/par_ilut_select_kernel.cu
rename to cuda/factorization/par_ilut_select_kernels.cu
index ca8b55e504b..6a7bd53c1c4 100644
--- a/cuda/factorization/par_ilut_select_kernel.cu
+++ b/cuda/factorization/par_ilut_select_kernels.cu
@@ -13,6 +13,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/atomic.cuh"
@@ -147,7 +148,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
     kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
                               exec->get_stream()>>>(
-        as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr));
+        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_spgeam_kernel.cu
rename to cuda/factorization/par_ilut_spgeam_kernels.cu
index c4372f66219..0a751c2f48f 100644
--- a/cuda/factorization/par_ilut_spgeam_kernel.cu
+++ b/cuda/factorization/par_ilut_spgeam_kernels.cu
@@ -12,13 +12,14 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
@@ -80,8 +81,8 @@ void add_candidates(syn::value_list<int, subwarp_size>,
     auto u_vals = u->get_const_values();
     auto l_new_row_ptrs = l_new->get_row_ptrs();
     auto u_new_row_ptrs = u_new->get_row_ptrs();
-    // count non-zeros per row
     if (num_blocks > 0) {
+        // count non-zeros per row
         kernel::tri_spgeam_nnz<subwarp_size>
             <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                 lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
@@ -105,8 +106,8 @@ void add_candidates(syn::value_list<int, subwarp_size>,
     auto u_new_col_idxs = u_new->get_col_idxs();
     auto u_new_vals = u_new->get_values();
 
-    // fill columns and values
     if (num_blocks > 0) {
+        // fill columns and values
         kernel::tri_spgeam_init<subwarp_size>
             <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                 lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_sweep_kernel.cu
rename to cuda/factorization/par_ilut_sweep_kernels.cu
index 85fb3f26e21..5924ebe328d 100644
--- a/cuda/factorization/par_ilut_sweep_kernel.cu
+++ b/cuda/factorization/par_ilut_sweep_kernels.cu
@@ -12,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -19,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
index 26c60ae78eb..3e53d6ef0a6 100644
--- a/cuda/log/batch_logger.cuh
+++ b/cuda/log/batch_logger.cuh
@@ -23,4 +23,5 @@ namespace batch_log {
 }  // namespace kernels
 }  // namespace gko
 
+
 #endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
index 6be0a2cab3b..6ec20480405 100644
--- a/cuda/matrix/batch_csr_kernels.cu
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -13,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
index 56268d8d6b4..673b08e5db1 100644
--- a/cuda/matrix/batch_dense_kernels.cu
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -9,15 +9,17 @@
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 3c824cf8da4..8f0160bd154 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -13,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 1c17aea3bfe..5e9c803c9f6 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
index 3d67144c9ec..f138d0b934e 100644
--- a/cuda/matrix/coo_kernels.cu
+++ b/cuda/matrix/coo_kernels.cu
@@ -12,25 +12,21 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/matrix/dense_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
-/**
- * @brief The CUDA namespace.
- *
- * @ingroup cuda
- */
 namespace cuda {
 /**
  * @brief The Coordinate matrix format namespace.
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index 4a779775670..73ce267ec65 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -27,7 +27,13 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -38,15 +44,9 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
@@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 kernel::abstract_merge_path_spmv<items_per_thread>
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
 
         } else if (alpha != nullptr && beta != nullptr) {
             if (grid_num > 0) {
@@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
                         as_device_type(alpha->get_const_values()),
-                        acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_cuda_range(b_vals),
+                        acc::as_device_range(b_vals),
                         as_device_type(beta->get_const_values()),
-                        acc::as_cuda_range(c_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
         } else {
             GKO_KERNEL_NOT_FOUND;
         }
@@ -245,21 +246,21 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_cuda_range(a_vals),
+                    a->get_size()[0], acc::as_device_range(a_vals),
                     a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
         }
     } else if (alpha != nullptr && beta != nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
                     a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_cuda_range(b_vals),
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
         }
     } else {
         GKO_KERNEL_NOT_FOUND;
@@ -301,20 +302,20 @@ void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
                 kernel::abstract_spmv<<<csr_grid, csr_block, 0,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         }
     }
@@ -329,55 +330,55 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                 const ValueType* beta,
                                 matrix::Dense<ValueType>* c)
 {
-    auto handle = exec->get_cusparse_handle();
+    auto handle = exec->get_sparselib_handle();
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value ||
+    if (!sparselib::is_supported<ValueType, IndexType>::value ||
         b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 ||
         c->get_size()[0] == 0) {
         return false;
     }
 
-    auto descr = cusparse::create_mat_descr();
+    auto descr = sparselib::create_mat_descr();
     auto row_ptrs = a->get_const_row_ptrs();
     auto col_idxs = a->get_const_col_idxs();
-    cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
-                   a->get_size()[1], a->get_num_stored_elements(), alpha, descr,
-                   a->get_const_values(), row_ptrs, col_idxs,
-                   b->get_const_values(), beta, c->get_values());
+    sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+                    a->get_size()[1], a->get_num_stored_elements(), alpha,
+                    descr, a->get_const_values(), row_ptrs, col_idxs,
+                    b->get_const_values(), beta, c->get_values());
 
-    cusparse::destroy(descr);
+    sparselib::destroy(descr);
 #else  // CUDA_VERSION >= 11000
     // workaround for a division by zero in cuSPARSE 11.?
     if (a->get_size()[1] == 0) {
         return false;
     }
-    cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
+    cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE;
     auto row_ptrs = const_cast<IndexType*>(a->get_const_row_ptrs());
     auto col_idxs = const_cast<IndexType*>(a->get_const_col_idxs());
     auto values = const_cast<ValueType*>(a->get_const_values());
-    auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1],
-                                    a->get_num_stored_elements(), row_ptrs,
-                                    col_idxs, values);
+    auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1],
+                                     a->get_num_stored_elements(), row_ptrs,
+                                     col_idxs, values);
     auto b_val = const_cast<ValueType*>(b->get_const_values());
     auto c_val = c->get_values();
     if (b->get_stride() == 1 && c->get_stride() == 1) {
-        auto vecb = cusparse::create_dnvec(b->get_size()[0], b_val);
-        auto vecc = cusparse::create_dnvec(c->get_size()[0], c_val);
+        auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val);
+        auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val);
 #if CUDA_VERSION >= 11021
         constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1;
 #else
         constexpr auto alg = CUSPARSE_CSRMV_ALG1;
 #endif
         size_type buffer_size = 0;
-        cusparse::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
-                                             beta, vecc, alg, &buffer_size);
+        sparselib::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
+                                              beta, vecc, alg, &buffer_size);
 
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
-                                  alg, buffer);
-        cusparse::destroy(vecb);
-        cusparse::destroy(vecc);
+        sparselib::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
+                                   alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
     } else {
 #if CUDA_VERSION >= 11060
         if (b->get_size()[1] == 1) {
@@ -388,22 +389,22 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
 #endif  // CUDA_VERSION >= 11060
         cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2;
         auto vecb =
-            cusparse::create_dnmat(b->get_size(), b->get_stride(), b_val);
+            sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val);
         auto vecc =
-            cusparse::create_dnmat(c->get_size(), c->get_stride(), c_val);
+            sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val);
         size_type buffer_size = 0;
-        cusparse::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
-                                             vecb, beta, vecc, alg,
-                                             &buffer_size);
+        sparselib::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
+                                              vecb, beta, vecc, alg,
+                                              &buffer_size);
 
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
-                                  vecc, alg, buffer);
-        cusparse::destroy(vecb);
-        cusparse::destroy(vecc);
+        sparselib::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
+                                   vecc, alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
     }
-    cusparse::destroy(mat);
+    sparselib::destroy(mat);
 #endif
     return true;
 }
@@ -437,8 +438,8 @@ bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
         return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
                                           beta->get_const_values(), c);
     } else {
-        auto handle = exec->get_cusparse_handle();
-        cusparse::pointer_mode_guard pm_guard(handle);
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto valpha = one<ValueType>();
         const auto vbeta = zero<ValueType>();
         return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
@@ -583,8 +584,8 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto b_col_idxs = b->get_const_col_idxs();
     auto c_row_ptrs = c->get_row_ptrs();
 
-    auto handle = exec->get_cusparse_handle();
-    cusparse::pointer_mode_guard pm_guard(handle);
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
 
     auto alpha = one<ValueType>();
     auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
@@ -600,18 +601,18 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto& c_vals_array = c_builder.get_value_array();
 
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value) {
+    if (!sparselib::is_supported<ValueType, IndexType>::value) {
         GKO_NOT_IMPLEMENTED;
     }
 
-    auto a_descr = cusparse::create_mat_descr();
-    auto b_descr = cusparse::create_mat_descr();
-    auto c_descr = cusparse::create_mat_descr();
-    auto d_descr = cusparse::create_mat_descr();
-    auto info = cusparse::create_spgemm_info();
+    auto a_descr = sparselib::create_mat_descr();
+    auto b_descr = sparselib::create_mat_descr();
+    auto c_descr = sparselib::create_mat_descr();
+    auto d_descr = sparselib::create_mat_descr();
+    auto info = sparselib::create_spgemm_info();
     // allocate buffer
     size_type buffer_size{};
-    cusparse::spgemm_buffer_size(
+    sparselib::spgemm_buffer_size(
         handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
         b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
         null_index, null_index, info, buffer_size);
@@ -620,74 +621,75 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
 
     // count nnz
     IndexType c_nnz{};
-    cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                         a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                         d_descr, zero_nnz, null_index, null_index, c_descr,
-                         c_row_ptrs, &c_nnz, info, buffer);
+    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                          d_descr, zero_nnz, null_index, null_index, c_descr,
+                          c_row_ptrs, &c_nnz, info, buffer);
 
     // accumulate non-zeros
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
     auto c_col_idxs = c_col_idxs_array.get_data();
     auto c_vals = c_vals_array.get_data();
-    cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
-                     a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs,
-                     b_col_idxs, null_value, d_descr, zero_nnz, null_value,
-                     null_index, null_index, c_descr, c_vals, c_row_ptrs,
-                     c_col_idxs, info, buffer);
-
-    cusparse::destroy(info);
-    cusparse::destroy(d_descr);
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
+    sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                      b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                      null_value, null_index, null_index, c_descr, c_vals,
+                      c_row_ptrs, c_col_idxs, info, buffer);
+
+    sparselib::destroy(info);
+    sparselib::destroy(d_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
 
 #else   // CUDA_VERSION >= 11000
     const auto beta = zero<ValueType>();
-    auto spgemm_descr = cusparse::create_spgemm_descr();
-    auto a_descr = cusparse::create_csr(
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
         m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
         const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = cusparse::create_csr(
+    auto b_descr = sparselib::create_csr(
         k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index,
-                                        null_value);
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
 
     // estimate work
     size_type buffer1_size{};
-    cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                     c_descr, spgemm_descr, buffer1_size,
-                                     nullptr);
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      nullptr);
     array<char> buffer1{exec, buffer1_size};
-    cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                     c_descr, spgemm_descr, buffer1_size,
-                                     buffer1.get_data());
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      buffer1.get_data());
 
     // compute spgemm
     size_type buffer2_size{};
-    cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                             spgemm_descr, buffer1.get_data(), buffer2_size,
-                             nullptr);
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              nullptr);
     array<char> buffer2{exec, buffer2_size};
-    cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                             spgemm_descr, buffer1.get_data(), buffer2_size,
-                             buffer2.get_data());
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              buffer2.get_data());
 
     // copy data to result
-    auto c_nnz = cusparse::sparse_matrix_nnz(c_descr);
+    auto c_nnz = sparselib::sparse_matrix_nnz(c_descr);
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
-    cusparse::csr_set_pointers(c_descr, c_row_ptrs, c_col_idxs_array.get_data(),
-                               c_vals_array.get_data());
+    sparselib::csr_set_pointers(c_descr, c_row_ptrs,
+                                c_col_idxs_array.get_data(),
+                                c_vals_array.get_data());
 
-    cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                          spgemm_descr);
+    sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                           spgemm_descr);
 
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
-    cusparse::destroy(spgemm_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
 #endif  // CUDA_VERSION >= 11000
 }
 
@@ -701,8 +703,8 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Csr<ValueType, IndexType>* d,
                      matrix::Csr<ValueType, IndexType>* c)
 {
-    auto handle = exec->get_cusparse_handle();
-    cusparse::pointer_mode_guard pm_guard(handle);
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
 
     auto valpha = exec->copy_val_to_host(alpha->get_const_values());
     auto a_nnz = IndexType(a->get_num_stored_elements());
@@ -724,102 +726,102 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto c_row_ptrs = c->get_row_ptrs();
 
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value) {
+    if (!sparselib::is_supported<ValueType, IndexType>::value) {
         GKO_NOT_IMPLEMENTED;
     }
 
     matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
     auto& c_col_idxs_array = c_builder.get_col_idx_array();
     auto& c_vals_array = c_builder.get_value_array();
-    auto a_descr = cusparse::create_mat_descr();
-    auto b_descr = cusparse::create_mat_descr();
-    auto c_descr = cusparse::create_mat_descr();
-    auto d_descr = cusparse::create_mat_descr();
-    auto info = cusparse::create_spgemm_info();
+    auto a_descr = sparselib::create_mat_descr();
+    auto b_descr = sparselib::create_mat_descr();
+    auto c_descr = sparselib::create_mat_descr();
+    auto d_descr = sparselib::create_mat_descr();
+    auto info = sparselib::create_spgemm_info();
     // allocate buffer
     size_type buffer_size{};
-    cusparse::spgemm_buffer_size(handle, m, n, k, &valpha, a_descr, a_nnz,
-                                 a_row_ptrs, a_col_idxs, b_descr, b_nnz,
-                                 b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
-                                 d_row_ptrs, d_col_idxs, info, buffer_size);
+    sparselib::spgemm_buffer_size(
+        handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+        b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
+        d_row_ptrs, d_col_idxs, info, buffer_size);
     array<char> buffer_array(exec, buffer_size);
     auto buffer = buffer_array.get_data();
 
     // count nnz
     IndexType c_nnz{};
-    cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                         a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                         d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
-                         c_row_ptrs, &c_nnz, info, buffer);
+    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                          d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
+                          c_row_ptrs, &c_nnz, info, buffer);
 
     // accumulate non-zeros
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
     auto c_col_idxs = c_col_idxs_array.get_data();
     auto c_vals = c_vals_array.get_data();
-    cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
-                     a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs,
-                     b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, d_row_ptrs,
-                     d_col_idxs, c_descr, c_vals, c_row_ptrs, c_col_idxs, info,
-                     buffer);
-
-    cusparse::destroy(info);
-    cusparse::destroy(d_descr);
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
+    sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
+                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                      b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals,
+                      d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs,
+                      c_col_idxs, info, buffer);
+
+    sparselib::destroy(info);
+    sparselib::destroy(d_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
 #else   // CUDA_VERSION >= 11000
     auto null_value = static_cast<ValueType*>(nullptr);
     auto null_index = static_cast<IndexType*>(nullptr);
     auto one_val = one<ValueType>();
     auto zero_val = zero<ValueType>();
     auto zero_nnz = IndexType{};
-    auto spgemm_descr = cusparse::create_spgemm_descr();
-    auto a_descr = cusparse::create_csr(
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
         m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
         const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = cusparse::create_csr(
+    auto b_descr = sparselib::create_csr(
         k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index,
-                                        null_value);
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
 
     // estimate work
     size_type buffer1_size{};
-    cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                     &zero_val, c_descr, spgemm_descr,
-                                     buffer1_size, nullptr);
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, nullptr);
     array<char> buffer1{exec, buffer1_size};
-    cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                     &zero_val, c_descr, spgemm_descr,
-                                     buffer1_size, buffer1.get_data());
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, buffer1.get_data());
 
     // compute spgemm
     size_type buffer2_size{};
-    cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                             c_descr, spgemm_descr, buffer1.get_data(),
-                             buffer2_size, nullptr);
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, nullptr);
     array<char> buffer2{exec, buffer2_size};
-    cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                             c_descr, spgemm_descr, buffer1.get_data(),
-                             buffer2_size, buffer2.get_data());
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, buffer2.get_data());
 
     // write result to temporary storage
-    auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr);
+    auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr);
     array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
     array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
     array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
-    cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
-                               c_tmp_col_idxs_array.get_data(),
-                               c_tmp_vals_array.get_data());
+    sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
+                                c_tmp_col_idxs_array.get_data(),
+                                c_tmp_vals_array.get_data());
 
-    cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
-                          c_descr, spgemm_descr);
+    sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
+                           c_descr, spgemm_descr);
 
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
-    cusparse::destroy(spgemm_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
 
     auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
     auto nnz_per_row = spgeam_total_nnz / m;
@@ -846,13 +848,13 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     if (orig->get_size()[0] == 0) {
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -864,8 +866,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
         cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
         size_type buffer_size = 0;
-        cusparse::transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -873,8 +875,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
             idxBase, alg, &buffer_size);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -898,13 +900,13 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_size = default_block_size;
     const auto grid_size =
         ceildiv(trans->get_num_stored_elements(), block_size);
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -916,8 +918,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
         cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
         size_type buffer_size = 0;
-        cusparse::transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -925,8 +927,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
             idxBase, alg, &buffer_size);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -948,9 +950,9 @@ template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         auto m = IndexType(to_sort->get_size()[0]);
         auto n = IndexType(to_sort->get_size()[1]);
         auto nnz = IndexType(to_sort->get_num_stored_elements());
@@ -966,30 +968,30 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        cusparse::create_identity_permutation(handle, nnz, permutation);
+        sparselib::create_identity_permutation(handle, nnz, permutation);
 
         // allocate buffer
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
-                                      buffer_size);
+        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+                                       buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
 
         // sort column indices
-        cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
-                          permutation, buffer);
+        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                           permutation, buffer);
 
         // sort values
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-        cusparse::gather(handle, nnz, tmp_vals, vals, permutation);
+        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
 #else  // CUDA_VERSION >= 11000
-        auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals);
+        auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals);
         auto tmp_vec =
-            cusparse::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
-        cusparse::gather(handle, tmp_vec, val_vec);
+            sparselib::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
+        sparselib::gather(handle, tmp_vec, val_vec);
 #endif
 
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 04b34953c6a..b117c39107b 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -17,12 +17,13 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
@@ -53,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Dense<ValueType>* result, array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::dot(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), y->get_const_values(), y->get_stride(),
-                        result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::dot(handle, x->get_size()[0], x->get_const_values(),
+                      x->get_stride(), y->get_const_values(), y->get_stride(),
+                      result->get_values());
         } else {
             compute_dot(exec, x, y, result, tmp);
         }
@@ -78,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                                array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                             x->get_stride(), y->get_const_values(),
-                             y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), y->get_const_values(),
+                           y->get_stride(), result->get_values());
         } else {
             compute_conj_dot(exec, x, y, result, tmp);
         }
@@ -102,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             array<char>& tmp)
 {
     if (x->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                          x->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), result->get_values());
         } else {
             compute_norm2(exec, x, result, tmp);
         }
@@ -124,18 +125,18 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                cublas::pointer_mode_guard pm_guard(handle);
+                blas::pointer_mode_guard pm_guard(handle);
                 auto alpha = one<ValueType>();
                 auto beta = zero<ValueType>();
-                cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1],
-                             c->get_size()[0], a->get_size()[1], &alpha,
-                             b->get_const_values(), b->get_stride(),
-                             a->get_const_values(), a->get_stride(), &beta,
-                             c->get_values(), c->get_stride());
+                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                           c->get_size()[0], a->get_size()[1], &alpha,
+                           b->get_const_values(), b->get_stride(),
+                           a->get_const_values(), a->get_stride(), &beta,
+                           c->get_values(), c->get_stride());
             } else {
                 dense::fill(exec, c, zero<ValueType>());
             }
@@ -154,15 +155,15 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    if (cublas::is_supported<ValueType>::value) {
+    if (blas::is_supported<ValueType>::value) {
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                cublas::gemm(
-                    exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N,
-                    c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                    alpha->get_const_values(), b->get_const_values(),
-                    b->get_stride(), a->get_const_values(), a->get_stride(),
-                    beta->get_const_values(), c->get_values(), c->get_stride());
+                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                           alpha->get_const_values(), b->get_const_values(),
+                           b->get_stride(), a->get_const_values(),
+                           a->get_stride(), beta->get_const_values(),
+                           c->get_values(), c->get_stride());
             } else {
                 dense::scale(exec, beta, c);
             }
@@ -180,17 +181,17 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0],
-                         orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta, trans->get_values(),
-                         trans->get_stride(), trans->get_values(),
-                         trans->get_stride());
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -205,17 +206,17 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0],
-                         orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta, trans->get_values(),
-                         trans->get_stride(), trans->get_values(),
-                         trans->get_stride());
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu
index b81329e29a0..e362ff0462b 100644
--- a/cuda/matrix/diagonal_kernels.cu
+++ b/cuda/matrix/diagonal_kernels.cu
@@ -9,9 +9,10 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 9c23abc9dc4..105122ec4a9 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -15,19 +15,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
@@ -97,9 +98,9 @@ void abstract_spmv(syn::value_list<int, info>,
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
     using a_accessor =
-        gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
+        acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
     using b_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
+        acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
 
     const auto nrows = a->get_size()[0];
     const auto stride = a->get_stride();
@@ -114,11 +115,11 @@ void abstract_spmv(syn::value_list<int, info>,
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
 
-    const auto a_vals = gko::acc::range<a_accessor>(
+    const auto a_vals = acc::range<a_accessor>(
         std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
             num_stored_elements_per_row * stride)}},
         a->get_const_values());
-    const auto b_vals = gko::acc::range<b_accessor>(
+    const auto b_vals = acc::range<b_accessor>(
         std::array<acc::size_type, 2>{
             {static_cast<acc::size_type>(b->get_size()[0]),
              static_cast<acc::size_type>(b->get_size()[1])}},
@@ -130,20 +131,21 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(a_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
                     a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    num_stored_elements_per_row, acc::as_device_range(b_vals),
                     as_device_type(c->get_values()), c->get_stride());
         }
     } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = gko::acc::range<a_accessor>(
+        const auto alpha_val = acc::range<a_accessor>(
             std::array<acc::size_type, 1>{1}, alpha->get_const_values());
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(alpha_val),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    stride, num_stored_elements_per_row,
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
                     as_device_type(c->get_values()), c->get_stride());
         }
@@ -212,7 +214,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the cuda kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
@@ -246,7 +248,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the cuda kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu
index 8b835c6fd7d..ad36c84216e 100644
--- a/cuda/matrix/fbcsr_kernels.template.cu
+++ b/cuda/matrix/fbcsr_kernels.template.cu
@@ -24,6 +24,13 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/block_sizes.hpp"
@@ -33,16 +40,10 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/cusparse_block_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
@@ -72,6 +73,7 @@ constexpr int default_block_size{512};
 
 namespace {
 
+
 template <typename ValueType>
 void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
                      const size_type nrows, const size_type ncols,
@@ -81,21 +83,22 @@ void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
     if (nrows == 0) {
         return;
     }
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha,
-                         orig, orig_stride, &beta, trans, trans_stride, trans,
-                         trans_stride);
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
+                       trans_stride);
         }
     } else {
         GKO_NOT_IMPLEMENTED;
     }
 }
 
+
 }  // namespace
 
 
@@ -114,12 +117,12 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         dense::fill(exec, c, zero<ValueType>());
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
-        cusparse::pointer_mode_guard pm_guard(handle);
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto alpha = one<ValueType>();
         const auto beta = zero<ValueType>();
-        auto descr = cusparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -133,21 +136,21 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
-                            nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), &beta, c->get_values());
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
+                             bs, b->get_const_values(), &beta, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                            &alpha, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), in_stride, &beta,
-                            trans_c.get_data(), trans_stride);
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             &alpha, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, &beta,
+                             trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -171,11 +174,11 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         dense::scale(exec, beta, c);
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
         const auto alphp = alpha->get_const_values();
         const auto betap = beta->get_const_values();
-        auto descr = cusparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -189,23 +192,23 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
-                            nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), betap, c->get_values());
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), betap, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
-            cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                            alphp, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), in_stride, betap,
-                            trans_c.get_data(), trans_stride);
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, betap,
+                             trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -244,20 +247,21 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
                const matrix::Fbcsr<ValueType, IndexType>* const orig,
                matrix::Fbcsr<ValueType, IndexType>* const trans)
 {
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+#ifdef GKO_COMPILING_CUDA
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         const int bs = orig->get_block_size();
         const IndexType nnzb =
             static_cast<IndexType>(orig->get_num_stored_blocks());
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = cusparse::bsr_transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_num_block_rows(),
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
             orig->get_num_block_cols(), nnzb, orig->get_const_values(),
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::bsr_transpose(
-            exec->get_cusparse_handle(), orig->get_num_block_rows(),
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
             orig->get_num_block_cols(), nnzb, orig->get_const_values(),
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
             trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
@@ -268,7 +272,9 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
             fixedblock::compiled_kernels(),
             [bs](int compiled_block_size) { return bs == compiled_block_size; },
             syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else {
+    } else
+#endif
+    {
         fallback_transpose(exec, orig, trans);
     }
 }
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index 5eadf0d3858..d6c20075ef4 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -12,10 +12,11 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 3a3d60b24e0..311e4d3782c 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -11,18 +11,19 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -41,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
+constexpr int spmv_block_size = 256;
+#else
 constexpr int spmv_block_size = 128;
+#endif
 constexpr int warps_in_block = 4;
 
 
@@ -105,16 +110,16 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                 a->get_size()[0], as_device_type(a->get_const_value()),
                 a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subwarp_size>
             <<<grid, block, 0, exec->get_stream()>>>(
                 a->get_size()[0], as_device_type(alpha->get_const_values()),
                 as_device_type(a->get_const_value()), a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals),
+                acc::as_device_range(b_vals),
                 as_device_type(beta->get_const_values()),
-                acc::as_cuda_range(c_vals));
+                acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -168,21 +173,21 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation,
                                    to_sort->get_num_nonzeros());
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs,
-                                      col_idxs, buffer_size);
+        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
+                                       row_ptrs, col_idxs, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
-                          col_idxs, permutation, buffer);
-        cusparse::destroy(descr);
+        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+                           col_idxs, permutation, buffer);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu
index b5e9fa1612d..75c3dd911ad 100644
--- a/cuda/multigrid/pgm_kernels.cu
+++ b/cuda/multigrid/pgm_kernels.cu
@@ -8,8 +8,6 @@
 #include <memory>
 
 
-#include <cuda.h>
-#include <cusparse.h>
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/reduce.h>
@@ -21,8 +19,8 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index 0eae8650bdc..e83d6e04ee9 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -6,9 +6,9 @@
 #define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
 
 
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 
 
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
index 6551f32bb86..d0dd516466a 100644
--- a/cuda/preconditioner/isai_kernels.cu
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -10,12 +10,13 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_advanced_apply_kernel.cu
rename to cuda/preconditioner/jacobi_advanced_apply_kernels.cu
diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
similarity index 94%
rename from cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index 5633ad15a4b..ed33437c613 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in
index fe99fd88227..aeb47fec97e 100644
--- a/cuda/preconditioner/jacobi_common.hpp.in
+++ b/cuda/preconditioner/jacobi_common.hpp.in
@@ -6,7 +6,7 @@
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 namespace gko {
 namespace kernels {
diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_generate_kernel.cu
rename to cuda/preconditioner/jacobi_generate_kernels.cu
diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
similarity index 94%
rename from cuda/preconditioner/jacobi_generate_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index a76c4fba271..56e8ff6f16f 100644
--- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -9,14 +9,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/diagonal_block_manipulation.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -35,7 +35,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
index 2508f33acb9..bce2ff23303 100644
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -8,13 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
@@ -30,8 +31,12 @@ namespace cuda {
 namespace jacobi {
 
 
-// a total of 32 warps (1024 threads)
+// a total of 32/16 warps (1024 threads)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
 constexpr int default_num_warps = 32;
+#endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_simple_apply_kernel.cu
rename to cuda/preconditioner/jacobi_simple_apply_kernels.cu
diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
similarity index 93%
rename from cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 07689daa815..97a7bfff489 100644
--- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu
index d699d00dfb6..72322016fba 100644
--- a/cuda/reorder/rcm_kernels.cu
+++ b/cuda/reorder/rcm_kernels.cu
@@ -25,9 +25,9 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 0ce95e2d34f..58e1a6b7b0d 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -13,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index f429e5f22f0..398e831eb09 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -13,15 +13,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
index 107835ca1b5..3dbefadf22a 100644
--- a/cuda/solver/cb_gmres_kernels.cu
+++ b/cuda/solver/cb_gmres_kernels.cu
@@ -8,25 +8,25 @@
 #include <algorithm>
 
 
+#include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -44,6 +44,8 @@ namespace cb_gmres {
 
 
 constexpr int default_block_size = 512;
+// default_dot_dim cannot be 64 in HIP because 64 * 64 exceeds its max block
+// size limit.
 constexpr int default_dot_dim = 32;
 constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
@@ -116,7 +118,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     restart_1_kernel<block_size>
         <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
             residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(residual_norm_collection->get_values()),
             residual_norm_collection->get_stride());
     kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm,
@@ -145,7 +147,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 residual_norm->get_stride(),
                 as_device_type(arnoldi_norm->get_const_values() +
                                2 * stride_arnoldi),
-                stride_arnoldi, acc::as_cuda_range(krylov_bases));
+                stride_arnoldi, acc::as_device_range(krylov_bases));
     }
 
     const auto grid_dim_2 =
@@ -158,7 +160,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
             residual->get_stride(),
             as_device_type(residual_norm->get_const_values()),
             as_device_type(residual_norm_collection->get_values()),
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(next_krylov_basis->get_values()),
             next_krylov_basis->get_stride(),
             as_device_type(final_iter_nums->get_data()));
@@ -212,6 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(next_krylov_basis->get_const_values()),
         stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
         as_device_type(stop_status));
+    // nrmP = norm(next_krylov_basis)
     zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
                 hessenberg_iter->get_values());
     if (dim_size[1] > 1) {
@@ -219,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
             <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                 dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     } else {
@@ -228,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[0],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     }
@@ -240,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter + 1, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
 
@@ -269,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            exec->get_stream()>>>(
             dim_size[1], as_device_type(arnoldi_norm->get_values()),
             stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
             as_device_type(stop_status), as_device_type(reorth_status),
             as_device_type(num_reorth->get_data()));
     num_reorth_host = get_element(*num_reorth, 0);
@@ -282,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                 <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                     dim_size[0], dim_size[1],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         } else {
@@ -291,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                    exec->get_stream()>>>(
                     dim_size[0],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         }
@@ -303,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                default_block_size, 0, exec->get_stream()>>>(
                 iter + 1, dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg,
                 as_device_type(buffer_iter->get_const_values()), stride_buffer,
@@ -335,18 +338,19 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[1], as_device_type(arnoldi_norm->get_values()),
                 stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
                 as_device_type(stop_status), as_device_type(reorth_status),
                 num_reorth->get_data());
         num_reorth_host = get_element(*num_reorth, 0);
+        // num_reorth_host := number of next_krylov vectors to be
+        // reorthogonalized
     }
-
     update_krylov_next_krylov_kernel<default_block_size>
         <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
            default_block_size, 0, exec->get_stream()>>>(
             iter, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
     // next_krylov_basis /= hessenberg(iter, iter + 1)
@@ -460,7 +464,7 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
 
     calculate_Qy_kernel<block_size>
         <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_cuda_range(krylov_bases),
+            num_rows, num_cols, acc::as_device_range(krylov_bases),
             as_device_type(y->get_const_values()), y->get_stride(),
             as_device_type(before_preconditioner->get_values()),
             stride_before_preconditioner,
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index cb627e04b53..549925bf2e7 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -20,15 +20,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
@@ -75,18 +75,18 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         if (num_rhs == 0) {
             return;
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        spsm_descr = cusparse::create_spsm_descr();
-        descr_a = cusparse::create_csr(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        spsm_descr = sparselib::create_spsm_descr();
+        descr_a = sparselib::create_csr(
             matrix->get_size()[0], matrix->get_size()[1],
             matrix->get_num_stored_elements(),
             const_cast<IndexType*>(matrix->get_const_row_ptrs()),
             const_cast<IndexType*>(matrix->get_const_col_idxs()),
             const_cast<ValueType*>(matrix->get_const_values()));
-        cusparse::set_attribute<cusparseFillMode_t>(
+        sparselib::set_attribute<cusparseFillMode_t>(
             descr_a, CUSPARSE_SPMAT_FILL_MODE,
             is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
-        cusparse::set_attribute<cusparseDiagType_t>(
+        sparselib::set_attribute<cusparseDiagType_t>(
             descr_a, CUSPARSE_SPMAT_DIAG_TYPE,
             unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
 
@@ -94,28 +94,28 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         // workaround suggested by NVIDIA engineers: for some reason
         // cusparse needs non-nullptr input vectors even for analysis
         // also make sure they are aligned by 16 bytes
-        auto descr_b = cusparse::create_dnmat(
+        auto descr_b = sparselib::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
             reinterpret_cast<ValueType*>(0xDEAD0));
-        auto descr_c = cusparse::create_dnmat(
+        auto descr_c = sparselib::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
             reinterpret_cast<ValueType*>(0xDEAF0));
 
-        auto work_size = cusparse::spsm_buffer_size(
+        auto work_size = sparselib::spsm_buffer_size(
             handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
             CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a,
             descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
         work.resize_and_reset(work_size);
 
-        cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                one<ValueType>(), descr_a, descr_b, descr_c,
-                                CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
-                                work.get_data());
+        sparselib::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                 one<ValueType>(), descr_a, descr_b, descr_c,
+                                 CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
+                                 work.get_data());
 
-        cusparse::destroy(descr_b);
-        cusparse::destroy(descr_c);
+        sparselib::destroy(descr_b);
+        sparselib::destroy(descr_c);
     }
 
     void solve(const matrix::Csr<ValueType, IndexType>*,
@@ -134,30 +134,30 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                 "provided at generation time. Check the value specified in "
                 ".with_num_rhs(...)."};
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        auto descr_b = cusparse::create_dnmat(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto descr_b = sparselib::create_dnmat(
             input->get_size(), input->get_stride(),
             const_cast<ValueType*>(input->get_const_values()));
-        auto descr_c = cusparse::create_dnmat(
+        auto descr_c = sparselib::create_dnmat(
             output->get_size(), output->get_stride(), output->get_values());
 
-        cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                             CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(),
-                             descr_a, descr_b, descr_c,
-                             CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
+        sparselib::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                              CUSPARSE_OPERATION_NON_TRANSPOSE,
+                              one<ValueType>(), descr_a, descr_b, descr_c,
+                              CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
-        cusparse::destroy(descr_b);
-        cusparse::destroy(descr_c);
+        sparselib::destroy(descr_b);
+        sparselib::destroy(descr_c);
     }
 
     ~CudaSolveStruct()
     {
         if (descr_a) {
-            cusparse::destroy(descr_a);
+            sparselib::destroy(descr_a);
             descr_a = nullptr;
         }
         if (spsm_descr) {
-            cusparse::destroy(spsm_descr);
+            sparselib::destroy(spsm_descr);
             spsm_descr = nullptr;
         }
     }
@@ -200,21 +200,21 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         if (num_rhs == 0) {
             return;
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        factor_descr = cusparse::create_mat_descr();
-        solve_info = cusparse::create_solve_info();
-        cusparse::set_mat_fill_mode(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        factor_descr = sparselib::create_mat_descr();
+        solve_info = sparselib::create_solve_info();
+        sparselib::set_mat_fill_mode(
             factor_descr,
             is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
-        cusparse::set_mat_diag_type(
+        sparselib::set_mat_diag_type(
             factor_descr,
             unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
         algorithm = 0;
-        policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+        policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL;
 
         size_type work_size{};
 
-        cusparse::buffer_size_ext(
+        sparselib::buffer_size_ext(
             handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
             CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
@@ -225,7 +225,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         // allocate workspace
         work.resize_and_reset(work_size);
 
-        cusparse::csrsm2_analysis(
+        sparselib::csrsm2_analysis(
             handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
             CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
@@ -250,9 +250,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                 "provided at generation time. Check the value specified in "
                 ".with_num_rhs(...)."};
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
+        sparselib::pointer_mode_guard pm_guard(handle);
         dense::copy(exec, input, output);
-        cusparse::csrsm2_solve(
+        sparselib::csrsm2_solve(
             handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
             CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
             output->get_stride(), matrix->get_num_stored_elements(),
@@ -265,11 +265,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
     ~CudaSolveStruct()
     {
         if (factor_descr) {
-            cusparse::destroy(factor_descr);
+            sparselib::destroy(factor_descr);
             factor_descr = nullptr;
         }
         if (solve_info) {
-            cusparse::destroy(solve_info);
+            sparselib::destroy(solve_info);
             solve_info = nullptr;
         }
     }
@@ -304,7 +304,7 @@ void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
     if (matrix->get_size()[0] == 0) {
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         solve_struct = std::make_shared<CudaSolveStruct<ValueType, IndexType>>(
             exec, matrix, num_rhs, is_upper, unit_diag);
     } else {
@@ -327,7 +327,7 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
     }
     using vec = matrix::Dense<ValueType>;
 
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         if (auto cuda_solve_struct =
                 dynamic_cast<const CudaSolveStruct<ValueType, IndexType>*>(
                     solve_struct)) {
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index 9c97d99f13c..f7e89c9d9d8 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -13,14 +13,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/randlib_bindings.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/curand_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
@@ -69,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
                                  bool deterministic)
 {
     if (!deterministic) {
-        auto gen = curand::rand_generator(std::random_device{}(),
-                                          CURAND_RNG_PSEUDO_DEFAULT,
-                                          exec->get_stream());
-        curand::rand_vector(
+        auto gen = randlib::rand_generator(std::random_device{}(),
+                                           RANDLIB_RNG_PSEUDO_DEFAULT,
+                                           exec->get_stream());
+        randlib::rand_vector(
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
-        curand::destroy(gen);
+        randlib::destroy(gen);
     }
 }
 
@@ -145,9 +146,8 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
                 as_device_type(alpha->get_values()),
                 stop_status->get_const_data());
         } else {
-            cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
-                        g_k->get_values(), g_k->get_stride(),
-                        alpha->get_values());
+            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
+                      g_k->get_stride(), alpha->get_values());
         }
         update_g_k_and_u_kernel<default_block_size>
             <<<ceildiv(size * g_k->get_stride(), default_block_size),
@@ -196,8 +196,8 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
                 as_device_type(g_k->get_const_values()), g_k->get_stride(),
                 as_device_type(m_i), stop_status->get_const_data());
         } else {
-            cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
-                        g_k->get_const_values(), g_k->get_stride(), m_i);
+            blas::dot(exec->get_blas_handle(), size, p_i, 1,
+                      g_k->get_const_values(), g_k->get_stride(), m_i);
         }
     }
 }
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 46b4cb4c2e4..002cc0140cb 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -17,9 +17,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
index 4eea02883b2..1d31130623a 100644
--- a/cuda/solver/multigrid_kernels.cu
+++ b/cuda/solver/multigrid_kernels.cu
@@ -11,9 +11,10 @@
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index a8ee5f77cca..e1e01538f79 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -17,9 +17,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu
index 17bcbbc1567..e54b5d140f2 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/cuda/stop/criterion_kernels.cu
@@ -10,8 +10,8 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index 18102d91ec5..7146d0cbf04 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -10,9 +10,9 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index c7f70fe3011..944e7642223 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -17,8 +17,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index 1b514842e84..c9d9e6bf124 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/components/cooperative_groups.cuh"
-
-
 #include <memory>
 
 
@@ -15,7 +12,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu
index 6ef7d3ab3c4..37b032eb794 100644
--- a/cuda/test/components/merging.cu
+++ b/cuda/test/components/merging.cu
@@ -18,7 +18,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu
index 0eeb383c05c..ffe00c247c0 100644
--- a/cuda/test/components/searching.cu
+++ b/cuda/test/components/searching.cu
@@ -17,7 +17,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 8c68efae046..035134ac4e1 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -93,7 +93,7 @@ string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLO
 configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp)
 
 ginkgo_compile_features(ginkgo_dpcpp)
-target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0)
+target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp _ONEDPL_COMPILE_KERNEL=0)
 
 set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
 target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
@@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp)
 ginkgo_install_library(ginkgo_dpcpp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_dpcpp GKO_COMPILING_DPCPP)
+    ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt
index bb9c8a75050..38ecad08271 100644
--- a/dpcpp/test/base/CMakeLists.txt
+++ b/dpcpp/test/base/CMakeLists.txt
@@ -2,4 +2,4 @@ ginkgo_create_dpcpp_test(executor)
 ginkgo_create_dpcpp_test(dim3)
 ginkgo_create_dpcpp_test(kernel_launch)
 # set correct flags for kernel_launch.hpp
-target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP)
+target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 046fd1e4d7a..de44eb20682 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip
     PRIVATE
         ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp
         )
-target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP)
+target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
 target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
@@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip)
 ginkgo_install_library(ginkgo_hip)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP)
+    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
index f5a1dba3977..74e6c34dc5d 100644
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/base/batch_multi_vector_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,13 +13,14 @@
 #include <ginkgo/core/base/range_accessors.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
index fa44a22b554..4f09ec66bb8 100644
--- a/hip/base/batch_struct.hip.hpp
+++ b/hip/base/batch_struct.hip.hpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index fbad841fd0f..3f531616489 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -6,15 +6,13 @@
 #define GKO_HIP_BASE_CONFIG_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <hip/device_functions.h>
 
 
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/math.hip.hpp"
 
 
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index 58376c2175b..be897510056 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -5,14 +5,12 @@
 #include <ginkgo/core/base/device.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp
index 745ba955014..5a0b762ea57 100644
--- a/hip/base/device_matrix_data_kernels.hip.cpp
+++ b/hip/base/device_matrix_data_kernels.hip.cpp
@@ -14,8 +14,8 @@
 #include <thrust/tuple.h>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index aed5e803d60..3f569576c28 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -8,7 +8,7 @@
 #include <string>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hiprand/hiprand.h>
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 2694ce4177f..4b5ce7afa7b 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -8,15 +8,13 @@
 #include <iostream>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index f4dd3f1a1e8..725c7e20698 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -18,8 +18,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -260,6 +260,20 @@ inline void destroy_hipblas_handle(hipblasContext* handle)
 
 
 }  // namespace hipblas
+
+
+namespace blas {
+
+
+using namespace hipblas;
+
+
+#define BLAS_OP_N HIPBLAS_OP_N
+#define BLAS_OP_T HIPBLAS_OP_T
+#define BLAS_OP_C HIPBLAS_OP_C
+
+
+}  // namespace blas
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 471abb3ccd5..1dd772db250 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hiprand/hiprand.h>
 #else
@@ -17,8 +17,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -90,6 +90,18 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex<double>,
 
 
 }  // namespace hiprand
+
+
+namespace randlib {
+
+
+using namespace hiprand;
+
+
+#define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT
+
+
+}  // namespace randlib
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 62c7e60995e..997fc3d525f 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -18,7 +18,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -955,6 +955,20 @@ GKO_BIND_HIPSPARSE_IC0(std::complex<double>, hipsparseZcsric02);
 
 
 }  // namespace hipsparse
+
+
+namespace sparselib {
+
+
+using namespace hipsparse;
+
+
+#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
index eb9e8a31481..c69b0353f22 100644
--- a/hip/base/hipsparse_block_bindings.hip.hpp
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -17,8 +17,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
index 1a00e99cac7..890b9922a4c 100644
--- a/hip/base/kernel_launch.hip.hpp
+++ b/hip/base/kernel_launch.hip.hpp
@@ -8,12 +8,12 @@
 #endif
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/tuple.h>
 
 
-#include "accessor/hip_helper.hpp"
-#include "hip/base/types.hip.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
@@ -24,21 +24,21 @@ namespace hip {
 
 template <typename AccessorType>
 struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_hip_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
 template <typename AccessorType>
 struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(const gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_hip_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp
index 7c5d0c01c9c..c32fb592de0 100644
--- a/hip/base/kernel_launch_reduction.hip.hpp
+++ b/hip/base/kernel_launch_reduction.hip.hpp
@@ -8,9 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
index 18532c9754c..eda18f35eab 100644
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ b/hip/base/kernel_launch_solver.hip.hpp
@@ -8,7 +8,7 @@
 #endif
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 
 
 namespace gko {
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 0e14bf9f511..5fde8f518c6 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -5,12 +5,10 @@
 #include <ginkgo/core/base/memory.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index 2908164cccd..2c980b113a7 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -9,7 +9,7 @@
 #include <exception>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hipsparse/hipsparse.h>
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
index 0ed12a54786..46dad3be816 100644
--- a/hip/base/roctx.hip.cpp
+++ b/hip/base/roctx.hip.cpp
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <hip/hip_runtime.h>
+#include <ginkgo/config.hpp>
 
 
-#include <ginkgo/config.hpp>
+#include "common/cuda_hip/base/runtime.hpp"
 
 
 #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp
index ab6ed703da8..1fd7211b106 100644
--- a/hip/base/scoped_device_id.hip.cpp
+++ b/hip/base/scoped_device_id.hip.cpp
@@ -6,12 +6,10 @@
 #include <utility>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index 93c1fc008d9..b56c5104428 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -5,14 +5,12 @@
 #include <ginkgo/core/base/stream.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp
index 44fe5b7cbeb..bd81d9f3be5 100644
--- a/hip/base/timer.hip.cpp
+++ b/hip/base/timer.hip.cpp
@@ -5,12 +5,10 @@
 #include <ginkgo/core/base/timer.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index 8827b2bea41..883e5812080 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -14,7 +14,9 @@
 
 #include <hip/hip_complex.h>
 #include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -430,6 +432,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
+using gpuComplex = hipComplex;
+using gpuDoubleComplex = hipDoubleComplex;
+
+
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
index f57705ff408..0dc8d7a3b46 100644
--- a/hip/components/atomic.hip.hpp
+++ b/hip/components/atomic.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -21,38 +21,6 @@ namespace hip {
 #include "common/cuda_hip/components/atomic.hpp.inc"
 
 
-/**
- * @internal
- *
- * @note It is not 'real' complex<float> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<float> atomic_add(
-    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
-{
-    hipComplex* addr = reinterpret_cast<hipComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(static_cast<float*>(&(addr->x)), val.real());
-    auto imag = atomic_add(static_cast<float*>(&(addr->y)), val.imag());
-    return {real, imag};
-}
-
-
-/**
- * @internal
- *
- * @note It is not 'real' complex<double> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<double> atomic_add(
-    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
-{
-    hipDoubleComplex* addr = reinterpret_cast<hipDoubleComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(static_cast<double*>(&(addr->x)), val.real());
-    auto imag = atomic_add(static_cast<double*>(&(addr->y)), val.imag());
-    return {real, imag};
-}
-
-
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 247218a1457..e81441a092b 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
index 0261c7549c5..290511e7583 100644
--- a/hip/components/diagonal_block_manipulation.hip.hpp
+++ b/hip/components/diagonal_block_manipulation.hip.hpp
@@ -9,9 +9,9 @@
 #include <type_traits>
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index 59c0405a874..07daf486d84 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -6,14 +6,12 @@
 #define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp
index fd4fbb8ce11..4bb6fa19ec0 100644
--- a/hip/components/memory.hip.hpp
+++ b/hip/components/memory.hip.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
index b5065589d8e..5acde03cbec 100644
--- a/hip/components/prefix_sum.hip.hpp
+++ b/hip/components/prefix_sum.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
index c8fa5e58b4f..fb0539952ff 100644
--- a/hip/components/reduction.hip.hpp
+++ b/hip/components/reduction.hip.hpp
@@ -9,16 +9,15 @@
 #include <type_traits>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
@@ -57,7 +56,6 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
 
         block_results.resize_and_reset(grid_dim);
 
-
         reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
                            exec->get_stream()>>>(
             size, as_device_type(source),
diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp
index 2a6be767c2c..9222de9e1d6 100644
--- a/hip/components/searching.hip.hpp
+++ b/hip/components/searching.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 
 
diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp
index 7f98d08cf69..93ebb35833a 100644
--- a/hip/components/segment_scan.hip.hpp
+++ b/hip/components/segment_scan.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp
index 730c3c56401..4a664aee453 100644
--- a/hip/components/sorting.hip.hpp
+++ b/hip/components/sorting.hip.hpp
@@ -6,8 +6,8 @@
 #define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
index 9fe48944b56..7627a0a2781 100644
--- a/hip/components/syncfree.hip.hpp
+++ b/hip/components/syncfree.hip.hpp
@@ -9,11 +9,11 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp
index 03761983e02..6f0bd44ba9c 100644
--- a/hip/components/thread_ids.hip.hpp
+++ b/hip/components/thread_ids.hip.hpp
@@ -6,17 +6,12 @@
 #define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace hip {
-/**
- * @brief The HIP thread namespace.
- *
- * @ingroup hip_thread
- */
 namespace thread {
 
 
diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp
index 1dd94bb05d0..419db21b811 100644
--- a/hip/factorization/cholesky_kernels.hip.cpp
+++ b/hip/factorization/cholesky_kernels.hip.cpp
@@ -20,15 +20,15 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/lu_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/syncfree.hip.hpp"
@@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
     // sort postorder_cols inside rows
     {
-        const auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, mtx_nnz);
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation, mtx_nnz);
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
                                        row_ptrs, postorder_cols, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        hipsparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
                            postorder_cols, permutation, buffer);
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     }
     // count nonzeros per row of L
     {
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
index a2de4912fdb..4080768bc07 100644
--- a/hip/factorization/factorization_kernels.hip.cpp
+++ b/hip/factorization/factorization_kernels.hip.cpp
@@ -5,17 +5,16 @@
 #include "core/factorization/factorization_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/searching.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp
index 7a845547d0d..edda974fd36 100644
--- a/hip/factorization/ic_kernels.hip.cpp
+++ b/hip/factorization/ic_kernels.hip.cpp
@@ -5,13 +5,11 @@
 #include "core/factorization/ic_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -30,32 +28,32 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
              matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_hipsparse_handle();
-    auto desc = hipsparse::create_mat_descr();
-    auto info = hipsparse::create_ic0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ic0_info();
 
     // get buffer size for IC
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    hipsparse::ic0_buffer_size(handle, num_rows, nnz, desc,
+    sparselib::ic0_buffer_size(handle, num_rows, nnz, desc,
                                m->get_const_values(), m->get_const_row_ptrs(),
                                m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up IC(0)
-    hipsparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+    sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
                             m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                            info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL,
+                            info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
                             buffer.get_data());
 
-    hipsparse::ic0(handle, num_rows, nnz, desc, m->get_values(),
+    sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(),
                    m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                   HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+                   SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
-    hipsparse::destroy_ic0_info(info);
-    hipsparse::destroy(desc);
+    sparselib::destroy_ic0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp
index 071d3721536..f50df5ca75b 100644
--- a/hip/factorization/ilu_kernels.hip.cpp
+++ b/hip/factorization/ilu_kernels.hip.cpp
@@ -5,13 +5,11 @@
 #include "core/factorization/ilu_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -30,32 +28,32 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                 matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_hipsparse_handle();
-    auto desc = hipsparse::create_mat_descr();
-    auto info = hipsparse::create_ilu0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ilu0_info();
 
     // get buffer size for ILU
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    hipsparse::ilu0_buffer_size(handle, num_rows, nnz, desc,
+    sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc,
                                 m->get_const_values(), m->get_const_row_ptrs(),
                                 m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up ILU(0)
-    hipsparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+    sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
                              m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                             info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL,
+                             info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
                              buffer.get_data());
 
-    hipsparse::ilu0(handle, num_rows, nnz, desc, m->get_values(),
+    sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(),
                     m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                    HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
-    hipsparse::destroy_ilu0_info(info);
-    hipsparse::destroy(desc);
+    sparselib::destroy_ilu0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
index e1c60103dd3..ec3e771134e 100644
--- a/hip/factorization/lu_kernels.hip.cpp
+++ b/hip/factorization/lu_kernels.hip.cpp
@@ -17,11 +17,11 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/syncfree.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
index dd91ac27339..e4cd0b2470b 100644
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ b/hip/factorization/par_ic_kernels.hip.cpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
index 4b27383bff5..7f5dba82eba 100644
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ict_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -22,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
@@ -49,8 +47,7 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc"
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
 
 
 namespace {
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index b10941d44f1..fc05273bb09 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -5,16 +5,14 @@
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
similarity index 97%
rename from hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
rename to hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
index d730e33e418..b5612ea29c6 100644
--- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
@@ -8,9 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -18,16 +15,17 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/sorting.hip.hpp"
diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
similarity index 96%
rename from hip/factorization/par_ilut_filter_kernel.hip.cpp
rename to hip/factorization/par_ilut_filter_kernels.hip.cpp
index eef1044878e..e6d0a6348cc 100644
--- a/hip/factorization/par_ilut_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,15 +12,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index 85c2eaa7036..ddad307dc62 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -4,7 +4,7 @@
 
 // force-top: on
 // prevent compilation failure related to disappearing assert(...) statements
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 // force-top: off
 
 
diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
similarity index 99%
rename from hip/factorization/par_ilut_select_kernel.hip.cpp
rename to hip/factorization/par_ilut_select_kernels.hip.cpp
index b6d93e65b24..b259133b95d 100644
--- a/hip/factorization/par_ilut_select_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernels.hip.cpp
@@ -8,14 +8,12 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
similarity index 98%
rename from hip/factorization/par_ilut_spgeam_kernel.hip.cpp
rename to hip/factorization/par_ilut_spgeam_kernels.hip.cpp
index ad102e49488..df77b1ba7a2 100644
--- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,13 +12,14 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
similarity index 97%
rename from hip/factorization/par_ilut_sweep_kernel.hip.cpp
rename to hip/factorization/par_ilut_sweep_kernels.hip.cpp
index bdcecc609d5..0f1e6455812 100644
--- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -22,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
@@ -85,7 +83,6 @@ void compute_l_u_factors(syn::value_list<int, subwarp_size>,
     }
 }
 
-
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
                                     compute_l_u_factors);
 
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
index 432213f3083..de73576ffed 100644
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/matrix/batch_csr_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
@@ -14,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
index 0d03d4ea10b..5d3b9d8cef9 100644
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -5,19 +5,21 @@
 #include "core/matrix/batch_dense_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index 221a3ec65dd..d415f114c3b 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/matrix/batch_ell_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
@@ -14,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index 6c98146161e..16a267d95b6 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
index 5e32e1d8502..8f7a050ef87 100644
--- a/hip/matrix/coo_kernels.hip.cpp
+++ b/hip/matrix/coo_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/coo_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -15,25 +12,21 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/matrix/dense_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
 #include "hip/components/segment_scan.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-/**
- * @brief The HIP namespace.
- *
- * @ingroup hip
- */
 namespace hip {
 /**
  * @brief The Coordinate matrix format namespace.
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 599a2df3669..31debd60a3d 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -8,7 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -28,7 +27,13 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -39,14 +44,9 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
@@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 kernel::abstract_merge_path_spmv<items_per_thread>
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_hip_range(b_vals), acc::as_hip_range(c_vals),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
 
         } else if (alpha != nullptr && beta != nullptr) {
             if (grid_num > 0) {
@@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
                         as_device_type(alpha->get_const_values()),
-                        acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_hip_range(b_vals),
+                        acc::as_device_range(b_vals),
                         as_device_type(beta->get_const_values()),
-                        acc::as_hip_range(c_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
         } else {
             GKO_KERNEL_NOT_FOUND;
         }
@@ -262,21 +263,21 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_hip_range(a_vals),
+                    a->get_size()[0], acc::as_device_range(a_vals),
                     a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
         }
     } else if (alpha != nullptr && beta != nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
                     a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_hip_range(b_vals),
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
         }
     } else {
         GKO_KERNEL_NOT_FOUND;
@@ -318,20 +319,20 @@ void load_balance_spmv(std::shared_ptr<const HipExecutor> exec,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
                 kernel::abstract_spmv<<<csr_grid, csr_block, 0,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         }
     }
@@ -346,24 +347,24 @@ bool try_general_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
                                 const ValueType* beta,
                                 matrix::Dense<ValueType>* c)
 {
-    bool try_sparselib = hipsparse::is_supported<ValueType, IndexType>::value;
+    bool try_sparselib = sparselib::is_supported<ValueType, IndexType>::value;
     try_sparselib =
         try_sparselib && b->get_stride() == 1 && c->get_stride() == 1;
     // rocSPARSE has issues with zero matrices
     try_sparselib = try_sparselib && a->get_num_stored_elements() > 0;
     if (try_sparselib) {
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
 
         auto row_ptrs = a->get_const_row_ptrs();
         auto col_idxs = a->get_const_col_idxs();
 
-        hipsparse::spmv(exec->get_hipsparse_handle(),
-                        HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+        sparselib::spmv(exec->get_sparselib_handle(),
+                        SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
                         a->get_size()[1], a->get_num_stored_elements(), alpha,
                         descr, a->get_const_values(), row_ptrs, col_idxs,
                         b->get_const_values(), beta, c->get_values());
 
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     }
     return try_sparselib;
 }
@@ -397,8 +398,8 @@ bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
         return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
                                           beta->get_const_values(), c);
     } else {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto valpha = one<ValueType>();
         const auto vbeta = zero<ValueType>();
         return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
@@ -535,14 +536,14 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
             const matrix::Csr<ValueType, IndexType>* b,
             matrix::Csr<ValueType, IndexType>* c)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = hipsparse::create_mat_descr();
-        auto b_descr = hipsparse::create_mat_descr();
-        auto c_descr = hipsparse::create_mat_descr();
-        auto d_descr = hipsparse::create_mat_descr();
-        auto info = hipsparse::create_spgemm_info();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
 
         auto alpha = one<ValueType>();
         auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
@@ -566,7 +567,7 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::spgemm_buffer_size(
+        sparselib::spgemm_buffer_size(
             handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
             b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
             zero_nnz, null_index, null_index, info, buffer_size);
@@ -575,7 +576,7 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // count nnz
         IndexType c_nnz{};
-        hipsparse::spgemm_nnz(
+        sparselib::spgemm_nnz(
             handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
             b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
             null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
@@ -585,17 +586,17 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
         c_vals_array.resize_and_reset(c_nnz);
         auto c_col_idxs = c_col_idxs_array.get_data();
         auto c_vals = c_vals_array.get_data();
-        hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+        sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
                           a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
                           b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
                           null_value, null_index, null_index, c_descr, c_vals,
                           c_row_ptrs, c_col_idxs, info, buffer);
 
-        hipsparse::destroy_spgemm_info(info);
-        hipsparse::destroy(d_descr);
-        hipsparse::destroy(c_descr);
-        hipsparse::destroy(b_descr);
-        hipsparse::destroy(a_descr);
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -611,14 +612,14 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
                      const matrix::Csr<ValueType, IndexType>* d,
                      matrix::Csr<ValueType, IndexType>* c)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = hipsparse::create_mat_descr();
-        auto b_descr = hipsparse::create_mat_descr();
-        auto c_descr = hipsparse::create_mat_descr();
-        auto d_descr = hipsparse::create_mat_descr();
-        auto info = hipsparse::create_spgemm_info();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
 
         auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
         auto a_vals = a->get_const_values();
@@ -640,7 +641,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::spgemm_buffer_size(
+        sparselib::spgemm_buffer_size(
             handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
             b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
             IndexType{}, null_index, null_index, info, buffer_size);
@@ -651,7 +652,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
         array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
         auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
         IndexType c_nnz{};
-        hipsparse::spgemm_nnz(
+        sparselib::spgemm_nnz(
             handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
             b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
             null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
@@ -661,7 +662,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
         array<ValueType> c_tmp_vals_array(exec, c_nnz);
         auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
         auto c_tmp_vals = c_tmp_vals_array.get_data();
-        hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
+        sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
                           a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
                           b_row_ptrs, b_col_idxs, null_value, d_descr,
                           IndexType{}, null_value, null_index, null_index,
@@ -669,11 +670,11 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
                           info, buffer);
 
         // destroy hipsparse context
-        hipsparse::destroy_spgemm_info(info);
-        hipsparse::destroy(d_descr);
-        hipsparse::destroy(c_descr);
-        hipsparse::destroy(b_descr);
-        hipsparse::destroy(a_descr);
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
 
         auto total_nnz = c_nnz + d->get_num_stored_elements();
         auto nnz_per_row = total_nnz / m;
@@ -701,12 +702,12 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
     if (orig->get_size()[0] == 0) {
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
         hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
 
-        hipsparse::transpose(
-            exec->get_hipsparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -728,12 +729,12 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
     const auto block_size = default_block_size;
     const auto grid_size =
         ceildiv(trans->get_num_stored_elements(), block_size);
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
         hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
 
-        hipsparse::transpose(
-            exec->get_hipsparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -753,9 +754,9 @@ template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         auto m = IndexType(to_sort->get_size()[0]);
         auto n = IndexType(to_sort->get_size()[1]);
         auto nnz = IndexType(to_sort->get_num_stored_elements());
@@ -771,23 +772,23 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        hipsparse::create_identity_permutation(handle, nnz, permutation);
+        sparselib::create_identity_permutation(handle, nnz, permutation);
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
                                        buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
 
         // sort column indices
-        hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
                            permutation, buffer);
 
         // sort values
-        hipsparse::gather(handle, nnz, tmp_vals, vals, permutation);
+        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
 
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 36e581049e0..8fed3c97c1b 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/dense_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -20,12 +17,13 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
@@ -56,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Dense<ValueType>* result, array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
-                         x->get_stride(), y->get_const_values(),
-                         y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::dot(handle, x->get_size()[0], x->get_const_values(),
+                      x->get_stride(), y->get_const_values(), y->get_stride(),
+                      result->get_values());
         } else {
             compute_dot(exec, x, y, result, tmp);
         }
@@ -81,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                                array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                              x->get_stride(), y->get_const_values(),
-                              y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), y->get_const_values(),
+                           y->get_stride(), result->get_values());
         } else {
             compute_conj_dot(exec, x, y, result, tmp);
         }
@@ -105,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             array<char>& tmp)
 {
     if (x->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), result->get_values());
         } else {
             compute_norm2(exec, x, result, tmp);
         }
@@ -127,19 +125,18 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                hipblas::pointer_mode_guard pm_guard(handle);
+                blas::pointer_mode_guard pm_guard(handle);
                 auto alpha = one<ValueType>();
                 auto beta = zero<ValueType>();
-                hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
-                              c->get_size()[1], c->get_size()[0],
-                              a->get_size()[1], &alpha, b->get_const_values(),
-                              b->get_stride(), a->get_const_values(),
-                              a->get_stride(), &beta, c->get_values(),
-                              c->get_stride());
+                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                           c->get_size()[0], a->get_size()[1], &alpha,
+                           b->get_const_values(), b->get_stride(),
+                           a->get_const_values(), a->get_stride(), &beta,
+                           c->get_values(), c->get_stride());
             } else {
                 dense::fill(exec, c, zero<ValueType>());
             }
@@ -158,15 +155,15 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    if (hipblas::is_supported<ValueType>::value) {
+    if (blas::is_supported<ValueType>::value) {
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                hipblas::gemm(
-                    exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N,
-                    c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                    alpha->get_const_values(), b->get_const_values(),
-                    b->get_stride(), a->get_const_values(), a->get_stride(),
-                    beta->get_const_values(), c->get_values(), c->get_stride());
+                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                           alpha->get_const_values(), b->get_const_values(),
+                           b->get_stride(), a->get_const_values(),
+                           a->get_stride(), beta->get_const_values(),
+                           c->get_values(), c->get_stride());
             } else {
                 dense::scale(exec, beta, c);
             }
@@ -184,17 +181,17 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N,
-                          orig->get_size()[0], orig->get_size()[1], &alpha,
-                          orig->get_const_values(), orig->get_stride(), &beta,
-                          trans->get_const_values(), trans->get_stride(),
-                          trans->get_values(), trans->get_stride());
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -209,17 +206,17 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N,
-                          orig->get_size()[0], orig->get_size()[1], &alpha,
-                          orig->get_const_values(), orig->get_stride(), &beta,
-                          trans->get_values(), trans->get_stride(),
-                          trans->get_values(), trans->get_stride());
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp
index deedb9543ec..01033004c6b 100644
--- a/hip/matrix/diagonal_kernels.hip.cpp
+++ b/hip/matrix/diagonal_kernels.hip.cpp
@@ -5,16 +5,14 @@
 #include "core/matrix/diagonal_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 51c34430f5c..4f1ff6a3539 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -8,9 +8,6 @@
 #include <array>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -18,19 +15,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
@@ -133,20 +131,21 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(a_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
                     a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
+                    num_stored_elements_per_row, acc::as_device_range(b_vals),
                     as_device_type(c->get_values()), c->get_stride());
         }
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = acc::range<a_accessor>(
+            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
         if (grid_size.x > 0 && grid_size.y > 0) {
-            const auto alpha_val = acc::range<a_accessor>(
-                std::array<acc::size_type, 1>{1}, alpha->get_const_values());
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(alpha_val),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    stride, num_stored_elements_per_row,
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
                     as_device_type(c->get_values()), c->get_stride());
         }
@@ -215,7 +214,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the hip kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
@@ -249,7 +248,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the hip kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
index b84e7644e80..0286aff0bba 100644
--- a/hip/matrix/fbcsr_kernels.template.hip.cpp
+++ b/hip/matrix/fbcsr_kernels.template.hip.cpp
@@ -8,7 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -25,6 +24,13 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/block_sizes.hpp"
@@ -34,22 +40,17 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/hipsparse_block_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
+
 namespace gko {
 namespace kernels {
 namespace hip {
@@ -82,15 +83,15 @@ void dense_transpose(std::shared_ptr<const HipExecutor> exec,
     if (nrows == 0) {
         return;
     }
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols,
-                          &alpha, orig, orig_stride, &beta, trans, trans_stride,
-                          trans, trans_stride);
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
+                       trans_stride);
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -116,12 +117,12 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         dense::fill(exec, c, zero<ValueType>());
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto alpha = one<ValueType>();
         const auto beta = zero<ValueType>();
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -135,21 +136,21 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
                              nnzb, &alpha, descr, values, row_ptrs, col_idxs,
                              bs, b->get_const_values(), &beta, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              &alpha, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, &beta,
                              trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -173,11 +174,11 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         dense::scale(exec, beta, c);
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
         const auto alphp = alpha->get_const_values();
         const auto betap = beta->get_const_values();
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -191,7 +192,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
                              nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), betap, c->get_values());
         } else {
@@ -199,27 +200,83 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
-            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, betap,
                              trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
 }
 
 
+namespace {
+
+
+template <int mat_blk_sz, typename ValueType, typename IndexType>
+void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
+                           std::shared_ptr<const DefaultExecutor> exec,
+                           matrix::Fbcsr<ValueType, IndexType>* const mat)
+{
+    constexpr int subwarp_size = config::warp_size;
+    const auto nbnz = mat->get_num_stored_blocks();
+    const auto numthreads = nbnz * subwarp_size;
+    const auto block_size = default_block_size;
+    const auto grid_dim = ceildiv(numthreads, block_size);
+    if (grid_dim > 0) {
+        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
+            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                nbnz, mat->get_values());
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
+                                    transpose_blocks_impl);
+
+
+}  // namespace
+
+
 template <typename ValueType, typename IndexType>
 void transpose(const std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const input,
-               matrix::Fbcsr<ValueType, IndexType>* const output)
+               const matrix::Fbcsr<ValueType, IndexType>* const orig,
+               matrix::Fbcsr<ValueType, IndexType>* const trans)
 {
-    fallback_transpose(exec, input, output);
+#ifdef GKO_COMPILING_CUDA
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const int bs = orig->get_block_size();
+        const IndexType nnzb =
+            static_cast<IndexType>(orig->get_num_stored_blocks());
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
+            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
+            copyValues, idxBase, buffer);
+
+        // transpose the values within each stored block of trans in place
+        select_transpose_blocks(
+            fixedblock::compiled_kernels(),
+            [bs](int compiled_block_size) { return bs == compiled_block_size; },
+            syn::value_list<int>(), syn::type_list<>(), exec, trans);
+    } else
+#endif
+    {
+        fallback_transpose(exec, orig, trans);
+    }
 }
 
 
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index dc397b20892..92358d732c7 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -8,7 +8,7 @@
 #include <array>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipfft/hipfft.h>
 #else
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index 8028dd0777f..f1e15c946e0 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/sellp_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -15,10 +12,11 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index e5a6900cdfe..487b134d28a 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -5,25 +5,25 @@
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/sort.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -42,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
 constexpr int spmv_block_size = 256;
+#else
+constexpr int spmv_block_size = 128;
+#endif
 constexpr int warps_in_block = 4;
 
 
@@ -106,16 +110,16 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                 a->get_size()[0], as_device_type(a->get_const_value()),
                 a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subwarp_size>
             <<<grid, block, 0, exec->get_stream()>>>(
                 a->get_size()[0], as_device_type(alpha->get_const_values()),
                 as_device_type(a->get_const_value()), a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals),
+                acc::as_device_range(b_vals),
                 as_device_type(beta->get_const_values()),
-                acc::as_hip_range(c_vals));
+                acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -169,21 +173,21 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation,
                                    to_sort->get_num_nonzeros());
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
+        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
                                        row_ptrs, col_idxs, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
                            col_idxs, permutation, buffer);
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp
index ed81d1c66dc..18c1f0957c4 100644
--- a/hip/multigrid/pgm_kernels.hip.cpp
+++ b/hip/multigrid/pgm_kernels.hip.cpp
@@ -19,8 +19,8 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index 6d58244a41a..f3969c16b81 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -6,9 +6,9 @@
 #define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
 
 
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 
 
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
index 7339bd0a754..4eaf65cc438 100644
--- a/hip/preconditioner/isai_kernels.hip.cpp
+++ b/hip/preconditioner/isai_kernels.hip.cpp
@@ -5,21 +5,18 @@
 #include "core/preconditioner/isai_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
index 326b9f6b720..67a65385ca4 100644
--- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
@@ -5,20 +5,18 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -35,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in
index 6e9c279a46f..2185e124db6 100644
--- a/hip/preconditioner/jacobi_common.hip.hpp.in
+++ b/hip/preconditioner/jacobi_common.hip.hpp.in
@@ -6,7 +6,7 @@
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
index 86a3b799590..d95a97d7068 100644
--- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
@@ -9,14 +9,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/diagonal_block_manipulation.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
index 713be193250..50bf72ea964 100644
--- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
@@ -5,21 +5,19 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/diagonal_block_manipulation.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -38,7 +36,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index 1646a7fb376..a3b2b7e5412 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -5,19 +5,17 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
@@ -34,9 +32,9 @@ namespace jacobi {
 
 
 // a total of 32/16 warps (1024 threads)
-#if GINKGO_HIP_PLATFORM_HCC
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
 constexpr int default_num_warps = 16;
-#else  // GINKGO_HIP_PLATFORM_NVCC
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
 constexpr int default_num_warps = 32;
 #endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
index be485af5730..b3e6e6fe73b 100644
--- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -32,7 +32,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
index 0763e986d41..e8e247210ec 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
@@ -5,20 +5,18 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -35,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp
index 0c83c728e79..9a5739064eb 100644
--- a/hip/reorder/rcm_kernels.hip.cpp
+++ b/hip/reorder/rcm_kernels.hip.cpp
@@ -25,9 +25,9 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index c62c11405a5..fdeb0580931 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index d61eead6fab..47c2bc498eb 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
index 794ac9fd8a6..2f2df4ddf84 100644
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ b/hip/solver/cb_gmres_kernels.hip.cpp
@@ -14,19 +14,19 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -118,7 +118,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     restart_1_kernel<block_size>
         <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
             residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(residual_norm_collection->get_values()),
             residual_norm_collection->get_stride());
     kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm,
@@ -147,7 +147,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 residual_norm->get_stride(),
                 as_device_type(arnoldi_norm->get_const_values() +
                                2 * stride_arnoldi),
-                stride_arnoldi, acc::as_hip_range(krylov_bases));
+                stride_arnoldi, acc::as_device_range(krylov_bases));
     }
 
     const auto grid_dim_2 =
@@ -160,7 +160,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
             residual->get_stride(),
             as_device_type(residual_norm->get_const_values()),
             as_device_type(residual_norm_collection->get_values()),
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(next_krylov_basis->get_values()),
             next_krylov_basis->get_stride(),
             as_device_type(final_iter_nums->get_data()));
@@ -214,7 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(next_krylov_basis->get_const_values()),
         stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
         as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis
+    // nrmP = norm(next_krylov_basis)
     zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
                 hessenberg_iter->get_values());
     if (dim_size[1] > 1) {
@@ -222,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
             <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                 dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     } else {
@@ -231,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[0],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     }
@@ -243,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter + 1, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
 
@@ -272,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            exec->get_stream()>>>(
             dim_size[1], as_device_type(arnoldi_norm->get_values()),
             stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
             as_device_type(stop_status), as_device_type(reorth_status),
             as_device_type(num_reorth->get_data()));
     num_reorth_host = get_element(*num_reorth, 0);
@@ -285,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                 <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                     dim_size[0], dim_size[1],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         } else {
@@ -294,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                    exec->get_stream()>>>(
                     dim_size[0],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         }
@@ -306,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                default_block_size, 0, exec->get_stream()>>>(
                 iter + 1, dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg,
                 as_device_type(buffer_iter->get_const_values()), stride_buffer,
@@ -338,7 +338,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[1], as_device_type(arnoldi_norm->get_values()),
                 stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
                 as_device_type(stop_status), as_device_type(reorth_status),
                 num_reorth->get_data());
         num_reorth_host = get_element(*num_reorth, 0);
@@ -350,7 +350,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
     // next_krylov_basis /= hessenberg(iter, iter + 1)
@@ -464,7 +464,7 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
 
     calculate_Qy_kernel<block_size>
         <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_hip_range(krylov_bases),
+            num_rows, num_cols, acc::as_device_range(krylov_bases),
             as_device_type(y->get_const_values()), y->get_stride(),
             as_device_type(before_preconditioner->get_values()),
             stride_before_preconditioner,
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index d05bc1a9f6f..daab3a387e6 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -10,7 +10,7 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -22,12 +22,12 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -63,7 +63,7 @@ struct SolveStruct : gko::solver::SolveStruct {
             factor_descr, unit_diag ? HIPSPARSE_DIAG_TYPE_UNIT
                                     : HIPSPARSE_DIAG_TYPE_NON_UNIT));
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info));
-        policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
+        policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL;
     }
 
     SolveStruct(const SolveStruct&) = delete;
@@ -114,17 +114,17 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
     if (matrix->get_size()[0] == 0) {
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         solve_struct =
             std::make_shared<solver::hip::SolveStruct>(is_upper, unit_diag);
         if (auto hip_solve_struct =
                 std::dynamic_pointer_cast<solver::hip::SolveStruct>(
                     solve_struct)) {
-            auto handle = exec->get_hipsparse_handle();
+            auto handle = exec->get_sparselib_handle();
 
             {
-                hipsparse::pointer_mode_guard pm_guard(handle);
-                hipsparse::csrsv2_buffer_size(
+                sparselib::pointer_mode_guard pm_guard(handle);
+                sparselib::csrsv2_buffer_size(
                     handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
@@ -139,7 +139,7 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
                 hip_solve_struct->factor_work_vec =
                     exec->alloc<void*>(hip_solve_struct->factor_work_size);
 
-                hipsparse::csrsv2_analysis(
+                sparselib::csrsv2_analysis(
                     handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
@@ -170,16 +170,16 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
     }
     using vec = matrix::Dense<ValueType>;
 
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         if (auto hip_solve_struct =
                 dynamic_cast<const solver::hip::SolveStruct*>(solve_struct)) {
             ValueType one = 1.0;
-            auto handle = exec->get_hipsparse_handle();
+            auto handle = exec->get_sparselib_handle();
 
             {
-                hipsparse::pointer_mode_guard pm_guard(handle);
+                sparselib::pointer_mode_guard pm_guard(handle);
                 if (b->get_stride() == 1) {
-                    hipsparse::csrsv2_solve(
+                    sparselib::csrsv2_solve(
                         handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                         matrix->get_size()[0],
                         matrix->get_num_stored_elements(), &one,
@@ -194,7 +194,7 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                     dense::transpose(exec, b, trans_b);
                     dense::transpose(exec, x, trans_x);
                     for (IndexType i = 0; i < trans_b->get_size()[0]; i++) {
-                        hipsparse::csrsv2_solve(
+                        sparselib::csrsv2_solve(
                             handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
                             matrix->get_size()[0],
                             matrix->get_num_stored_elements(), &one,
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index 83dbfe61f48..b1ef414c091 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -9,20 +9,19 @@
 #include <random>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/randlib_bindings.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hiprand_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
@@ -71,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
                                  bool deterministic)
 {
     if (!deterministic) {
-        auto gen = hiprand::rand_generator(std::random_device{}(),
-                                           HIPRAND_RNG_PSEUDO_DEFAULT,
+        auto gen = randlib::rand_generator(std::random_device{}(),
+                                           RANDLIB_RNG_PSEUDO_DEFAULT,
                                            exec->get_stream());
-        hiprand::rand_vector(
+        randlib::rand_vector(
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
-        hiprand::destroy(gen);
+        randlib::destroy(gen);
     }
 }
 
@@ -147,9 +146,8 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
                 as_device_type(alpha->get_values()),
                 stop_status->get_const_data());
         } else {
-            hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
-                         g_k->get_values(), g_k->get_stride(),
-                         alpha->get_values());
+            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
+                      g_k->get_stride(), alpha->get_values());
         }
         update_g_k_and_u_kernel<default_block_size>
             <<<ceildiv(size * g_k->get_stride(), default_block_size),
@@ -198,8 +196,8 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
                 as_device_type(g_k->get_const_values()), g_k->get_stride(),
                 as_device_type(m_i), stop_status->get_const_data());
         } else {
-            hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
-                         g_k->get_const_values(), g_k->get_stride(), m_i);
+            blas::dot(exec->get_blas_handle(), size, p_i, 1,
+                      g_k->get_const_values(), g_k->get_stride(), m_i);
         }
     }
 }
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 08f35d3d674..1a43b3c0151 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -8,7 +8,7 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,9 +21,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
index 41aab8003bd..f68105ba6d8 100644
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ b/hip/solver/multigrid_kernels.hip.cpp
@@ -5,18 +5,16 @@
 #include "core/solver/multigrid_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index cd6b0719844..bcb63a26bc8 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -8,7 +8,7 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,9 +21,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
index 8c7caeb32b8..3d24daa5bd5 100644
--- a/hip/stop/criterion_kernels.hip.cpp
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -10,8 +10,8 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index d790dd652f0..7f2b0646ea2 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -5,17 +5,15 @@
 #include "core/stop/residual_norm_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 2c25f5b3a7a..8462cbe5716 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -23,8 +23,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index d22dfeca0b6..53f4b9a72a0 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -8,9 +8,6 @@
 // force-top: off
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
-
-
 #include <cstring>
 #include <memory>
 
@@ -22,7 +19,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index 7bfab76f795..b8ee2f03d29 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -24,7 +24,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 1db0c6e9562..2662d367f4d 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -23,7 +23,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index c1e3f54a720..ca20dba9007 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1602,6 +1602,11 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      */
     cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
 
+    /**
+     * @copydoc get_cublas_handle()
+     */
+    cublasContext* get_blas_handle() const { return get_cublas_handle(); }
+
     /**
      * Get the cusparse handle for this executor
      *
@@ -1612,6 +1617,14 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
         return cusparse_handle_.get();
     }
 
+    /**
+     * @copydoc get_cusparse_handle()
+     */
+    cusparseContext* get_sparselib_handle() const
+    {
+        return get_cusparse_handle();
+    }
+
     /**
      * Get the closest PUs
      *
@@ -1807,6 +1820,11 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      */
     hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
 
+    /**
+     * @copydoc get_hipblas_handle()
+     */
+    hipblasContext* get_blas_handle() const { return get_hipblas_handle(); }
+
     /**
      * Get the hipsparse handle for this executor
      *
@@ -1817,6 +1835,14 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
         return hipsparse_handle_.get();
     }
 
+    /**
+     * @copydoc get_hipsparse_handle()
+     */
+    hipsparseContext* get_sparselib_handle() const
+    {
+        return get_hipsparse_handle();
+    }
+
     /**
      * Get the closest NUMA node
      *
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 59d49e44140..333bb2a9b21 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -54,7 +54,7 @@ target_sources(ginkgo_omp
     )
 
 ginkgo_compile_features(ginkgo_omp)
-target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP)
+target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_GKO_DEVICE_NAMESPACE=omp)
 
 # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom
 # precision implementation (mantissa segmentation)
@@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp)
 ginkgo_install_library(ginkgo_omp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_omp GKO_COMPILING_OMP)
+    ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_GKO_DEVICE_NAMESPACE=omp")
 endif()
 
 if(GINKGO_BUILD_TESTS)

From 48146482c8a2dddaaf2439c8a3d566e7ec7b85eb Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 22 May 2024 14:04:35 +0200
Subject: [PATCH 008/448] fix compilation

---
 benchmark/CMakeLists.txt                      |  2 +
 common/cuda_hip/base/blas_bindings.hpp        |  8 ++--
 common/cuda_hip/base/config.hpp               |  8 ++--
 common/cuda_hip/base/pointer_mode_guard.hpp   |  8 ++--
 common/cuda_hip/base/randlib_bindings.hpp     |  8 ++--
 common/cuda_hip/base/runtime.hpp              |  6 ++-
 common/cuda_hip/base/sparselib_bindings.hpp   |  8 ++--
 common/cuda_hip/base/thrust.hpp               |  4 +-
 common/cuda_hip/base/types.hpp                | 11 ++++-
 .../components/cooperative_groups.hpp         |  8 ++--
 .../cuda_hip/components/format_conversion.hpp |  8 ++--
 common/cuda_hip/components/memory.hpp         |  8 ++--
 cuda/CMakeLists.txt                           | 34 ++++++-------
 ...cobi_advanced_apply_kernels.instantiate.cu |  2 +-
 .../jacobi_generate_kernels.instantiate.cu    |  2 +-
 ...jacobi_simple_apply_kernels.instantiate.cu |  2 +-
 hip/CMakeLists.txt                            | 48 +++++++++----------
 ... => jacobi_advanced_apply_kernels.hip.cpp} |  0
 ...dvanced_apply_kernels.instantiate.hip.cpp} |  2 +-
 ...ip.cpp => jacobi_generate_kernels.hip.cpp} |  2 +-
 ...cobi_generate_kernels.instantiate.hip.cpp} |  2 +-
 ...pp => jacobi_simple_apply_kernels.hip.cpp} |  2 +-
 ..._simple_apply_kernels.instantiate.hip.cpp} |  2 +-
 23 files changed, 109 insertions(+), 76 deletions(-)
 rename hip/preconditioner/{jacobi_advanced_apply_kernel.hip.cpp => jacobi_advanced_apply_kernels.hip.cpp} (100%)
 rename hip/preconditioner/{jacobi_advanced_apply_instantiate.inc.hip.cpp => jacobi_advanced_apply_kernels.instantiate.hip.cpp} (97%)
 rename hip/preconditioner/{jacobi_generate_kernel.hip.cpp => jacobi_generate_kernels.hip.cpp} (97%)
 rename hip/preconditioner/{jacobi_generate_instantiate.inc.hip.cpp => jacobi_generate_kernels.instantiate.hip.cpp} (98%)
 rename hip/preconditioner/{jacobi_simple_apply_kernel.hip.cpp => jacobi_simple_apply_kernels.hip.cpp} (97%)
 rename hip/preconditioner/{jacobi_simple_apply_instantiate.inc.hip.cpp => jacobi_simple_apply_kernels.instantiate.hip.cpp} (97%)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index ca209e65057..306655d2315 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -20,6 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     endif()
     # make the dependency public to catch issues
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
+    target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
     target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
 endfunction()
 
@@ -27,6 +28,7 @@ function(ginkgo_benchmark_hipsparse_linops type def)
     add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
     set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
     target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
+    target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
     target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
 endfunction()
diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp
index 1708fb88ce1..784d67515de 100644
--- a/common/cuda_hip/base/blas_bindings.hpp
+++ b/common/cuda_hip/base/blas_bindings.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/base/hipblas_bindings.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/base/cublas_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hipblas_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp
index d2085ae946b..2bc4b78cfd9 100644
--- a/common/cuda_hip/base/config.hpp
+++ b/common/cuda_hip/base/config.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/base/config.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/base/config.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/config.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp
index 41ff6242e49..ddc51557ac4 100644
--- a/common/cuda_hip/base/pointer_mode_guard.hpp
+++ b/common/cuda_hip/base/pointer_mode_guard.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/base/pointer_mode_guard.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp
index 249489b0e68..d7d023d2b70 100644
--- a/common/cuda_hip/base/randlib_bindings.hpp
+++ b/common/cuda_hip/base/randlib_bindings.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/base/hiprand_bindings.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/base/curand_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hiprand_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp
index ccddfdd2661..2020c744b71 100644
--- a/common/cuda_hip/base/runtime.hpp
+++ b/common/cuda_hip/base/runtime.hpp
@@ -6,8 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
+#ifdef GKO_COMPILING_CUDA
+// nothing needed here
+#elif defined(GKO_COMPILING_HIP)
 #include <hip/hip_runtime.h>
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp
index bc565f9190a..eeb7cef0734 100644
--- a/common/cuda_hip/base/sparselib_bindings.hpp
+++ b/common/cuda_hip/base/sparselib_bindings.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/base/cusparse_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp
index f2015d6d544..adc904d550c 100644
--- a/common/cuda_hip/base/thrust.hpp
+++ b/common/cuda_hip/base/thrust.hpp
@@ -31,7 +31,7 @@ inline auto thrust_policy(std::shared_ptr<const CudaExecutor> exec)
 {
     return thrust::cuda::par.on(exec->get_stream());
 }
-#else
+#elif defined(GKO_COMPILING_HIP)
 inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
 {
 #if GINKGO_HIP_PLATFORM_HCC
@@ -40,6 +40,8 @@ inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
     return thrust::cuda::par.on(exec->get_stream());
 #endif
 }
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index 213664d3a4d..ff6302a68fa 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -2,8 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
+
+
 #ifdef GKO_COMPILING_CUDA
 #include "cuda/base/types.hpp"
-#else
+#elif defined(GKO_COMPILING_HIP)
 #include "hip/base/types.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp
index b1f17842302..64c9be4fa8e 100644
--- a/common/cuda_hip/components/cooperative_groups.hpp
+++ b/common/cuda_hip/components/cooperative_groups.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/components/cooperative_groups.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/components/cooperative_groups.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/cooperative_groups.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp
index a16d09b2e3a..af6461ccd5e 100644
--- a/common/cuda_hip/components/format_conversion.hpp
+++ b/common/cuda_hip/components/format_conversion.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/components/format_conversion.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/components/format_conversion.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/format_conversion.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp
index 974431e2fb8..e7b1de548c6 100644
--- a/common/cuda_hip/components/memory.hpp
+++ b/common/cuda_hip/components/memory.hpp
@@ -6,10 +6,12 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
 
 
-#ifdef GKO_COMPILING_HIP
-#include "hip/components/memory.hip.hpp"
-#else  // GKO_COMPILING_CUDA
+#ifdef GKO_COMPILING_CUDA
 #include "cuda/components/memory.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/memory.hip.hpp"
+#else
+#error "Executor definition missing"
 #endif
 
 
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 88ae83e9005..3d251ecfa82 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -33,12 +33,12 @@ target_sources(ginkgo_cuda
     factorization/par_ic_kernels.cu
     factorization/par_ict_kernels.cu
     factorization/par_ilu_kernels.cu
-    factorization/par_ilut_approx_filter_kernel.cu
-    factorization/par_ilut_filter_kernel.cu
+    factorization/par_ilut_approx_filter_kernels.cu
+    factorization/par_ilut_filter_kernels.cu
     factorization/par_ilut_select_common.cu
-    factorization/par_ilut_select_kernel.cu
-    factorization/par_ilut_spgeam_kernel.cu
-    factorization/par_ilut_sweep_kernel.cu
+    factorization/par_ilut_select_kernels.cu
+    factorization/par_ilut_spgeam_kernels.cu
+    factorization/par_ilut_sweep_kernels.cu
     matrix/batch_csr_kernels.cu
     matrix/batch_dense_kernels.cu
     matrix/batch_ell_kernels.cu
@@ -54,10 +54,10 @@ target_sources(ginkgo_cuda
     multigrid/pgm_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     preconditioner/isai_kernels.cu
-    preconditioner/jacobi_advanced_apply_kernel.cu
-    preconditioner/jacobi_generate_kernel.cu
+    preconditioner/jacobi_advanced_apply_kernels.cu
+    preconditioner/jacobi_generate_kernels.cu
     preconditioner/jacobi_kernels.cu
-    preconditioner/jacobi_simple_apply_kernel.cu
+    preconditioner/jacobi_simple_apply_kernels.cu
     reorder/rcm_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
@@ -85,18 +85,18 @@ endif()
 set(GKO_CUDA_JACOBI_SOURCES)
 foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES)
     configure_file(
-        preconditioner/jacobi_generate_instantiate.inc.cu
-        preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_generate_kernels.instantiate.cu
+        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     configure_file(
-        preconditioner/jacobi_simple_apply_instantiate.inc.cu
-        preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     configure_file(
-        preconditioner/jacobi_advanced_apply_instantiate.inc.cu
-        preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     list(APPEND GKO_CUDA_JACOBI_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
 endforeach()
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
 string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index ed33437c613..10ede90da7e 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index 56e8ff6f16f..129c50625f4 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -35,7 +35,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 97a7bfff489..15f6dc138ad 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index de44eb20682..99e167b9798 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
-add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
-add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
+add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_kernels.instantiate)
+add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_kernels.instantiate)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
@@ -31,30 +31,30 @@ set(GINKGO_HIP_SOURCES
     factorization/par_ic_kernels.hip.cpp
     factorization/par_ict_kernels.hip.cpp
     factorization/par_ilu_kernels.hip.cpp
-    factorization/par_ilut_approx_filter_kernel.hip.cpp
-    factorization/par_ilut_filter_kernel.hip.cpp
+    factorization/par_ilut_approx_filter_kernels.hip.cpp
+    factorization/par_ilut_filter_kernels.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
-    factorization/par_ilut_select_kernel.hip.cpp
-    factorization/par_ilut_spgeam_kernel.hip.cpp
-    factorization/par_ilut_sweep_kernel.hip.cpp
+    factorization/par_ilut_select_kernels.hip.cpp
+    factorization/par_ilut_spgeam_kernels.hip.cpp
+    factorization/par_ilut_sweep_kernels.hip.cpp
     matrix/batch_csr_kernels.hip.cpp
     matrix/batch_dense_kernels.hip.cpp
     matrix/batch_ell_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
-    ${CSR_INSTANTIATE}
+    ${CSR_kernels.instantiate}
     matrix/dense_kernels.hip.cpp
     matrix/diagonal_kernels.hip.cpp
     matrix/ell_kernels.hip.cpp
-    ${FBCSR_INSTANTIATE}
+    ${FBCSR_kernels.instantiate}
     matrix/sellp_kernels.hip.cpp
     matrix/sparsity_csr_kernels.hip.cpp
     multigrid/pgm_kernels.hip.cpp
     preconditioner/batch_jacobi_kernels.hip.cpp
     preconditioner/isai_kernels.hip.cpp
-    preconditioner/jacobi_advanced_apply_kernel.hip.cpp
-    preconditioner/jacobi_generate_kernel.hip.cpp
+    preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+    preconditioner/jacobi_generate_kernels.hip.cpp
     preconditioner/jacobi_kernels.hip.cpp
-    preconditioner/jacobi_simple_apply_kernel.hip.cpp
+    preconditioner/jacobi_simple_apply_kernels.hip.cpp
     reorder/rcm_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
@@ -86,28 +86,28 @@ else()
 endif()
 foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
     configure_file(
-        preconditioner/jacobi_generate_instantiate.inc.hip.cpp
-        preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     configure_file(
-        preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
-        preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     configure_file(
-        preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
-        preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds
     # Probably the same as https://github.com/llvm/llvm-project/issues/67574
     # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f
     # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789
     set_source_files_properties(
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
         PROPERTIES
         COMPILE_OPTIONS $<$<CONFIG:Debug>:-O2>)
     list(APPEND GINKGO_HIP_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
 endforeach()
 string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
 configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp)
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
similarity index 100%
rename from hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp
rename to hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
similarity index 97%
rename from hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index 67a65385ca4..358c6f3b337 100644
--- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -33,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
similarity index 97%
rename from hip/preconditioner/jacobi_generate_kernel.hip.cpp
rename to hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 50bf72ea964..6365f6c132e 100644
--- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -36,7 +36,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
similarity index 98%
rename from hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index d95a97d7068..4634f8a0c57 100644
--- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
similarity index 97%
rename from hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
rename to hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index e8e247210ec..37b78f17469 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -33,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
similarity index 97%
rename from hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index b3e6e6fe73b..421a32c3efc 100644
--- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -32,7 +32,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off

From 334bca2d16f3bfaafa2f45ab7af9d1296aa93ef2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 24 Jun 2024 14:21:56 +0200
Subject: [PATCH 009/448] review updates

- fix HIP compilation issues
- uniform ifdef checks
- deviceComplex type aliases
- remove unnecessary includes

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 common/cuda_hip/base/blas_bindings.hpp            | 2 +-
 common/cuda_hip/base/config.hpp                   | 2 +-
 common/cuda_hip/base/pointer_mode_guard.hpp       | 2 +-
 common/cuda_hip/base/randlib_bindings.hpp         | 2 +-
 common/cuda_hip/base/runtime.hpp                  | 2 +-
 common/cuda_hip/base/sparselib_bindings.hpp       | 2 +-
 common/cuda_hip/base/thrust.hpp                   | 2 +-
 common/cuda_hip/base/types.hpp                    | 2 +-
 common/cuda_hip/components/cooperative_groups.hpp | 2 +-
 common/cuda_hip/components/format_conversion.hpp  | 2 +-
 common/cuda_hip/components/memory.hpp             | 2 +-
 cuda/base/types.hpp                               | 4 ++--
 cuda/distributed/vector_kernels.cu                | 3 ---
 cuda/factorization/par_ilu_kernels.cu             | 1 -
 hip/base/config.hip.hpp                           | 3 ---
 hip/base/types.hip.hpp                            | 4 ++--
 hip/distributed/vector_kernels.hip.cpp            | 3 ---
 hip/factorization/par_ilu_kernels.hip.cpp         | 1 -
 18 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp
index 784d67515de..e59bbf0d7a0 100644
--- a/common/cuda_hip/base/blas_bindings.hpp
+++ b/common/cuda_hip/base/blas_bindings.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/cublas_bindings.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/hipblas_bindings.hip.hpp"
diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp
index 2bc4b78cfd9..00825fe8b72 100644
--- a/common/cuda_hip/base/config.hpp
+++ b/common/cuda_hip/base/config.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/config.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/config.hip.hpp"
diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp
index ddc51557ac4..40bf694ef73 100644
--- a/common/cuda_hip/base/pointer_mode_guard.hpp
+++ b/common/cuda_hip/base/pointer_mode_guard.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/pointer_mode_guard.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/pointer_mode_guard.hip.hpp"
diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp
index d7d023d2b70..7797ad38c64 100644
--- a/common/cuda_hip/base/randlib_bindings.hpp
+++ b/common/cuda_hip/base/randlib_bindings.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/curand_bindings.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/hiprand_bindings.hip.hpp"
diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp
index 2020c744b71..6a7a7a3c4a2 100644
--- a/common/cuda_hip/base/runtime.hpp
+++ b/common/cuda_hip/base/runtime.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 // nothing needed here
 #elif defined(GKO_COMPILING_HIP)
 #include <hip/hip_runtime.h>
diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp
index eeb7cef0734..26c0bda236d 100644
--- a/common/cuda_hip/base/sparselib_bindings.hpp
+++ b/common/cuda_hip/base/sparselib_bindings.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/cusparse_bindings.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/hipsparse_bindings.hip.hpp"
diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp
index adc904d550c..02aaebc9f3d 100644
--- a/common/cuda_hip/base/thrust.hpp
+++ b/common/cuda_hip/base/thrust.hpp
@@ -26,7 +26,7 @@ namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 inline auto thrust_policy(std::shared_ptr<const CudaExecutor> exec)
 {
     return thrust::cuda::par.on(exec->get_stream());
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index ff6302a68fa..08f0516d691 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/types.hpp"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/types.hip.hpp"
diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp
index 64c9be4fa8e..a57440f6d30 100644
--- a/common/cuda_hip/components/cooperative_groups.hpp
+++ b/common/cuda_hip/components/cooperative_groups.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/components/cooperative_groups.cuh"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/components/cooperative_groups.hip.hpp"
diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp
index af6461ccd5e..9faf7a58c25 100644
--- a/common/cuda_hip/components/format_conversion.hpp
+++ b/common/cuda_hip/components/format_conversion.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/components/format_conversion.cuh"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/components/format_conversion.hip.hpp"
diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp
index e7b1de548c6..9bfd9cba1e0 100644
--- a/common/cuda_hip/components/memory.hpp
+++ b/common/cuda_hip/components/memory.hpp
@@ -6,7 +6,7 @@
 #define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
 
 
-#ifdef GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 #include "cuda/components/memory.cuh"
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/components/memory.hip.hpp"
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 510d7cef889..561612f2869 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -394,8 +394,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
-using gpuComplex = cuComplex;
-using gpuDoubleComplex = cuDoubleComplex;
+using deviceComplex = cuComplex;
+using deviceDoubleComplex = cuDoubleComplex;
 
 
 }  // namespace cuda
diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu
index 7b06ada9f0e..ca9c419239b 100644
--- a/cuda/distributed/vector_kernels.cu
+++ b/cuda/distributed/vector_kernels.cu
@@ -5,9 +5,6 @@
 #include "core/distributed/vector_kernels.hpp"
 
 
-#include <functional>
-
-
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index cd48dd2a9db..755723e7d4c 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -5,7 +5,6 @@
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index 3f531616489..89dc67255fc 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -6,9 +6,6 @@
 #define GKO_HIP_BASE_CONFIG_HIP_HPP_
 
 
-#include <hip/device_functions.h>
-
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index 883e5812080..cb33cbf5df8 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -432,8 +432,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
-using gpuComplex = hipComplex;
-using gpuDoubleComplex = hipDoubleComplex;
+using deviceComplex = hipComplex;
+using deviceDoubleComplex = hipDoubleComplex;
 
 
 }  // namespace hip
diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp
index 320d847ed85..fc6718dec0d 100644
--- a/hip/distributed/vector_kernels.hip.cpp
+++ b/hip/distributed/vector_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/distributed/vector_kernels.hpp"
 
 
-#include <functional>
-
-
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index fc05273bb09..49608d6801f 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 

From 8162796cf0dc74461aaa9078aa242b9b3eefe488 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 24 Jun 2024 15:49:44 +0200
Subject: [PATCH 010/448] fix replacement errors

---
 dpcpp/CMakeLists.txt |  2 +-
 hip/CMakeLists.txt   | 10 +++++-----
 omp/CMakeLists.txt   |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 035134ac4e1..ee373243842 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp)
 ginkgo_install_library(ginkgo_dpcpp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_GKO_DEVICE_NAMESPACE=dpcpp")
+    ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 99e167b9798..bf2d6a6cf58 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
-add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_kernels.instantiate)
-add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_kernels.instantiate)
+add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
+add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
@@ -41,11 +41,11 @@ set(GINKGO_HIP_SOURCES
     matrix/batch_dense_kernels.hip.cpp
     matrix/batch_ell_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
-    ${CSR_kernels.instantiate}
+    ${CSR_INSTANTIATE}
     matrix/dense_kernels.hip.cpp
     matrix/diagonal_kernels.hip.cpp
     matrix/ell_kernels.hip.cpp
-    ${FBCSR_kernels.instantiate}
+    ${FBCSR_INSTANTIATE}
     matrix/sellp_kernels.hip.cpp
     matrix/sparsity_csr_kernels.hip.cpp
     multigrid/pgm_kernels.hip.cpp
@@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip)
 ginkgo_install_library(ginkgo_hip)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_GKO_DEVICE_NAMESPACE=hip")
+    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 333bb2a9b21..41bec80673f 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -54,7 +54,7 @@ target_sources(ginkgo_omp
     )
 
 ginkgo_compile_features(ginkgo_omp)
-target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_GKO_DEVICE_NAMESPACE=omp)
+target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp)
 
 # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom
 # precision implementation (mantissa segmentation)
@@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp)
 ginkgo_install_library(ginkgo_omp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_GKO_DEVICE_NAMESPACE=omp")
+    ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 
 if(GINKGO_BUILD_TESTS)

From 8be5bcd81506f800306bc885eb41cc365d633ebe Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 26 Jun 2024 18:12:00 +0200
Subject: [PATCH 011/448] review updates

- make get_blas_handle()/get_sparselib_handle() the only non-deprecated
  way of getting handles
- fix header include order

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
---
 benchmark/utils/cuda_linops.cpp           | 16 ++++++++--------
 benchmark/utils/hip_linops.hip.cpp        |  8 ++++----
 cuda/solver/common_trs_kernels.cuh        |  4 ++--
 hip/base/exception.hip.cpp                |  4 +++-
 hip/base/hipblas_bindings.hip.hpp         |  2 +-
 hip/base/hiprand_bindings.hip.hpp         |  2 +-
 hip/base/hipsparse_bindings.hip.hpp       |  2 +-
 hip/base/hipsparse_block_bindings.hip.hpp |  2 +-
 hip/base/pointer_mode_guard.hip.hpp       |  4 +++-
 hip/base/types.hip.hpp                    |  4 +++-
 hip/matrix/fft_kernels.hip.cpp            |  4 +++-
 hip/solver/common_trs_kernels.hip.hpp     |  2 +-
 hip/solver/lower_trs_kernels.hip.cpp      |  2 +-
 hip/solver/upper_trs_kernels.hip.cpp      |  2 +-
 include/ginkgo/core/base/executor.hpp     | 20 ++++++++++++--------
 15 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index f239740d655..1222301a4cf 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -139,7 +139,7 @@ class CusparseCsrmp
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv_mp(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -213,7 +213,7 @@ class CusparseCsr
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -288,7 +288,7 @@ class CusparseCsrmm
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmm(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -376,7 +376,7 @@ class CusparseCsrEx
         gko::size_type buffer_size = 0;
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        auto handle = this->get_gpu_exec()->get_cusparse_handle();
+        auto handle = this->get_gpu_exec()->get_sparselib_handle();
         // This function seems to require the pointer mode to be set to HOST.
         // Ginkgo use pointer mode DEVICE by default, so we change this
         // temporarily.
@@ -465,7 +465,7 @@ class CusparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::csr2hyb(
-            this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0],
+            this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0],
             this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
             t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
             Threshold, Partition);
@@ -496,7 +496,7 @@ class CusparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
             &scalars.get_const_data()[1], dx);
     }
@@ -555,13 +555,13 @@ void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
 
     gko::size_type buffer_size = 0;
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize(
-        gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0],
+        gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
         mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg,
         &buffer_size));
     gko::array<char> buffer_array(gpu_exec, buffer_size);
     auto dbuffer = buffer_array.get_data();
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV(
-        gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0],
+        gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
         mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer));
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx));
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb));
diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp
index 2d952ce60e9..bcbeee5ca14 100644
--- a/benchmark/utils/hip_linops.hip.cpp
+++ b/benchmark/utils/hip_linops.hip.cpp
@@ -126,7 +126,7 @@ class HipsparseCsr
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmv(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -201,7 +201,7 @@ class HipsparseCsrmm
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmm(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -269,7 +269,7 @@ class HipsparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::csr2hyb(
-            this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0],
+            this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0],
             this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
             t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
             Threshold, Partition);
@@ -300,7 +300,7 @@ class HipsparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmv(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
             &scalars.get_const_data()[1], dx);
     }
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 549925bf2e7..9013f9172bc 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -66,7 +66,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
     CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
                     const matrix::Csr<ValueType, IndexType>* matrix,
                     size_type num_rhs, bool is_upper, bool unit_diag)
-        : handle{exec->get_cusparse_handle()},
+        : handle{exec->get_sparselib_handle()},
           spsm_descr{},
           descr_a{},
           num_rhs{num_rhs},
@@ -189,7 +189,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                     const matrix::Csr<ValueType, IndexType>* matrix,
                     size_type num_rhs, bool is_upper, bool unit_diag)
         : exec{exec},
-          handle{exec->get_cusparse_handle()},
+          handle{exec->get_sparselib_handle()},
           algorithm{},
           solve_info{},
           policy{},
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index 3f569576c28..f0e17f4e873 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -8,7 +8,6 @@
 #include <string>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hiprand/hiprand.h>
@@ -23,6 +22,9 @@
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index 725c7e20698..d5dc94d6138 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -18,6 +17,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
 
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 1dd772db250..9fd7ade8231 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hiprand/hiprand.h>
 #else
@@ -17,6 +16,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
 
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 997fc3d525f..0337f0a03c6 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -18,6 +17,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
 
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
index c69b0353f22..6fb70c4571c 100644
--- a/hip/base/hipsparse_block_bindings.hip.hpp
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -17,6 +16,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index 2c980b113a7..5cd4b3ec58f 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -9,7 +9,6 @@
 #include <exception>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hipsparse/hipsparse.h>
@@ -24,6 +23,9 @@
 #include <ginkgo/core/base/std_extensions.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace hip {
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index cb33cbf5df8..9ae2224c064 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -16,7 +16,6 @@
 #include <hip/hip_fp16.h>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -28,6 +27,9 @@
 #include <ginkgo/core/base/matrix_data.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index 92358d732c7..31e180b4414 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <array>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipfft/hipfft.h>
 #else
@@ -21,6 +20,9 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index daab3a387e6..555a62d57a0 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -10,7 +10,6 @@
 #include <memory>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -23,6 +22,7 @@
 
 
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/matrix/dense_kernels.hpp"
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 1a43b3c0151..d355940a487 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <memory>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,6 +20,7 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index bcb63a26bc8..2a31e450d27 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <memory>
 
 
-#include "common/cuda_hip/base/runtime.hpp"
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,6 +20,7 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index ca20dba9007..761405c0b3d 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1600,21 +1600,23 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      *
      * @return  the cublas handle (cublasContext*) for this executor
      */
-    cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
+    GKO_DEPRECATED("use get_blas_handle() instead")
+    cublasContext* get_cublas_handle() const { return get_blas_handle(); }
 
     /**
      * @copydoc get_cublas_handle()
      */
-    cublasContext* get_blas_handle() const { return get_cublas_handle(); }
+    cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
 
     /**
      * Get the cusparse handle for this executor
      *
      * @return the cusparse handle (cusparseContext*) for this executor
      */
+    GKO_DEPRECATED("use get_sparselib_handle() instead")
     cusparseContext* get_cusparse_handle() const
     {
-        return cusparse_handle_.get();
+        return get_sparselib_handle();
     }
 
     /**
@@ -1622,7 +1624,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      */
     cusparseContext* get_sparselib_handle() const
     {
-        return get_cusparse_handle();
+        return cusparse_handle_.get();
     }
 
     /**
@@ -1818,21 +1820,23 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      *
      * @return  the hipblas handle (hipblasContext*) for this executor
      */
-    hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
+    GKO_DEPRECATED("use get_blas_handle() instead")
+    hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
 
     /**
      * @copydoc get_hipblas_handle()
      */
-    hipblasContext* get_blas_handle() const { return get_hipblas_handle(); }
+    hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
 
     /**
      * Get the hipsparse handle for this executor
      *
      * @return the hipsparse handle (hipsparseContext*) for this executor
      */
+    GKO_DEPRECATED("use get_sparselib_handle() instead")
     hipsparseContext* get_hipsparse_handle() const
     {
-        return hipsparse_handle_.get();
+        return get_sparselib_handle();
     }
 
     /**
@@ -1840,7 +1844,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      */
     hipsparseContext* get_sparselib_handle() const
     {
-        return get_hipsparse_handle();
+        return hipsparse_handle_.get();
     }
 
     /**

From 1e2d818d1ce6e43fc66916f1ea9451d9196b65ae Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 26 Jun 2024 18:13:00 +0200
Subject: [PATCH 012/448] disable cuSPARSE deprecation warnings

---
 cuda/base/cusparse_bindings.hpp         | 17 ++++-------------
 cuda/base/cusparse_block_bindings.hpp   |  2 ++
 cuda/matrix/csr_kernels.template.cu     |  2 +-
 hip/matrix/csr_kernels.template.hip.cpp |  2 +-
 4 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index 06aaf0c6f1d..c18e1d7e9a6 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -940,6 +940,7 @@ inline void destroy(csrsm2Info_t info)
 #endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11031)
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 inline csrilu02Info_t create_ilu0_info()
 {
     csrilu02Info_t info{};
@@ -966,6 +967,7 @@ inline void destroy_ic0_info(csric02Info_t info)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info));
 }
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 11031))
@@ -1174,19 +1176,6 @@ void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a,
 #endif  // (defined(CUDA_VERSION) && (CUDA_VERSION >= 11031))
 
 
-template <typename IndexType>
-void create_identity_permutation(cusparseHandle_t handle, IndexType size,
-                                 IndexType* permutation) GKO_NOT_IMPLEMENTED;
-
-template <>
-inline void create_identity_permutation<int32>(cusparseHandle_t handle,
-                                               int32 size, int32* permutation)
-{
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(
-        cusparseCreateIdentityPermutation(handle, size, permutation));
-}
-
-
 template <typename IndexType>
 void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n,
                          IndexType nnz, const IndexType* row_ptrs,
@@ -1264,6 +1253,7 @@ inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in,
 #endif
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 template <typename ValueType, typename IndexType>
 void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz,
                       const cusparseMatDescr_t descr, const ValueType* vals,
@@ -1458,6 +1448,7 @@ GKO_BIND_CUSPARSE_IC0(float, cusparseScsric02);
 GKO_BIND_CUSPARSE_IC0(double, cusparseDcsric02);
 GKO_BIND_CUSPARSE_IC0(std::complex<float>, cusparseCcsric02);
 GKO_BIND_CUSPARSE_IC0(std::complex<double>, cusparseZcsric02);
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 #undef GKO_BIND_CUSPARSE_IC0
 
diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp
index fc64c19796c..c3db763f0da 100644
--- a/cuda/base/cusparse_block_bindings.hpp
+++ b/cuda/base/cusparse_block_bindings.hpp
@@ -190,6 +190,7 @@ GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex<double>, cusparseZgebsr2gebsc);
 #undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 inline std::unique_ptr<std::remove_pointer_t<bsrsm2Info_t>,
                        std::function<void(bsrsm2Info_t)>>
 create_bsr_trsm_info()
@@ -457,6 +458,7 @@ GKO_BIND_CUSPARSE_BILU0(std::complex<float>, cusparseCbsrilu02);
 GKO_BIND_CUSPARSE_BILU0(std::complex<double>, cusparseZbsrilu02);
 
 #undef GKO_BIND_CUSPARSE_BILU0
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 
 }  // namespace cusparse
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index 73ce267ec65..a0a7e4e97b8 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -968,7 +968,7 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        sparselib::create_identity_permutation(handle, nnz, permutation);
+        components::fill_seq_array(exec, permutation, nnz);
 
         // allocate buffer
         size_type buffer_size{};
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 31debd60a3d..8b3579f049c 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -772,7 +772,7 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        sparselib::create_identity_permutation(handle, nnz, permutation);
+        components::fill_seq_array(exec, permutation, nnz);
 
         // allocate buffer
         size_type buffer_size{};

From eee11062ecfa37a8a6c5a2c1bc73f463f9984a06 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 27 Jun 2024 16:16:20 +0200
Subject: [PATCH 013/448] replace remaining usages of vendor-specific sparselib macros

---
 benchmark/utils/cuda_linops.cpp       | 14 +++++++-------
 benchmark/utils/hip_linops.hip.cpp    |  6 +++---
 cuda/solver/common_trs_kernels.cuh    | 24 ++++++++++++------------
 hip/solver/common_trs_kernels.hip.hpp |  8 ++++----
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index 1222301a4cf..a404f9151ea 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -156,7 +156,7 @@ class CusparseCsrmp
         : gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -230,7 +230,7 @@ class CusparseCsr
         : gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -306,7 +306,7 @@ class CusparseCsrmm
         : gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -407,7 +407,7 @@ class CusparseCsrEx
         : gko::EnableLinOp<CusparseCsrEx, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE),
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE),
           buffer_(exec)
     {
         algmode_ = CUSPARSE_ALG_MERGE_PATH;
@@ -508,7 +508,7 @@ class CusparseHybrid
     CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
                    const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
@@ -654,7 +654,7 @@ class CusparseGenericCsr
         : gko::EnableLinOp<CusparseGenericCsr, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -745,7 +745,7 @@ class CusparseGenericCoo
                        const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<CusparseGenericCoo, CusparseBase>(exec, size),
           coo_(std::move(coo::create(exec))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp
index bcbeee5ca14..f0d7edb45c3 100644
--- a/benchmark/utils/hip_linops.hip.cpp
+++ b/benchmark/utils/hip_linops.hip.cpp
@@ -143,7 +143,7 @@ class HipsparseCsr
         : gko::EnableLinOp<HipsparseCsr, HipsparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -219,7 +219,7 @@ class HipsparseCsrmm
         : gko::EnableLinOp<HipsparseCsrmm, HipsparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -312,7 +312,7 @@ class HipsparseHybrid
     HipsparseHybrid(std::shared_ptr<const gko::Executor> exec,
                     const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<HipsparseHybrid, HipsparseBase>(exec, size),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_));
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 9013f9172bc..992974e95ef 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -102,14 +102,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
             reinterpret_cast<ValueType*>(0xDEAF0));
 
         auto work_size = sparselib::spsm_buffer_size(
-            handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a,
+            handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a,
             descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
         work.resize_and_reset(work_size);
 
-        sparselib::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
+        sparselib::spsm_analysis(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                                 SPARSELIB_OPERATION_NON_TRANSPOSE,
                                  one<ValueType>(), descr_a, descr_b, descr_c,
                                  CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
                                  work.get_data());
@@ -141,8 +141,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         auto descr_c = sparselib::create_dnmat(
             output->get_size(), output->get_stride(), output->get_values());
 
-        sparselib::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                              CUSPARSE_OPERATION_NON_TRANSPOSE,
+        sparselib::spsm_solve(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                              SPARSELIB_OPERATION_NON_TRANSPOSE,
                               one<ValueType>(), descr_a, descr_b, descr_c,
                               CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
@@ -215,8 +215,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         size_type work_size{};
 
         sparselib::buffer_size_ext(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
             matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
@@ -226,8 +226,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         work.resize_and_reset(work_size);
 
         sparselib::csrsm2_analysis(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
             matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
@@ -253,8 +253,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         sparselib::pointer_mode_guard pm_guard(handle);
         dense::copy(exec, input, output);
         sparselib::csrsm2_solve(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0],
             output->get_stride(), matrix->get_num_stored_elements(),
             one<ValueType>(), factor_descr, matrix->get_const_values(),
             matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index 555a62d57a0..9fac4be8547 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -125,7 +125,7 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
             {
                 sparselib::pointer_mode_guard pm_guard(handle);
                 sparselib::csrsv2_buffer_size(
-                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                    handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
                     matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
@@ -140,7 +140,7 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
                     exec->alloc<void*>(hip_solve_struct->factor_work_size);
 
                 sparselib::csrsv2_analysis(
-                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                    handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
                     matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
@@ -180,7 +180,7 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                 sparselib::pointer_mode_guard pm_guard(handle);
                 if (b->get_stride() == 1) {
                     sparselib::csrsv2_solve(
-                        handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                        handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                         matrix->get_size()[0],
                         matrix->get_num_stored_elements(), &one,
                         hip_solve_struct->factor_descr,
@@ -195,7 +195,7 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                     dense::transpose(exec, x, trans_x);
                     for (IndexType i = 0; i < trans_b->get_size()[0]; i++) {
                         sparselib::csrsv2_solve(
-                            handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                            handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                             matrix->get_size()[0],
                             matrix->get_num_stored_elements(), &one,
                             hip_solve_struct->factor_descr,

From ce09e815e451cf96e423766b7ff4ac1341fa5750 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 15 Nov 2023 15:15:57 +0000
Subject: [PATCH 014/448] adds script to change main include to use "" instead
 of <>

TODO: remove or revert this commit
---
 dev_tools/scripts/change-main-include.py | 60 ++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100755 dev_tools/scripts/change-main-include.py

diff --git a/dev_tools/scripts/change-main-include.py b/dev_tools/scripts/change-main-include.py
new file mode 100755
index 00000000000..7ee5e8cd922
--- /dev/null
+++ b/dev_tools/scripts/change-main-include.py
@@ -0,0 +1,60 @@
+#! /usr/bin/env python3
+"""Rewrite the leading `#include <ginkgo/core/...>` of each given file to use quotes.
+
+Usage: change-main-include.py FILE...  Files the heuristic is unsure about go to stderr.
+"""
+import collections
+import re
+import sys
+
+test_subdirectories = [
+    "base", "config", "distributed", "factorization",
+    "log", "matrix", "multigrid", "preconditioner",
+    "reorder", "solver", "stop", "synthesizer"
+]
+
+false_positives = [
+    "test/utils/executor.hpp",
+    "test/utils/mpi/executor.hpp"
+]
+
+suffix = re.compile(r"(\.cpp|\.cu|\.inc)$")
+# Captures the directive and the header path so only the include's own brackets change.
+main_include_re = re.compile(r"(#include\s*)<(ginkgo/core[^>]*)>")
+Match = collections.namedtuple("Match", ["idx", "line"])
+
+
+for filename in sys.argv[1:]:
+    if not suffix.search(filename):
+        continue
+    if any(f"test/{subdir}" in filename for subdir in test_subdirectories):
+        continue
+    if any(filename.endswith(fp) for fp in false_positives):
+        continue
+
+    with open(filename, 'r') as file:
+        content = file.readlines()
+
+    # Heuristic: the first #include line of the file is treated as its main include.
+    first_include = next((Match(idx=i, line=l) for i, l in enumerate(content)
+                          if l.startswith("#include")), Match(idx=-1, line=""))
+    if "<ginkgo/core" not in first_include.line:
+        continue
+
+    # Offset (relative to the first include) and text of the next non-blank line, if any.
+    following = enumerate(content[first_include.idx + 1:])
+    next_idx, next_line = next(((i, l) for i, l in following if l.strip()), (None, ""))
+    if next_idx is None:
+        continue
+    # Directly followed (no blank line) by #if or another ginkgo include: skip silently.
+    if next_idx == 0 and (next_line.startswith("#if") or "<ginkgo/core" in next_line):
+        continue
+    if not next_line.startswith("#include") or next_line.startswith('#include "'):
+        # Uncertain if the first include is the main include
+        print(filename, file=sys.stderr)
+        continue
+
+    # count=1 and the anchored pattern leave any '<'/'>' in a trailing comment untouched.
+    content[first_include.idx] = main_include_re.sub(r'\1"\2"', first_include.line, count=1)
+    with open(filename, 'w') as file:
+        file.writelines(content)

From 88f2197af98e705fa85aa3f1bbec1885b7377e31 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:12:51 +0200
Subject: [PATCH 015/448] automatically change the 'main' include

changes from: `./dev_tools/scripts/change-main-include.py $(git ls-files)`

This is done based on the heuristic that the first include is the main include, except in some cases.

Here are the cases where the heuristic can't determine either way:

benchmark/utils/mpi_timer.cpp
benchmark/utils/tuning_variables.cpp
core/base/mpi.cpp
core/base/segmented_array.cpp
core/base/version.cpp
core/config/multigrid_config.cpp
core/distributed/index_map.cpp
core/distributed/partition.cpp
core/log/logger.cpp
core/preconditioner/batch_jacobi.cpp
core/stop/combined.cpp
core/stop/criterion.cpp
core/stop/iteration.cpp
core/stop/time.cpp
cuda/base/version.cpp
devices/cuda/executor.cpp
devices/hip/executor.cpp
dpcpp/base/version.dp.cpp
hip/base/version.hip.cpp
omp/base/version.cpp
reference/base/version.cpp
---
 core/base/array.cpp                         | 2 +-
 core/base/batch_multi_vector.cpp            | 2 +-
 core/base/block_operator.cpp                | 2 +-
 core/base/combination.cpp                   | 2 +-
 core/base/composition.cpp                   | 2 +-
 core/base/dense_cache.cpp                   | 2 +-
 core/base/device_matrix_data.cpp            | 2 +-
 core/base/executor.cpp                      | 2 +-
 core/base/index_set.cpp                     | 2 +-
 core/base/memory.cpp                        | 2 +-
 core/base/mtx_io.cpp                        | 2 +-
 core/base/perturbation.cpp                  | 2 +-
 core/base/timer.cpp                         | 2 +-
 core/config/config.cpp                      | 2 +-
 core/config/property_tree.cpp               | 2 +-
 core/config/registry.cpp                    | 2 +-
 core/config/type_descriptor.cpp             | 2 +-
 core/distributed/matrix.cpp                 | 2 +-
 core/distributed/partition_helpers.cpp      | 2 +-
 core/distributed/preconditioner/schwarz.cpp | 2 +-
 core/distributed/vector.cpp                 | 2 +-
 core/factorization/cholesky.cpp             | 2 +-
 core/factorization/factorization.cpp        | 2 +-
 core/factorization/ic.cpp                   | 2 +-
 core/factorization/ilu.cpp                  | 2 +-
 core/factorization/lu.cpp                   | 2 +-
 core/factorization/par_ic.cpp               | 2 +-
 core/factorization/par_ict.cpp              | 2 +-
 core/factorization/par_ilu.cpp              | 2 +-
 core/factorization/par_ilut.cpp             | 2 +-
 core/log/batch_logger.cpp                   | 2 +-
 core/log/convergence.cpp                    | 2 +-
 core/log/papi.cpp                           | 2 +-
 core/log/performance_hint.cpp               | 2 +-
 core/log/profiler_hook.cpp                  | 2 +-
 core/log/record.cpp                         | 2 +-
 core/log/stream.cpp                         | 2 +-
 core/matrix/batch_csr.cpp                   | 2 +-
 core/matrix/batch_dense.cpp                 | 2 +-
 core/matrix/batch_ell.cpp                   | 2 +-
 core/matrix/batch_identity.cpp              | 2 +-
 core/matrix/coo.cpp                         | 2 +-
 core/matrix/csr.cpp                         | 2 +-
 core/matrix/dense.cpp                       | 2 +-
 core/matrix/diagonal.cpp                    | 2 +-
 core/matrix/ell.cpp                         | 2 +-
 core/matrix/fbcsr.cpp                       | 2 +-
 core/matrix/fft.cpp                         | 2 +-
 core/matrix/hybrid.cpp                      | 2 +-
 core/matrix/identity.cpp                    | 2 +-
 core/matrix/permutation.cpp                 | 2 +-
 core/matrix/row_gatherer.cpp                | 2 +-
 core/matrix/scaled_permutation.cpp          | 2 +-
 core/matrix/sellp.cpp                       | 2 +-
 core/matrix/sparsity_csr.cpp                | 2 +-
 core/multigrid/fixed_coarsening.cpp         | 2 +-
 core/multigrid/pgm.cpp                      | 2 +-
 core/preconditioner/ic.cpp                  | 2 +-
 core/preconditioner/ilu.cpp                 | 2 +-
 core/preconditioner/isai.cpp                | 2 +-
 core/preconditioner/jacobi.cpp              | 2 +-
 core/reorder/amd.cpp                        | 2 +-
 core/reorder/mc64.cpp                       | 2 +-
 core/reorder/nested_dissection.cpp          | 2 +-
 core/reorder/rcm.cpp                        | 2 +-
 core/reorder/scaled_reordered.cpp           | 2 +-
 core/solver/batch_bicgstab.cpp              | 2 +-
 core/solver/batch_cg.cpp                    | 2 +-
 core/solver/bicg.cpp                        | 2 +-
 core/solver/bicgstab.cpp                    | 2 +-
 core/solver/cb_gmres.cpp                    | 2 +-
 core/solver/cg.cpp                          | 2 +-
 core/solver/cgs.cpp                         | 2 +-
 core/solver/direct.cpp                      | 2 +-
 core/solver/fcg.cpp                         | 2 +-
 core/solver/gcr.cpp                         | 2 +-
 core/solver/gmres.cpp                       | 2 +-
 core/solver/idr.cpp                         | 2 +-
 core/solver/ir.cpp                          | 2 +-
 core/solver/multigrid.cpp                   | 2 +-
 core/stop/residual_norm.cpp                 | 2 +-
 cuda/base/exception.cpp                     | 2 +-
 cuda/base/executor.cpp                      | 2 +-
 cuda/base/memory.cpp                        | 2 +-
 cuda/base/stream.cpp                        | 2 +-
 cuda/base/timer.cpp                         | 2 +-
 devices/dpcpp/executor.cpp                  | 2 +-
 devices/omp/executor.cpp                    | 2 +-
 dpcpp/base/executor.dp.cpp                  | 2 +-
 dpcpp/base/timer.dp.cpp                     | 2 +-
 hip/base/device.hip.cpp                     | 2 +-
 hip/base/exception.hip.cpp                  | 2 +-
 hip/base/executor.hip.cpp                   | 2 +-
 hip/base/memory.hip.cpp                     | 2 +-
 hip/base/stream.hip.cpp                     | 2 +-
 hip/base/timer.hip.cpp                      | 2 +-
 omp/base/executor.cpp                       | 2 +-
 97 files changed, 97 insertions(+), 97 deletions(-)

diff --git a/core/base/array.cpp b/core/base/array.cpp
index 44142e0fa2d..f529e3cf9d2 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/array.hpp>
+#include "ginkgo/core/base/array.hpp"
 
 
 #include <ginkgo/core/base/math.hpp>
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index c4ec023e323..960158654f2 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include "ginkgo/core/base/batch_multi_vector.hpp"
 
 
 #include <algorithm>
diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp
index b8190bad02d..43ac79c3c0e 100644
--- a/core/base/block_operator.cpp
+++ b/core/base/block_operator.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/block_operator.hpp>
+#include "ginkgo/core/base/block_operator.hpp"
 
 
 #include <utility>
diff --git a/core/base/combination.cpp b/core/base/combination.cpp
index 01d1d197820..324fa8d4ddf 100644
--- a/core/base/combination.cpp
+++ b/core/base/combination.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/combination.hpp>
+#include "ginkgo/core/base/combination.hpp"
 
 
 #include <ginkgo/core/base/precision_dispatch.hpp>
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index cf3789c45a7..515fb425633 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/composition.hpp>
+#include "ginkgo/core/base/composition.hpp"
 
 
 #include <algorithm>
diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp
index e321b38b442..50e1abc3977 100644
--- a/core/base/dense_cache.cpp
+++ b/core/base/dense_cache.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/dense_cache.hpp>
+#include "ginkgo/core/base/dense_cache.hpp"
 
 
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index 4190ee4f6d0..085054cbd69 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/device_matrix_data.hpp>
+#include "ginkgo/core/base/device_matrix_data.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/base/executor.cpp b/core/base/executor.cpp
index a0efdc2291e..1fb1703c56f 100644
--- a/core/base/executor.cpp
+++ b/core/base/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/base/index_set.cpp b/core/base/index_set.cpp
index c40f57586b8..b27d3803448 100644
--- a/core/base/index_set.cpp
+++ b/core/base/index_set.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/index_set.hpp>
+#include "ginkgo/core/base/index_set.hpp"
 
 
 #include <algorithm>
diff --git a/core/base/memory.cpp b/core/base/memory.cpp
index be3b231dedd..0b3e0ce833b 100644
--- a/core/base/memory.cpp
+++ b/core/base/memory.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/memory.hpp>
+#include "ginkgo/core/base/memory.hpp"
 
 
 #include <new>
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index a8208593096..e2f2dbf5d9b 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/mtx_io.hpp>
+#include "ginkgo/core/base/mtx_io.hpp"
 
 
 #include <algorithm>
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index 89b7f9d67fd..94a4975cfa0 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/perturbation.hpp>
+#include "ginkgo/core/base/perturbation.hpp"
 
 
 #include <ginkgo/core/base/precision_dispatch.hpp>
diff --git a/core/base/timer.cpp b/core/base/timer.cpp
index eb060d1bbce..abd5fbf61cd 100644
--- a/core/base/timer.cpp
+++ b/core/base/timer.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/timer.hpp>
+#include "ginkgo/core/base/timer.hpp"
 
 
 #include <chrono>
diff --git a/core/config/config.cpp b/core/config/config.cpp
index 291c7cab41c..87dd49b6c03 100644
--- a/core/config/config.cpp
+++ b/core/config/config.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/config.hpp>
+#include "ginkgo/core/config/config.hpp"
 
 
 #include <map>
diff --git a/core/config/property_tree.cpp b/core/config/property_tree.cpp
index 47e627d21e6..1ab33712953 100644
--- a/core/config/property_tree.cpp
+++ b/core/config/property_tree.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/property_tree.hpp>
+#include "ginkgo/core/config/property_tree.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/config/registry.cpp b/core/config/registry.cpp
index 8ff619b4250..8b8bdbcaf0d 100644
--- a/core/config/registry.cpp
+++ b/core/config/registry.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/registry.hpp>
+#include "ginkgo/core/config/registry.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index c2885407cad..cbc29c5088a 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/type_descriptor.hpp>
+#include "ginkgo/core/config/type_descriptor.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 1dcddbd1a6a..2d2d1304769 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/matrix.hpp>
+#include "ginkgo/core/distributed/matrix.hpp"
 
 
 #include <ginkgo/core/base/precision_dispatch.hpp>
diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp
index 50b9bee0e5f..1a55daf8134 100644
--- a/core/distributed/partition_helpers.cpp
+++ b/core/distributed/partition_helpers.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/partition_helpers.hpp>
+#include "ginkgo/core/distributed/partition_helpers.hpp"
 
 
 #include <numeric>
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 3bf61ac43d0..2def0a0f85c 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
+#include "ginkgo/core/distributed/preconditioner/schwarz.hpp"
 
 
 #include <memory>
diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp
index 52993faa4bd..2e57fcf7451 100644
--- a/core/distributed/vector.cpp
+++ b/core/distributed/vector.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/vector.hpp>
+#include "ginkgo/core/distributed/vector.hpp"
 
 
 #include <ginkgo/core/distributed/partition.hpp>
diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index 63bbde4f2fd..12456df4abc 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/cholesky.hpp>
+#include "ginkgo/core/factorization/cholesky.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 00cdd12648d..597fc7b48f4 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/factorization.hpp>
+#include "ginkgo/core/factorization/factorization.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 763a6364d09..67fb3df5b46 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/ic.hpp>
+#include "ginkgo/core/factorization/ic.hpp"
 
 
 #include <memory>
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 5ae4ccb9654..15f3cef1831 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/ilu.hpp>
+#include "ginkgo/core/factorization/ilu.hpp"
 
 
 #include <memory>
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index e0da4ceb429..8ab1ddfc37f 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/lu.hpp>
+#include "ginkgo/core/factorization/lu.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp
index 3bd415257f7..c21f66934aa 100644
--- a/core/factorization/par_ic.cpp
+++ b/core/factorization/par_ic.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ic.hpp>
+#include "ginkgo/core/factorization/par_ic.hpp"
 
 
 #include <memory>
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
index 8a7f8297f7e..54176d79545 100644
--- a/core/factorization/par_ict.cpp
+++ b/core/factorization/par_ict.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ict.hpp>
+#include "ginkgo/core/factorization/par_ict.hpp"
 
 
 #include <memory>
diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index 963b085d76f..f69947adcac 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilu.hpp>
+#include "ginkgo/core/factorization/par_ilu.hpp"
 
 
 #include <memory>
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
index da45642490b..ff4b5b2a83e 100644
--- a/core/factorization/par_ilut.cpp
+++ b/core/factorization/par_ilut.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilut.hpp>
+#include "ginkgo/core/factorization/par_ilut.hpp"
 
 
 #include <memory>
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index e18ecd2d5e9..532cae64c28 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/batch_logger.hpp>
+#include "ginkgo/core/log/batch_logger.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
index 51dc3cc32c8..16c89e08ffc 100644
--- a/core/log/convergence.cpp
+++ b/core/log/convergence.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/convergence.hpp>
+#include "ginkgo/core/log/convergence.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/log/papi.cpp b/core/log/papi.cpp
index ce23eb8ee29..83a9bd3b93c 100644
--- a/core/log/papi.cpp
+++ b/core/log/papi.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/papi.hpp>
+#include "ginkgo/core/log/papi.hpp"
 
 
 #include <ginkgo/core/base/dim.hpp>
diff --git a/core/log/performance_hint.cpp b/core/log/performance_hint.cpp
index fb06fdf4be8..3b0a720aa93 100644
--- a/core/log/performance_hint.cpp
+++ b/core/log/performance_hint.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/performance_hint.hpp>
+#include "ginkgo/core/log/performance_hint.hpp"
 
 
 #include <iomanip>
diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp
index a8eef7668f2..87ea8f42d02 100644
--- a/core/log/profiler_hook.cpp
+++ b/core/log/profiler_hook.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/profiler_hook.hpp>
+#include "ginkgo/core/log/profiler_hook.hpp"
 
 
 #include <memory>
diff --git a/core/log/record.cpp b/core/log/record.cpp
index f58f6747ff5..6d995cd348c 100644
--- a/core/log/record.cpp
+++ b/core/log/record.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/record.hpp>
+#include "ginkgo/core/log/record.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/log/stream.cpp b/core/log/stream.cpp
index c02ecc77b09..033575c9b54 100644
--- a/core/log/stream.cpp
+++ b/core/log/stream.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/stream.hpp>
+#include "ginkgo/core/log/stream.hpp"
 
 
 #include <iomanip>
diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp
index 96301f3e97b..8e4b1434f8e 100644
--- a/core/matrix/batch_csr.cpp
+++ b/core/matrix/batch_csr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_csr.hpp>
+#include "ginkgo/core/matrix/batch_csr.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index ea6a19aa21c..a2eb017cf7c 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_dense.hpp>
+#include "ginkgo/core/matrix/batch_dense.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 0db9640b406..5c3da632643 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_ell.hpp>
+#include "ginkgo/core/matrix/batch_ell.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp
index 0de3101a62a..480f0a10474 100644
--- a/core/matrix/batch_identity.cpp
+++ b/core/matrix/batch_identity.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_identity.hpp>
+#include "ginkgo/core/matrix/batch_identity.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 9530dbd2624..eb8b33c0cf1 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/coo.hpp>
+#include "ginkgo/core/matrix/coo.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index a84298b6f95..8dad86568fb 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/csr.hpp>
+#include "ginkgo/core/matrix/csr.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index d1f3da8e166..eb52c574db9 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/dense.hpp>
+#include "ginkgo/core/matrix/dense.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp
index b6fe45dd5d0..08b1e00e340 100644
--- a/core/matrix/diagonal.cpp
+++ b/core/matrix/diagonal.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/diagonal.hpp>
+#include "ginkgo/core/matrix/diagonal.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index ec51627e058..f6433fe156a 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/ell.hpp>
+#include "ginkgo/core/matrix/ell.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 8141853a229..1ea00d741bd 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
+#include "ginkgo/core/matrix/fbcsr.hpp"
 
 
 #include <limits>
diff --git a/core/matrix/fft.cpp b/core/matrix/fft.cpp
index f86e8b94cf0..1ec69ce3338 100644
--- a/core/matrix/fft.cpp
+++ b/core/matrix/fft.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fft.hpp>
+#include "ginkgo/core/matrix/fft.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index 920d5b39ed1..c30c60ce0fb 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/hybrid.hpp>
+#include "ginkgo/core/matrix/hybrid.hpp"
 
 
 #include <algorithm>
diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp
index 5f264ad2a6d..a58601f31f0 100644
--- a/core/matrix/identity.cpp
+++ b/core/matrix/identity.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/identity.hpp>
+#include "ginkgo/core/matrix/identity.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp
index 84989c7eddb..76f5d7c8005 100644
--- a/core/matrix/permutation.cpp
+++ b/core/matrix/permutation.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/permutation.hpp>
+#include "ginkgo/core/matrix/permutation.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp
index 0f570cda410..72a6cbe2808 100644
--- a/core/matrix/row_gatherer.cpp
+++ b/core/matrix/row_gatherer.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/row_gatherer.hpp>
+#include "ginkgo/core/matrix/row_gatherer.hpp"
 
 
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp
index ff1246a1299..c948c6071ad 100644
--- a/core/matrix/scaled_permutation.cpp
+++ b/core/matrix/scaled_permutation.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/scaled_permutation.hpp>
+#include "ginkgo/core/matrix/scaled_permutation.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 636fc3907ae..39e2c706b19 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sellp.hpp>
+#include "ginkgo/core/matrix/sellp.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index ed66ad09efb..2ec463613b0 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
+#include "ginkgo/core/matrix/sparsity_csr.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp
index 413614abf28..e7024d334ad 100644
--- a/core/multigrid/fixed_coarsening.cpp
+++ b/core/multigrid/fixed_coarsening.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/fixed_coarsening.hpp>
+#include "ginkgo/core/multigrid/fixed_coarsening.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index d60835ca944..f0393794d94 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/pgm.hpp>
+#include "ginkgo/core/multigrid/pgm.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp
index c4613d30ea6..37eb0cb5b3f 100644
--- a/core/preconditioner/ic.cpp
+++ b/core/preconditioner/ic.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/ic.hpp>
+#include "ginkgo/core/preconditioner/ic.hpp"
 
 
 #include <ginkgo/core/base/types.hpp>
diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp
index 652ade0152c..00422300172 100644
--- a/core/preconditioner/ilu.cpp
+++ b/core/preconditioner/ilu.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/ilu.hpp>
+#include "ginkgo/core/preconditioner/ilu.hpp"
 
 
 #include <ginkgo/core/base/types.hpp>
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
index b10eec36691..f825e2f5c82 100644
--- a/core/preconditioner/isai.cpp
+++ b/core/preconditioner/isai.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/isai.hpp>
+#include "ginkgo/core/preconditioner/isai.hpp"
 
 
 #include <functional>
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index b0e8224d06f..8081f31712a 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
+#include "ginkgo/core/preconditioner/jacobi.hpp"
 
 
 #include <memory>
diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp
index 1b3198b248f..7cb24c39ea0 100644
--- a/core/reorder/amd.cpp
+++ b/core/reorder/amd.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/amd.hpp>
+#include "ginkgo/core/reorder/amd.hpp"
 
 
 #include <cstddef>
diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp
index 1d4ad438a59..e47969c0b71 100644
--- a/core/reorder/mc64.cpp
+++ b/core/reorder/mc64.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/mc64.hpp>
+#include "ginkgo/core/reorder/mc64.hpp"
 
 
 #include <chrono>
diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp
index f609a15653c..bf9c8ba7a3d 100644
--- a/core/reorder/nested_dissection.cpp
+++ b/core/reorder/nested_dissection.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/nested_dissection.hpp>
+#include "ginkgo/core/reorder/nested_dissection.hpp"
 
 
 #include <memory>
diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp
index 5be8409ba79..f3a16cc92a6 100644
--- a/core/reorder/rcm.cpp
+++ b/core/reorder/rcm.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/rcm.hpp>
+#include "ginkgo/core/reorder/rcm.hpp"
 
 
 #include <memory>
diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp
index 8ee0035101d..cf246ea3194 100644
--- a/core/reorder/scaled_reordered.cpp
+++ b/core/reorder/scaled_reordered.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/scaled_reordered.hpp>
+#include "ginkgo/core/reorder/scaled_reordered.hpp"
 
 
 #include <utility>
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index f322e042d27..9621f058097 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_bicgstab.hpp>
+#include "ginkgo/core/solver/batch_bicgstab.hpp"
 
 
 #include <ginkgo/core/base/batch_lin_op.hpp>
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index 3efe95406e0..d2fe4a5f00d 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_cg.hpp>
+#include "ginkgo/core/solver/batch_cg.hpp"
 
 
 #include <ginkgo/core/base/batch_lin_op.hpp>
diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
index b5831c33ada..51ba251aecd 100644
--- a/core/solver/bicg.cpp
+++ b/core/solver/bicg.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicg.hpp>
+#include "ginkgo/core/solver/bicg.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp
index c6ae33918a1..e1f2f1cb77e 100644
--- a/core/solver/bicgstab.cpp
+++ b/core/solver/bicgstab.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicgstab.hpp>
+#include "ginkgo/core/solver/bicgstab.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp
index bb888d660e4..812c6c222ce 100644
--- a/core/solver/cb_gmres.cpp
+++ b/core/solver/cb_gmres.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cb_gmres.hpp>
+#include "ginkgo/core/solver/cb_gmres.hpp"
 
 
 #include <type_traits>
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index f83faf7e20f..a8e534588a0 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cg.hpp>
+#include "ginkgo/core/solver/cg.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp
index 6bb41338f77..9d6a575fdbf 100644
--- a/core/solver/cgs.cpp
+++ b/core/solver/cgs.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cgs.hpp>
+#include "ginkgo/core/solver/cgs.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp
index d540aa584f0..717fd71698f 100644
--- a/core/solver/direct.cpp
+++ b/core/solver/direct.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/direct.hpp>
+#include "ginkgo/core/solver/direct.hpp"
 
 
 #include <memory>
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index 5966664c14d..dee37467c46 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/fcg.hpp>
+#include "ginkgo/core/solver/fcg.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp
index 24fb36aa42b..cb2b55a3460 100644
--- a/core/solver/gcr.cpp
+++ b/core/solver/gcr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gcr.hpp>
+#include "ginkgo/core/solver/gcr.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index b261cf754eb..b0ad6baa01e 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gmres.hpp>
+#include "ginkgo/core/solver/gmres.hpp"
 
 
 #include <ginkgo/core/base/array.hpp>
diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp
index 9085876a85a..4bc56562d3b 100644
--- a/core/solver/idr.cpp
+++ b/core/solver/idr.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/idr.hpp>
+#include "ginkgo/core/solver/idr.hpp"
 
 
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp
index 16152dc63e9..3a6b0b1d2d0 100644
--- a/core/solver/ir.cpp
+++ b/core/solver/ir.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/ir.hpp>
+#include "ginkgo/core/solver/ir.hpp"
 
 
 #include <ginkgo/core/base/precision_dispatch.hpp>
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 6dd06747883..d7fc1d3c997 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/multigrid.hpp>
+#include "ginkgo/core/solver/multigrid.hpp"
 
 
 #include <complex>
diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index 44853670359..824ab87ec0f 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/residual_norm.hpp>
+#include "ginkgo/core/stop/residual_norm.hpp"
 
 
 #include <ginkgo/core/base/precision_dispatch.hpp>
diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp
index 24b5de36c6a..13557e3da50 100644
--- a/cuda/base/exception.cpp
+++ b/cuda/base/exception.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception.hpp>
+#include "ginkgo/core/base/exception.hpp"
 
 
 #include <string>
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 3d1dbf7c92c..c41bc6a72c6 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <iostream>
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index 5f36489744a..7949b07f78f 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/memory.hpp>
+#include "ginkgo/core/base/memory.hpp"
 
 
 #include <cuda.h>
diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp
index f0d8086398c..703c9958ecd 100644
--- a/cuda/base/stream.cpp
+++ b/cuda/base/stream.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/stream.hpp>
+#include "ginkgo/core/base/stream.hpp"
 
 
 #include <cuda_runtime.h>
diff --git a/cuda/base/timer.cpp b/cuda/base/timer.cpp
index 35759f82dd4..01b96c19536 100644
--- a/cuda/base/timer.cpp
+++ b/cuda/base/timer.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/timer.hpp>
+#include "ginkgo/core/base/timer.hpp"
 
 
 #include <cuda.h>
diff --git a/devices/dpcpp/executor.cpp b/devices/dpcpp/executor.cpp
index 323e9efeca6..435d9426374 100644
--- a/devices/dpcpp/executor.cpp
+++ b/devices/dpcpp/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <cstdlib>
diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp
index db3058c8371..448d7b68d63 100644
--- a/devices/omp/executor.cpp
+++ b/devices/omp/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <cstdlib>
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 58eeec10d17..159ee7eb533 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <algorithm>
diff --git a/dpcpp/base/timer.dp.cpp b/dpcpp/base/timer.dp.cpp
index e14ef40a439..da347b14ddf 100644
--- a/dpcpp/base/timer.dp.cpp
+++ b/dpcpp/base/timer.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/timer.hpp>
+#include "ginkgo/core/base/timer.hpp"
 
 
 #include <CL/sycl.hpp>
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index be897510056..d1d4325c6f1 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/device.hpp>
+#include "ginkgo/core/base/device.hpp"
 
 
 #include <ginkgo/config.hpp>
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index f0e17f4e873..05b030ad375 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception.hpp>
+#include "ginkgo/core/base/exception.hpp"
 
 
 #include <string>
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 4b5ce7afa7b..e371e48f489 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <iostream>
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 5fde8f518c6..27d510d784b 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/memory.hpp>
+#include "ginkgo/core/base/memory.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index b56c5104428..d5acb978e22 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/stream.hpp>
+#include "ginkgo/core/base/stream.hpp"
 
 
 #include <ginkgo/config.hpp>
diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp
index bd81d9f3be5..67a9a8153b6 100644
--- a/hip/base/timer.hip.cpp
+++ b/hip/base/timer.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/timer.hpp>
+#include "ginkgo/core/base/timer.hpp"
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp
index 7d969eb89f8..98ef2d528ae 100644
--- a/omp/base/executor.cpp
+++ b/omp/base/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 #include <omp.h>

From 2e227f70de4bdba33f0ab33caa15be53efb8f371 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:25:38 +0200
Subject: [PATCH 016/448] manually changed main header

---
 core/base/version.cpp                | 2 +-
 core/config/multigrid_config.cpp     | 2 +-
 core/distributed/index_map.cpp       | 2 +-
 core/distributed/partition.cpp       | 2 +-
 core/log/logger.cpp                  | 2 +-
 core/preconditioner/batch_jacobi.cpp | 2 +-
 core/stop/combined.cpp               | 2 +-
 core/stop/criterion.cpp              | 2 +-
 core/stop/iteration.cpp              | 2 +-
 core/stop/time.cpp                   | 2 +-
 cuda/base/version.cpp                | 2 +-
 devices/cuda/executor.cpp            | 2 +-
 devices/hip/executor.cpp             | 2 +-
 dpcpp/base/version.dp.cpp            | 2 +-
 hip/base/version.hip.cpp             | 2 +-
 omp/base/version.cpp                 | 2 +-
 reference/base/version.cpp           | 2 +-
 17 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/core/base/version.cpp b/core/base/version.cpp
index a7802a890dd..54f59eb7356 100644
--- a/core/base/version.cpp
+++ b/core/base/version.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {
diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp
index 6eb9f5ed872..553e6ca033d 100644
--- a/core/config/multigrid_config.cpp
+++ b/core/config/multigrid_config.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/pgm.hpp>
+#include "ginkgo/core/multigrid/pgm.hpp"
 
 
 #include "core/config/parse_macro.hpp"
diff --git a/core/distributed/index_map.cpp b/core/distributed/index_map.cpp
index 5d2a1aebe18..e24d8141b4d 100644
--- a/core/distributed/index_map.cpp
+++ b/core/distributed/index_map.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/index_map.hpp>
+#include "ginkgo/core/distributed/index_map.hpp"
 
 
 #include "core/distributed/index_map_kernels.hpp"
diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp
index 8a2fefcad79..5e6903de872 100644
--- a/core/distributed/partition.cpp
+++ b/core/distributed/partition.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/partition.hpp>
+#include "ginkgo/core/distributed/partition.hpp"
 
 
 #include "core/base/array_access.hpp"
diff --git a/core/log/logger.cpp b/core/log/logger.cpp
index f3d89a4657f..e141f1816dc 100644
--- a/core/log/logger.cpp
+++ b/core/log/logger.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/logger.hpp>
+#include "ginkgo/core/log/logger.hpp"
 
 
 namespace gko {
diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp
index 7d6ffa66848..3f18a32123f 100644
--- a/core/preconditioner/batch_jacobi.cpp
+++ b/core/preconditioner/batch_jacobi.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
+#include "ginkgo/core/preconditioner/batch_jacobi.hpp"
 
 
 #include "core/matrix/batch_csr_kernels.hpp"
diff --git a/core/stop/combined.cpp b/core/stop/combined.cpp
index 594b9214c08..d29d65f73bc 100644
--- a/core/stop/combined.cpp
+++ b/core/stop/combined.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/combined.hpp>
+#include "ginkgo/core/stop/combined.hpp"
 
 
 namespace gko {
diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp
index c907e4e03cd..02f04876f9f 100644
--- a/core/stop/criterion.cpp
+++ b/core/stop/criterion.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/criterion.hpp>
+#include "ginkgo/core/stop/criterion.hpp"
 
 
 #include "core/stop/criterion_kernels.hpp"
diff --git a/core/stop/iteration.cpp b/core/stop/iteration.cpp
index 9e54a2c6384..2f712865eda 100644
--- a/core/stop/iteration.cpp
+++ b/core/stop/iteration.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/iteration.hpp>
+#include "ginkgo/core/stop/iteration.hpp"
 
 
 namespace gko {
diff --git a/core/stop/time.cpp b/core/stop/time.cpp
index 5ff50c24b07..0481b9c91d3 100644
--- a/core/stop/time.cpp
+++ b/core/stop/time.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/time.hpp>
+#include "ginkgo/core/stop/time.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/version.cpp b/cuda/base/version.cpp
index d6e4b9b1068..0b95067a1c8 100644
--- a/cuda/base/version.cpp
+++ b/cuda/base/version.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index 58261c318fb..ff17a9ba8cd 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 namespace gko {
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 6954e31b24b..82001d667db 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
+#include "ginkgo/core/base/executor.hpp"
 
 
 namespace gko {
diff --git a/dpcpp/base/version.dp.cpp b/dpcpp/base/version.dp.cpp
index f53a6d2820c..6c6f9371d01 100644
--- a/dpcpp/base/version.dp.cpp
+++ b/dpcpp/base/version.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {
diff --git a/hip/base/version.hip.cpp b/hip/base/version.hip.cpp
index f2490fa691c..512e5ca6f1a 100644
--- a/hip/base/version.hip.cpp
+++ b/hip/base/version.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {
diff --git a/omp/base/version.cpp b/omp/base/version.cpp
index e96bfe5b0a4..dbca513323a 100644
--- a/omp/base/version.cpp
+++ b/omp/base/version.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {
diff --git a/reference/base/version.cpp b/reference/base/version.cpp
index 04e44ee1848..74697ff70ab 100644
--- a/reference/base/version.cpp
+++ b/reference/base/version.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
+#include "ginkgo/core/base/version.hpp"
 
 
 namespace gko {

From a20456bb7af148b675c6fd7f1d7d79c637a7fe22 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 16 Nov 2023 16:33:40 +0100
Subject: [PATCH 017/448] replace force-top with clang-format on/off

---
 core/test/log/logger.cpp                           | 4 ++--
 core/test/matrix/identity.cpp                      | 4 ++--
 cuda/test/base/scoped_device_id.cu                 | 4 ++--
 dpcpp/base/device_matrix_data_kernels.dp.cpp       | 4 ++--
 dpcpp/base/onedpl.hpp                              | 6 ++----
 dpcpp/distributed/partition_helpers_kernels.dp.cpp | 5 +++--
 dpcpp/distributed/partition_kernels.dp.cpp         | 5 +++--
 dpcpp/multigrid/pgm_kernels.dp.cpp                 | 5 +++--
 hip/factorization/par_ilut_select_common.hip.cpp   | 5 +++--
 hip/test/base/hip_executor.hip.cpp                 | 5 +++--
 hip/test/base/hip_executor_topology.hip.cpp        | 5 +++--
 hip/test/base/math.hip.cpp                         | 5 +++--
 hip/test/base/scoped_device_id.hip.cpp             | 5 +++--
 hip/test/components/cooperative_groups.hip.cpp     | 5 +++--
 hip/test/components/merging.hip.cpp                | 5 +++--
 hip/test/components/searching.hip.cpp              | 5 +++--
 16 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp
index 90330dbd1d0..18315442559 100644
--- a/core/test/log/logger.cpp
+++ b/core/test/log/logger.cpp
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+// clang-format off
 #include <ginkgo/core/base/types.hpp>
 GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
-// force-top: off
+// clang-format on
 
 
 #include <ginkgo/core/log/logger.hpp>
diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp
index 28e0b0682e1..69370df07c5 100644
--- a/core/test/matrix/identity.cpp
+++ b/core/test/matrix/identity.cpp
@@ -2,9 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+// clang-format off
 #include <ginkgo/core/base/types.hpp>
-// force-top: off
+// clang-format on
 
 
 #include <ginkgo/core/matrix/identity.hpp>
diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu
index 2b2eb58db49..4abd8f5810b 100644
--- a/cuda/test/base/scoped_device_id.cu
+++ b/cuda/test/base/scoped_device_id.cu
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <cuda_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <gtest/gtest.h>
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index 9779ba576fd..5869c853385 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+// clang-format off
 // oneDPL needs to be first to avoid issues with libstdc++ TBB impl
 #include <oneapi/dpl/algorithm>
-// force-top: off
+// clang-format on
 
 
 #include "core/base/device_matrix_data_kernels.hpp"
diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp
index 2f2f8ec3ab1..9dd5ba18976 100644
--- a/dpcpp/base/onedpl.hpp
+++ b/dpcpp/base/onedpl.hpp
@@ -5,11 +5,9 @@
 #ifndef GKO_DPCPP_BASE_ONEDPL_HPP_
 #define GKO_DPCPP_BASE_ONEDPL_HPP_
 
-
-// force-top: on
+// clang-format off
 #include <oneapi/dpl/execution>
-// force-top: off
-
+// clang-format on
 
 #include <ginkgo/core/base/executor.hpp>
 
diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
index 80eb073beee..8f85374c1d0 100644
--- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
@@ -2,11 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/execution>
 #include <oneapi/dpl/iterator>
-// force-top: off
+// clang-format on
 
 
 #include "core/distributed/partition_helpers_kernels.hpp"
diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp
index 04bc0ee7cdc..3d2c403e35d 100644
--- a/dpcpp/distributed/partition_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_kernels.dp.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/iterator>
-// force-top: off
+// clang-format on
 
 
 #include "core/distributed/partition_kernels.hpp"
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index 644755e363c..d25d44ed8e9 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // oneDPL needs to be first to avoid issues with libstdc++ TBB impl
 #include <oneapi/dpl/algorithm>
-// force-top: off
+// clang-format on
 
 
 #include "core/multigrid/pgm_kernels.hpp"
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index ddad307dc62..5486b3f5ba5 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include "common/cuda_hip/base/runtime.hpp"
-// force-top: off
+// clang-format on
 
 
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index cfdfc3122fd..266532823e7 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <ginkgo/core/base/executor.hpp>
diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp
index 7a94ae6ded2..10ebac1bbc6 100644
--- a/hip/test/base/hip_executor_topology.hip.cpp
+++ b/hip/test/base/hip_executor_topology.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <ginkgo/core/base/executor.hpp>
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 8462cbe5716..f018c634a6a 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <ginkgo/core/base/math.hpp>
diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp
index 991baa80e3a..78d51fc989d 100644
--- a/hip/test/base/scoped_device_id.hip.cpp
+++ b/hip/test/base/scoped_device_id.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <gtest/gtest.h>
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index 53f4b9a72a0..f99b4eb8a87 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include <cstring>
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index b8ee2f03d29..be18447a901 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include "hip/components/merging.hip.hpp"
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 2662d367f4d..252e8841893 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// force-top: on
+
+// clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
-// force-top: off
+// clang-format on
 
 
 #include "hip/components/searching.hip.hpp"

From 3a2d1012950fe0e8233e86df506b6909cf322e82 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 16 Nov 2023 16:37:09 +0000
Subject: [PATCH 018/448] remove format_header.sh

---
 .github/bot-pr-base.sh             |   2 -
 .github/bot-pr-format-base.sh      |   5 +-
 .pre-commit-config.yaml            |  13 -
 CMakeLists.txt                     |  20 --
 dev_tools/scripts/config           | 100 --------
 dev_tools/scripts/format_header.sh | 393 -----------------------------
 dev_tools/scripts/regroup          |  12 -
 7 files changed, 1 insertion(+), 544 deletions(-)
 delete mode 100644 dev_tools/scripts/config
 delete mode 100755 dev_tools/scripts/format_header.sh
 delete mode 100644 dev_tools/scripts/regroup

diff --git a/.github/bot-pr-base.sh b/.github/bot-pr-base.sh
index 697ecc7c848..61a86290db9 100644
--- a/.github/bot-pr-base.sh
+++ b/.github/bot-pr-base.sh
@@ -3,8 +3,6 @@
 source .github/bot-base.sh
 
 EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$'
-FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified|test)/'
-FORMAT_REGEX='^(common|examples)/'
 CLANG_FORMAT=clang-format-14
 
 echo -n "Collecting information on triggering PR"
diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh
index 7c08dd605a1..8667f5b9473 100644
--- a/.github/bot-pr-format-base.sh
+++ b/.github/bot-pr-format-base.sh
@@ -14,9 +14,7 @@ git config user.name "ginkgo-bot"
 
 # save scripts from develop
 cp .clang-format .pre-commit-config.yaml /tmp
-pushd dev_tools/scripts || exit 1
-cp format_header.sh update_ginkgo_header.sh /tmp
-popd || exit 1
+cp dev_tools/scripts/update_ginkgo_header.sh /tmp
 
 # checkout current PR head
 LOCAL_BRANCH=format-tmp-$HEAD_BRANCH
@@ -25,7 +23,6 @@ git checkout -b $LOCAL_BRANCH fork/$HEAD_BRANCH
 # restore files from develop
 cp /tmp/.clang-format .
 cp /tmp/.pre-commit-config.yaml .
-cp /tmp/format_header.sh dev_tools/scripts/
 cp /tmp/update_ginkgo_header.sh dev_tools/scripts/
 
 # make base pre-commit config available
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 236f2bdea7b..9814e8fd810 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,19 +25,6 @@ repos:
         examples/external-lib-interfacing/external-lib-interfacing.cpp|
         core/base/workspace_aliases.hpp
       )$
-  - id: format-headers
-    name: format headers
-    entry: env CLANG_FORMAT=dev_tools/scripts/clang-format.sh dev_tools/scripts/format_header.sh
-    require_serial: true
-    language: system
-    types_or: [c, c++, cuda]
-    exclude: |
-        (?x)^(
-          third_party/SuiteSparse/AMD/.*|
-          third_party/identify_stream_usage/.*|
-          include/ginkgo/ginkgo.hpp|
-          core/base/workspace_aliases.hpp
-        )$
   - id: update-ginkgo-header
     name: update ginkgo header
     entry: dev_tools/scripts/update_ginkgo_header.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6ab1dbd936..21832c98592 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -367,26 +367,6 @@ if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND" AND GINKGO_DEVEL_TOOLS)
     add_custom_target(generate_ginkgo_header ALL
         COMMAND ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/update_ginkgo_header.sh
         WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR})
-    find_program(GIT git)
-    if(NOT "${GIT}" STREQUAL "GIT-NOTFOUND")
-        add_custom_target(format_header
-            COMMAND echo "format header on the modified code files except build, examples, third_party, accessor/, dev_tools, ginkgo.hpp"
-            COMMAND bash -c "git diff --name-only origin/master...HEAD | \
-                grep -Ev 'build|examples|third_party|accessor/|dev_tools|ginkgo.hpp' | \
-                grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \
-                xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh"
-            WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
-            VERBATIM)
-    endif()
-    unset(GIT CACHE)
-    add_custom_target(format_header_all
-        COMMAND echo "format header on all code files except build, examples, third_party, accessor/, dev_tools, ginkgo.hpp"
-        COMMAND bash -c "find * -type f | \
-                grep -Ev 'build|examples|third_party|accessor/|dev_tools|ginkgo.hpp' | \
-                grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \
-                xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh"
-        WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}
-        VERBATIM)
 endif()
 unset(BASH CACHE)
 
diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config
deleted file mode 100644
index 79e6a227530..00000000000
--- a/dev_tools/scripts/config
+++ /dev/null
@@ -1,100 +0,0 @@
-- "test_install|benchmark"
-    - FixInclude: "ginkgo/ginkgo.hpp"
-- "executor"
-    - FixInclude: "ginkgo/core/base/executor.hpp"
-- "hip/base/config.hip.hpp"
-    - FixInclude: "hip/hip_runtime.h"
-- "hip/matrix/fft_kernels_stub"
-    - FixInclude: "core/matrix/fft_kernels.hpp"
-- "(cuda|hip|omp|dpcpp)/test/factorization/par_ilu_kernels"
-    - FixInclude: "core/factorization/par_ilu_kernels.hpp"
-- "(cuda|hip|omp|dpcpp)/test/factorization/par_ilut_kernels"
-    - FixInclude: "core/factorization/par_ilut_kernels.hpp"
-- "(cuda|hip|omp|dpcpp)/test/factorization/par_ict_kernels"
-    - FixInclude: "core/factorization/par_ict_kernels.hpp"
-- "(cuda|hip|omp|dpcpp)/test/factorization/par_ic_kernels"
-    - FixInclude: "core/factorization/par_ic_kernels.hpp"
-- "cuda/factorization/par_ilut_select_common"
-    - FixInclude: "cuda/factorization/par_ilut_select_common.cuh"
-- "hip/factorization/par_ilut_select_common"
-    - FixInclude: "hip/factorization/par_ilut_select_common.hip.hpp"
-- "(cuda|hip|dpcpp)/factorization/par_ilut_"
-    - FixInclude: "core/factorization/par_ilut_kernels.hpp"
-- "(cuda|hip|dpcpp)/factorization/par_ict_"
-    - FixInclude: "core/factorization/par_ict_kernels.hpp"
-- "(cuda|hip|dpcpp)/preconditioner/jacobi_"
-    - FixInclude: "core/preconditioner/jacobi_kernels.hpp"
-- "(cuda|hip|dpcpp|omp)/base/kernel_launch\."
-    - FixInclude: "common/unified/base/kernel_launch.hpp"
-- "(cuda|hip|dpcpp|omp)/test/base/kernel_launch\."
-    - FixInclude: "common/unified/base/kernel_launch.hpp"
-- "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\."
-    - FixInclude: "common/unified/base/kernel_launch_solver.hpp"
-- "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\."
-    - FixInclude: "common/unified/base/kernel_launch_solver.hpp"
-- "test/base/kernel_launch_generic.cpp"
-    - FixInclude: "common/unified/base/kernel_launch.hpp"
-- "^test/solver/(lower|upper)_trs_kernels.cpp"
-    - CoreSuffix: "_kernels"
-    - PathPrefix: "ginkgo/core"
-    - PathIgnore: "0"
-    - RemoveTest: "true"
-- "^test/matrix/csr_kernels2.cpp"
-    - CoreSuffix: "_kernels2"
-    - PathPrefix: "ginkgo/core"
-    - PathIgnore: "0"
-    - RemoveTest: "true"
-- "elimination_forest\.cpp"
-    - FixInclude: "core/factorization/elimination_forest.hpp"
-- "symbolic\.cpp"
-    - FixInclude: "core/factorization/symbolic.hpp"
-- "common/unified/.*.cpp"
-    - PathIgnore: "2"
-    - PathPrefix: "core"
-    - CoreSuffix: "\.template"
-- "core/test/base/(extended_float|iterator_factory)"
-    - RemoveTest: "true"
-- "core/test/base/allocator"
-    - FixInclude: "core/base/allocator.hpp"
-- "core/test/utils/matrix_utils_test"
-    - FixInclude: "core/utils/matrix_utils.hpp"
-- "reference/test/base/utils"
-    - FixInclude: "core/base/utils.hpp"
-- "_builder\.cpp"
-    - RemoveTest: "true"
-- "_builder\.hpp"
-    - CoreSuffix: "_builder"
-- "dpcpp/test/base/dim3\.dp\.cpp"
-    - FixInclude: "dpcpp/base/dim3.dp.hpp"
-- "test/base/kernel_launch"
-    - RemoveTest: "true"
-    - PathIgnore: "1"
-    - PathPrefix: "(cuda|hip|omp|dpcpp)"
-- "(cuda|hip|omp|dpcpp|reference)/base/.*_kernels"
-    - RemoveTest: "true"
-    - PathIgnore: "1"
-    - PathPrefix: "core"
-- "/components/.*_kernels"
-    - RemoveTest: "true"
-    - PathIgnore: "1"
-    - PathPrefix: "core"
-- "/components/"
-    - RemoveTest: "true"
-- "test/utils"
-    - CoreSuffix: "_test"
-    - PathIgnore: "1"
-    - PathPrefix: "core"
-- "core\/.*"
-    - CoreSuffix: "_kernels"
-    - PathPrefix: "ginkgo"
-    - PathIgnore: "0"
-    - RemoveTest: "true"
-- "/(test|base)/"
-    - CoreSuffix: "_kernels"
-    - PathPrefix: "ginkgo/core"
-    - PathIgnore: "1"
-    - RemoveTest: "true"
-- ".*"
-    - PathPrefix: "core"
-    - PathIgnore: "1"
-    - CoreSuffix: "\.template"
diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh
deleted file mode 100755
index e7d51080b86..00000000000
--- a/dev_tools/scripts/format_header.sh
+++ /dev/null
@@ -1,393 +0,0 @@
-#!/usr/bin/env bash
-
-CLANG_FORMAT=${CLANG_FORMAT:="clang-format"}
-
-convert_header () {
-    local regex="^(#include )(<|\")(.*)(\"|>)$"
-    local jacobi_regex="^(cuda|hip|dpcpp)\/preconditioner\/jacobi_common(\.hip)?\.hpp"
-    if [[ $@ =~ ${regex} ]]; then
-        header_file="${BASH_REMATCH[3]}"
-        if [ -f "${header_file}" ]; then
-            if [[ "${header_file}" =~ ^ginkgo ]]; then
-                echo "#include <${header_file}>"
-            else
-                echo "#include \"${header_file}\""
-            fi
-        elif [ "${header_file}" = "matrices/config.hpp" ]; then
-            echo "#include \"${header_file}\""
-        elif [ "${header_file}" = "extensions/test/config/file_location.hpp" ]; then
-            echo "#include \"${header_file}\""
-	    elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then
-            echo "#include \"${header_file}\""
-        else
-            echo "#include <${header_file}>"
-        fi
-    else
-        echo "$@"
-    fi
-}
-
-get_header_def () {
-    local regex="\.(hpp|cuh)"
-    if [[ $@ =~ $regex ]]; then
-        local def=$(echo "$@" | sed -E "s~include/ginkgo/~PUBLIC_~g;s~/|\.~_~g")
-	# Used to get rid of \r in Windows
-        def=$(echo "GKO_${def^^}_")
-        echo "$def"
-    else
-        echo ""
-    fi
-}
-
-add_regroup () {
-    cp .clang-format .clang-format.temp
-    sed -i "s~\.\.\.~~g" .clang-format
-    cat dev_tools/scripts/regroup >> .clang-format
-    echo "..." >> .clang-format
-}
-
-remove_regroup () {
-    mv .clang-format.temp .clang-format
-}
-
-# It reads "dev_tools/scripts/config" to generate the corresponding main header
-# The setting setting:
-# - "file_regex"
-#   - CoreSuffix: "core_suffix_regex"           (default "")
-#   - PathPrefix: "path_prefix_regex"           (default "")
-#   - PathIgnore: "path_ignore_number"          (default "0")
-#   - RemoveTest: "false/true"                  (default "false")
-#   - FixInclude: "the specific main header"    (default "")
-# Only "file_regex" without any setting is fine, and it means find the same name with header suffix
-# For example, /path/to/file.cpp will change to /path/to/file.hpp
-# file_regex : selecting which file apply this rule
-# CoreSuffix : remove the pattern which passes the "core_suffix_regex" of file
-# PathPrefix : adds "path_prefix_regex" before path, and the position depends on PathIgnore
-# PathIgnore : ignore the number "path_ignore_number" folder from top level, and then add "path_prefix_regex" into path
-# RemoveTest : Decide whether ignore /test/ in the path
-# FixInclude : Specify the main header. If it is set, ignore others setting
-# Note: This script picks the first fitting "file_regex" rules according the ordering in config
-get_include_regex () {
-    local file="$1"
-    declare -n local_output=$2
-    local core_suffix=""
-    local path_prefix=""
-    local path_ignore="0"
-    local fix_include=""
-    local remove_test="false"
-    local item_regex="^-\ +\"(.*)\""
-    local path_prefix_regex="PathPrefix:\ +\"(.*)\""
-    local core_suffix_regex="CoreSuffix:\ +\"(.*)\""
-    local path_ignore_regex="PathIgnore:\ +\"(.*)\""
-    local fix_include_regex="FixInclude:\ +\"(.*)\""
-    local remove_test_regex="RemoveTest:\ +\"(.*)\""
-    local match="false"
-    while IFS='' read -r line; do
-        if [[ "$line" =~ $item_regex ]]; then
-            file_regex="${BASH_REMATCH[1]}"
-            if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                echo "DEBUG: Checking pattern $line"
-            fi
-            if [[ "$match" = "true" ]]; then
-                break
-            elif [[ $file =~ $file_regex ]]; then
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Matching pattern $line for $file"
-                fi
-                match="true"
-            fi
-        elif [ "$match" = "true" ]; then
-            if [[ "$line" =~ $path_prefix_regex ]]; then
-                path_prefix="${BASH_REMATCH[1]}"
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Path prefix set to $path_prefix"
-                fi
-            elif [[ "$line" =~ $core_suffix_regex ]]; then
-                core_suffix="${BASH_REMATCH[1]}"
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Core suffix set to $core_suffix"
-                fi
-            elif [[ "$line" =~ $path_ignore_regex ]]; then
-                path_ignore="${BASH_REMATCH[1]}"
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Ignoring $path_ignore top-level dirs"
-                fi
-            elif [[ "$line" =~ $fix_include_regex ]]; then
-                fix_include="${BASH_REMATCH[1]}"
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Fixed include $fix_include"
-                fi
-            elif [[ "$line" =~ $remove_test_regex ]]; then
-                remove_test="${BASH_REMATCH[1]}"
-                if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                    echo "DEBUG: Remove test $remove_test"
-                fi
-            else
-                echo "Ignore unknow setting: \"${file_regex}\" - ${line}"
-            fi
-        fi
-    done < "dev_tools/scripts/config"
-    local_output=""
-    if [ -z "${fix_include}" ]; then
-        local path_regex="([a-zA-Z_]*\/){${path_ignore}}(.*)\.(cpp|hpp|cu|cuh)"
-        if [ ! -z "${path_prefix}" ]; then
-            path_prefix="${path_prefix}/"
-        fi
-        if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-            echo "DEBUG: Handling $file"
-        fi
-        local_output=$(echo "${file}" | sed -E "s~\.(hip|dp)~~g;s~$path_regex~$path_prefix\2~g")
-        if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-            echo "DEBUG: After removing path_ignore and path_prefix: $local_output"
-        fi
-        local_output=$(echo "${local_output}" | sed -E "s~$core_suffix$~~g")
-        if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-            echo "DEBUG: After removing core_suffix: $local_output"
-        fi
-        local_output="#include (<|\")$local_output\.(hpp|hip\.hpp|dp\.hpp|cuh)(\"|>)"
-        if [ "${remove_test}" = "true" ]; then
-            local_output=$(echo "${local_output}" | sed -E "s~test/~~g")
-            if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-                echo "DEBUG: After removing test: ${local_output}"
-            fi
-        fi
-    else
-        if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then
-            echo "DEBUG: Fixing include $fix_include to the top"
-        fi
-        local_output="#include (<|\")$fix_include(\"|>)"
-    fi
-}
-
-# Test if required commands are present on the system:
-if ! command -v "$CLANG_FORMAT" &> /dev/null; then
-    echo "The command 'clang-format' is required for this script to work, but not supported by your system. It can be set via environment parameter CLANG_FORMAT=<clang-format path>" 1>&2
-    exit 1
-fi
-
-# Test the command on MacOS
-if ! declare -n &> /dev/null; then
-    echo "The command 'declare' needs to support the '-n' option. Please update bash or use 'brew install bash' if on MacOS" 1>&2
-    exit 1
-fi
-
-touch .dummy_file
-if ! sed -i 's///g' .dummy_file &> /dev/null; then
-    echo "The command 'sed' needs to support the '-i' option without suffix. Please use gnu sed or use 'brew install gnu-sed' if on MacOS" 1>&2
-    rm .dummy_file
-    exit 1
-fi
-
-if ! head -n -1 .dummy_file &> /dev/null; then
-    echo "The command 'head' needs to support '-NUM' option, Please use gnu head or use 'brew install coreutils' if on MacOS" 1>&2
-    rm .dummy_file
-    exit 1
-fi
-rm .dummy_file
-
-for current_file in $@; do
-    if [ -z "${current_file}" ]; then
-        echo "Usage: $0 path/to/fileA path/to/fileB ..."
-        exit 1
-    fi
-
-    if [ ! -f "${current_file}" ]; then
-        echo "${current_file} does not exist or it is not a file."
-        exit 1
-    fi
-
-    GINKGO_LICENSE_BEGIN="// SPDX-FileCopyrightText:"
-    GINKGO_LICENSE_END="// SPDX-License-Identifier:"
-
-    CONTENT="content.cpp" # Store the residual part (start from namespace)
-    BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file
-    HAS_HIP_RUNTIME="false"
-    DURING_LICENSE="false"
-    INCLUDE_REGEX="^#include.*"
-    INCLUDE_INC="\.inc"
-    MAIN_PART_MATCH=""
-
-    # FORCE_TOP_ON/OFF is only valid before other #include
-    FORCE_TOP_ON="// force-top: on"
-    FORCE_TOP_OFF="// force-top: off"
-    FORCE_TOP="force_top"
-    DURING_FORCE_TOP="false"
-
-    get_include_regex "${current_file}" MAIN_PART_MATCH
-    HEADER_DEF=$(get_header_def "${current_file}")
-
-    IFNDEF=""
-    DEFINE=""
-    IFNDEF_REGEX="^#ifndef GKO_"
-    DEFINE_REGEX="^#define GKO_"
-    HEADER_REGEX="\.(hpp|cuh)"
-    SKIP="true"
-    START_BLOCK_REX="^(#if| *\/\*)"
-    END_BLOCK_REX="^#endif|\*\/$"
-    ENDIF_REX="^#endif"
-    IN_BLOCK=0
-    KEEP_LINES=0
-    LAST_NONEMPTY=""
-    ALARM=""
-    COMMENT_REGEX="^ *\/\/"
-    CONSIDER_REGEX="${START_BLOCK_REX}|${END_BLOCK_REX}|${COMMENT_REGEX}|${INCLUDE_REGEX}"
-
-    # This part capture the main header and give the possible fail arrangement information
-    while IFS='' read -r line || [ -n "$line" ]; do
-        if [[ "${line}" =~ ${GINKGO_LICENSE_BEGIN}  ]] || [ "${DURING_LICENSE}" = "true" ]; then
-            DURING_LICENSE="true"
-            if [[ "${line}" =~ ${GINKGO_LICENSE_END} ]]; then
-                DURING_LICENSE="false"
-                SKIP="true"
-            fi
-        elif [ "${SKIP}" = "true" ] && ([ "$line" = "${FORCE_TOP_ON}" ] || [ "${DURING_FORCE_TOP}" = "true" ]); then
-            DURING_FORCE_TOP="true"
-            if [ "$line" = "${FORCE_TOP_OFF}" ]; then
-                DURING_FORCE_TOP="false"
-            fi
-            if [[ "${line}" =~ $INCLUDE_REGEX ]]; then
-                line="$(convert_header "${line}")"
-            fi
-            echo "$line" >> "${FORCE_TOP}"
-        elif [ -z "${line}" ] && [ "${SKIP}" = "true" ]; then
-        # Ignore all empty lines between LICENSE and Header
-            :
-        else
-            if [[ "${line}" =~ $INCLUDE_REGEX ]]; then
-                line="$(convert_header "${line}")"
-            fi
-            if [ -z "${line}" ]; then
-                KEEP_LINES=$((KEEP_LINES+1))
-            else
-                LAST_NONEMPTY="${line}"
-                KEEP_LINES=0
-            fi
-            if [[ "${current_file}" =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${IFNDEF_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -z "${DEFINE}" ]; then
-                IFNDEF="${line}"
-            elif [[ "${current_file}" =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${DEFINE_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -n "${IFNDEF}" ]; then
-                DEFINE="${line}"
-            elif [ -z "${MAIN_PART_MATCH}" ] || [[ ! "${line}" =~ ${MAIN_PART_MATCH} ]] || [[ "${IN_BLOCK}" -gt 0 ]]; then
-                echo "${line}" >> "${CONTENT}"
-                SKIP="false"
-                if [[ "${line}" =~ $START_BLOCK_REX ]]; then
-                    # keep everything in #if block and /* block
-                    IN_BLOCK=$((IN_BLOCK+1))
-                    if [ -z "${ALARM}" ]; then
-                        ALARM="set"
-                    fi
-                fi
-                if [[ "${IN_BLOCK}" = "0" ]] && [ -n "${line}" ] && [[ ! "${line}" =~ ${CONSIDER_REGEX} ]]; then
-                    if [ "${ALARM}" = "set" ]; then
-                        ALARM="true"
-                    elif [ -z "${ALARM}" ]; then
-                        ALARM="false"
-                    fi
-                fi
-                if [[ "${line}" =~ $END_BLOCK_REX ]]; then
-                    IN_BLOCK=$((IN_BLOCK-1))
-                fi
-            else
-                echo "${line}" >> ${BEFORE}
-            fi
-        fi
-    done < "${current_file}"
-    if [ "${ALARM}" = "true" ]; then
-        echo "Warning ${current_file}: sorting is probably incorrect"
-    fi
-
-    # Write license
-    CURRENT_YEAR=$(date +%Y)
-    echo "${GINKGO_LICENSE_BEGIN} 2017 - ${CURRENT_YEAR} The Ginkgo authors" > "${current_file}"
-    echo "//" >> "${current_file}"
-    echo "${GINKGO_LICENSE_END} BSD-3-Clause" >> "${current_file}"
-    echo "" >> "${current_file}"
-
-    # Write the definition of header according to path
-    if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then
-        IFNDEF="#ifndef ${HEADER_DEF}"
-        DEFINE="#define ${HEADER_DEF}"
-    elif [ -z "${IFNDEF}" ] && [ -z "${DEFINE}" ]; then
-        :
-    else
-        echo "Warning ${current_file}: only #ifndef GKO_ or #define GKO_ is in the header"
-    fi
-    if [ -n "${IFNDEF}" ]; then
-        echo "${IFNDEF}" >> "${current_file}"
-    fi
-    if [ -n "${DEFINE}" ]; then
-        echo "${DEFINE}" >> "${current_file}"
-        echo "" >> "${current_file}"
-        echo "" >> "${current_file}"
-    fi
-
-    # Write the force-top header
-    if [ -f "${FORCE_TOP}" ]; then
-        cat "${FORCE_TOP}" >> "${current_file}"
-        echo "" >> "${current_file}"
-        echo "" >> "${current_file}"
-        rm "${FORCE_TOP}"
-    fi
-
-    # Write the main header and give warnning if there are multiple matches
-    if [ -f "${BEFORE}" ]; then
-        # sort or remove the duplication
-        "${CLANG_FORMAT}" -i -style=file ${BEFORE}
-        if [ "$(wc -l < ${BEFORE})" -gt "1" ]; then
-            echo "Warning ${current_file}: there are multiple main header matchings"
-        fi
-        cat ${BEFORE} >> "${current_file}"
-        if [ -f "${CONTENT}" ]; then
-            echo "" >> "${current_file}"
-            echo "" >> "${current_file}"
-        fi
-        rm "${BEFORE}"
-    fi
-
-    # Arrange the remain files and give
-    if [ -f "${CONTENT}" ]; then
-        add_regroup
-        head -n -${KEEP_LINES} ${CONTENT} >> temp
-        if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then
-            # Ignore the last line #endif
-            if [[ "${LAST_NONEMPTY}" =~ $ENDIF_REX ]]; then
-                head -n -1 temp > ${CONTENT}
-                echo "#endif  // $HEADER_DEF" >> ${CONTENT}
-            else
-                echo "Warning ${current_file}: Found the begin header_def but did not find the end of header_def"
-                cat temp > ${CONTENT}
-            fi
-        else
-            cat temp > "${CONTENT}"
-        fi
-        "${CLANG_FORMAT}" -i -style=file "${CONTENT}"
-        rm temp
-        remove_regroup
-        PREV_INC=0
-        IN_IF="false"
-        SKIP="true"
-        while IFS='' read -r line; do
-            # Skip the empty line in the beginning
-            if [ "${SKIP}" = "true" ] && [[ -z "${line}" ]]; then
-                continue
-            else
-                SKIP="false"
-            fi
-            # Insert content with correct number empty lines
-            if [[ ${line} =~ ${INCLUDE_REGEX} ]] && [[ ! ${line} =~ ${INCLUDE_INC} ]]; then
-                if [[ ${PREV_INC} == 1 ]]; then
-                    echo "" >> "${current_file}"
-                fi
-                PREV_INC=0
-            else
-                if [ -z "${line}" ]; then
-                    PREV_INC=$((PREV_INC+1))
-                else
-                    # To keep the original lines
-                    PREV_INC=-3
-                fi
-            fi
-            echo "${line}" >> "${current_file}"
-        done < "${CONTENT}"
-        rm "${CONTENT}"
-    fi
-done
diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup
deleted file mode 100644
index b10570f4982..00000000000
--- a/dev_tools/scripts/regroup
+++ /dev/null
@@ -1,12 +0,0 @@
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^<(nlohmann|gflags|gtest|papi).*'
-    Priority: 3
-  - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi|nvToolsExt|Kokkos_Core).*'
-    Priority: 2
-  - Regex: '^<ginkgo.*'
-    Priority: 5
-  - Regex: '^".*'
-    Priority: 6
-  - Regex: '.*'
-    Priority: 1

From 6975bf27174e1a98191e832fbbfcaf067a7016f7 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 17 Nov 2023 11:30:09 +0000
Subject: [PATCH 019/448] remove formatting from convert_source.sh

---
 dev_tools/oneapi/convert_source.sh | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh
index 7aaecf2e78d..090c31e8ccb 100755
--- a/dev_tools/oneapi/convert_source.sh
+++ b/dev_tools/oneapi/convert_source.sh
@@ -17,7 +17,6 @@
 #                    CMake's step is not required if copying the ginkgo config.hpp from another ginkgo build into "${ROOT_DIR}/include/ginkgo/".
 #   ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}"
 #   GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"
-#   CLANG_FORMAT: the clang-format exec. The default is "clang-format"
 #   VERBOSE: if it is set as 1, script will output the path information
 CURRENT_DIR="$( pwd )"
 cd "$( dirname "${BASH_SOURCE[0]}" )"
@@ -30,7 +29,6 @@ BUILD_DIR="${BUILD_DIR:="build"}"
 ROOT_BUILD_DIR="${ROOT_BUILD_DIR:="${ROOT_DIR}/${BUILD_DIR}"}"
 CUDA_HEADER_DIR="${CUDA_HEADER_DIR}"
 GTEST_HEADER_DIR="${GTEST_HEADER_DIR:="${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"}"
-CLANG_FORMAT=${CLANG_FORMAT:="clang-format"}
 if [[ "${VERBOSE}" == 1 ]]; then
     echo "#####################"
     echo "# Environment Setting:"
@@ -40,7 +38,6 @@ if [[ "${VERBOSE}" == 1 ]]; then
     echo "ROOT_BUILD_DIR ${ROOT_BUILD_DIR}"
     echo "GTEST_HEADER_DIR ${GTEST_HEADER_DIR}"
     echo "CUDA_HEADER_DIR ${CUDA_HEADER_DIR}"
-    echo "CLANG_FORMAT ${CLANG_FORMAT}"
     echo "#####################"
 fi
 if [[ "${CUDA_HEADER_DIR}" == "" ]]; then
@@ -166,9 +163,8 @@ if [[ "${VERBOSE}" == 1 ]]; then
 fi
 rm "${OUTPUT_FILE}"
 echo "#define GET_QUEUE 0" >> "${OUTPUT_FILE}"
-# add empty ginkgo license such that format_header recognize some header before header def macro
 CURRENT_YEAR=$(date +%Y)
-echo "${GINKGO_LICENSE_BEGIN} 2017-${CURRENT_YEAR} The Ginkgo authors" >> "${OUTPUT_FILE}"
+echo "${GINKGO_LICENSE_BEGIN} ${CURRENT_YEAR} The Ginkgo authors" >> "${OUTPUT_FILE}"
 echo "//" >> "${OUTPUT_FILE}"
 echo "${GINKGO_LICENSE_END} BSD-3-Clause" >> "${OUTPUT_FILE}"
 rm "${GLOBAL_FILE}"
@@ -191,9 +187,6 @@ while IFS='' read -r line; do
     fi
 done < "${UNFORMAT_FILE}"
 
-# Call clang-format for better formatting.
-${CLANG_FORMAT} -style=file "${EMBED_FILE}" > "${FORMAT_FILE}"
-
 # Add an extra host function so that the converted DPC++ code will look like CUDA.
 "${SCRIPT_DIR}/add_host_function.sh" "${FORMAT_FILE}" > "${EMBED_HOST_FILE}"
 

From a1ec48e86b1533fbcbea8b98329566aa31f175e2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 14 Jun 2024 08:51:57 +0000
Subject: [PATCH 020/448] use clang-format to format includes

Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 .clang-format           | 25 ++++++++++++++++++++++++-
 .pre-commit-config.yaml |  3 ++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/.clang-format b/.clang-format
index 3e1d9335acf..acd1e4321c2 100644
--- a/.clang-format
+++ b/.clang-format
@@ -58,7 +58,30 @@ ForEachMacros:
   - foreach
   - Q_FOREACH
   - BOOST_FOREACH
-IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex:    '^<oneapi/dpl.*'  # needs to be on top
+    Priority: -2
+  - Regex:    '<[^.]+>'  # standard library
+    Priority: 1
+  - Regex:    '(^<(hip/hip_runtime|cuda(_runtime)?)\.h)|common/cuda_hip/base/runtime\.hpp$'
+    Priority: 2
+    SortPriority: 2
+  - Regex:    '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt).*'
+    Priority: 2
+    SortPriority: 3
+  - Regex:    '^<(nlohmann|gflags|gtest|sde_lib|papi).*'
+    Priority: 4
+  - Regex:    '<ginkgo/ginkgo.hpp>'
+    Priority: 6
+  - Regex:    '^<ginkgo/.*'
+    Priority: 7
+  - Regex:    '^<.*'  # other library includes
+    Priority: 5
+  - Regex:    '^.*'
+    Priority: 8
+IncludeIsMainRegex: '(_(stub|kernels|kernels2|test))?$'
+IncludeIsMainSourceRegex: '\.cu$|_kernels\.hpp$|\.dp\.cpp$'
 IndentCaseLabels: false
 IndentWidth:     4
 IndentWrappedFunctionNames: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9814e8fd810..fca3a1ef28f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,8 @@ repos:
     exclude: |
       (?x)^(
         third_party/SuiteSparse/AMD/.*|
-        third_party/identify_stream_usage/.*
+        third_party/identify_stream_usage/.*|
+        include/ginkgo/ginkgo.hpp
       )
 - repo: local
   hooks:

From 0b463905e82c65995689667f60348c880adeffe7 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:34:01 +0200
Subject: [PATCH 021/448] remove force-top where made unnecessary

---
 cuda/test/base/scoped_device_id.cu                 | 3 ---
 dpcpp/base/device_matrix_data_kernels.dp.cpp       | 4 +---
 dpcpp/base/onedpl.hpp                              | 4 ++--
 dpcpp/distributed/partition_helpers_kernels.dp.cpp | 2 --
 dpcpp/distributed/partition_kernels.dp.cpp         | 2 --
 dpcpp/multigrid/pgm_kernels.dp.cpp                 | 3 ---
 6 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu
index 4abd8f5810b..5c2e496b64b 100644
--- a/cuda/test/base/scoped_device_id.cu
+++ b/cuda/test/base/scoped_device_id.cu
@@ -2,10 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// clang-format off
-// prevent compilation failure related to disappearing assert(...) statements
 #include <cuda_runtime.h>
-// clang-format on
 
 
 #include <gtest/gtest.h>
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index 5869c853385..a735470d5ba 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -2,10 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-// clang-format off
-// oneDPL needs to be first to avoid issues with libstdc++ TBB impl
+
 #include <oneapi/dpl/algorithm>
-// clang-format on
 
 
 #include "core/base/device_matrix_data_kernels.hpp"
diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp
index 9dd5ba18976..8ea971f4602 100644
--- a/dpcpp/base/onedpl.hpp
+++ b/dpcpp/base/onedpl.hpp
@@ -5,9 +5,9 @@
 #ifndef GKO_DPCPP_BASE_ONEDPL_HPP_
 #define GKO_DPCPP_BASE_ONEDPL_HPP_
 
-// clang-format off
+
 #include <oneapi/dpl/execution>
-// clang-format on
+
 
 #include <ginkgo/core/base/executor.hpp>
 
diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
index 8f85374c1d0..c7a94baad54 100644
--- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
@@ -3,11 +3,9 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 
-// clang-format off
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/execution>
 #include <oneapi/dpl/iterator>
-// clang-format on
 
 
 #include "core/distributed/partition_helpers_kernels.hpp"
diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp
index 3d2c403e35d..5eeb2f85178 100644
--- a/dpcpp/distributed/partition_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_kernels.dp.cpp
@@ -3,10 +3,8 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 
-// clang-format off
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/iterator>
-// clang-format off
 
 
 #include "core/distributed/partition_kernels.hpp"
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index d25d44ed8e9..3241c8b1ed1 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -3,10 +3,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 
-// clang-format off
-// oneDPL needs to be first to avoid issues with libstdc++ TBB impl
 #include <oneapi/dpl/algorithm>
-// clang-format on
 
 
 #include "core/multigrid/pgm_kernels.hpp"

From 65c7d504cf521d84ae35ca3d7aa08faebc5c5fd0 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:47:27 +0200
Subject: [PATCH 022/448] fixup! use clang-format to format includes

---
 .clang-format | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.clang-format b/.clang-format
index acd1e4321c2..8bb6ededfdb 100644
--- a/.clang-format
+++ b/.clang-format
@@ -67,7 +67,7 @@ IncludeCategories:
   - Regex:    '(^<(hip/hip_runtime|cuda(_runtime)?)\.h)|common/cuda_hip/base/runtime\.hpp$'
     Priority: 2
     SortPriority: 2
-  - Regex:    '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt).*'
+  - Regex:    '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt|Kokkos).*'
     Priority: 2
     SortPriority: 3
   - Regex:    '^<(nlohmann|gflags|gtest|sde_lib|papi).*'

From b9545ddf0d1805331f30e00f79a75a108a86a62e Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:48:01 +0200
Subject: [PATCH 023/448] automatically update header ordering with
 clang-format

---
 accessor/accessor_helper.hpp                             | 1 -
 accessor/block_col_major.hpp                             | 1 -
 accessor/cuda_helper.hpp                                 | 2 --
 accessor/hip_helper.hpp                                  | 2 --
 accessor/math.hpp                                        | 1 -
 accessor/range.hpp                                       | 1 -
 accessor/reduced_row_major.hpp                           | 1 -
 accessor/reduced_row_major_reference.hpp                 | 1 -
 accessor/reference_helper.hpp                            | 1 -
 accessor/row_major.hpp                                   | 1 -
 accessor/scaled_reduced_row_major.hpp                    | 1 -
 accessor/scaled_reduced_row_major_reference.hpp          | 1 -
 benchmark/blas/blas.cpp                                  | 4 +---
 benchmark/blas/blas_common.hpp                           | 4 +---
 benchmark/blas/distributed/multi_vector.cpp              | 5 ++---
 benchmark/conversion/conversion.cpp                      | 4 +---
 benchmark/matrix_generator/matrix_generator.cpp          | 4 +---
 benchmark/matrix_statistics/matrix_statistics.cpp        | 5 +----
 benchmark/preconditioner/preconditioner.cpp              | 4 +---
 benchmark/solver/distributed/solver.cpp                  | 5 ++---
 benchmark/solver/solver.cpp                              | 4 +---
 benchmark/sparse_blas/operations.cpp                     | 5 ++---
 benchmark/sparse_blas/operations.hpp                     | 4 +---
 benchmark/sparse_blas/sparse_blas.cpp                    | 4 +---
 benchmark/spmv/distributed/spmv.cpp                      | 5 ++---
 benchmark/spmv/spmv.cpp                                  | 4 +---
 benchmark/tools/matrix.cpp                               | 2 --
 benchmark/tools/mtx_to_binary.cpp                        | 1 -
 benchmark/utils/cuda_linops.cpp                          | 5 +----
 benchmark/utils/cuda_timer.cpp                           | 1 -
 benchmark/utils/dpcpp_linops.dp.cpp                      | 5 +----
 benchmark/utils/dpcpp_timer.dp.cpp                       | 2 --
 benchmark/utils/formats.hpp                              | 5 +----
 benchmark/utils/general.hpp                              | 6 +-----
 benchmark/utils/general_matrix.hpp                       | 4 +---
 benchmark/utils/generator.hpp                            | 1 -
 benchmark/utils/hip_linops.hip.cpp                       | 4 +---
 benchmark/utils/hip_timer.hip.cpp                        | 1 -
 benchmark/utils/iteration_control.hpp                    | 4 +---
 benchmark/utils/loggers.hpp                              | 4 +---
 benchmark/utils/mpi_timer.cpp                            | 1 -
 benchmark/utils/overhead_linop.hpp                       | 2 --
 benchmark/utils/preconditioners.hpp                      | 5 +----
 benchmark/utils/runner.hpp                               | 4 +---
 benchmark/utils/sparselib_linops.hpp                     | 1 -
 benchmark/utils/timer.hpp                                | 5 +----
 benchmark/utils/timer_impl.hpp                           | 5 ++---
 benchmark/utils/tuning_variables.cpp                     | 5 ++---
 benchmark/utils/types.hpp                                | 1 -
 cmake/openmpi_test.cpp                                   | 1 -
 common/cuda_hip/base/thrust.hpp                          | 1 -
 common/unified/base/device_matrix_data_kernels.cpp       | 2 --
 common/unified/base/index_set_kernels.cpp                | 2 --
 common/unified/base/kernel_launch.hpp                    | 1 -
 common/unified/components/absolute_array_kernels.cpp     | 1 -
 common/unified/components/fill_array_kernels.cpp         | 1 -
 common/unified/components/format_conversion_kernels.cpp  | 2 --
 .../unified/components/precision_conversion_kernels.cpp  | 1 -
 common/unified/components/reduce_array_kernels.cpp       | 2 --
 common/unified/distributed/partition_helpers_kernels.cpp | 1 -
 common/unified/distributed/partition_kernels.cpp         | 1 -
 common/unified/matrix/coo_kernels.cpp                    | 2 --
 common/unified/matrix/csr_kernels.cpp                    | 3 ---
 common/unified/matrix/dense_kernels.template.cpp         | 2 --
 common/unified/matrix/diagonal_kernels.cpp               | 2 --
 common/unified/matrix/ell_kernels.cpp                    | 2 --
 common/unified/matrix/hybrid_kernels.cpp                 | 1 -
 common/unified/matrix/permutation_kernels.cpp            | 2 --
 common/unified/matrix/scaled_permutation_kernels.cpp     | 2 --
 common/unified/matrix/sellp_kernels.cpp                  | 2 --
 common/unified/matrix/sparsity_csr_kernels.cpp           | 2 --
 common/unified/multigrid/pgm_kernels.cpp                 | 2 --
 common/unified/preconditioner/jacobi_kernels.cpp         | 2 --
 common/unified/solver/bicg_kernels.cpp                   | 2 --
 common/unified/solver/bicgstab_kernels.cpp               | 2 --
 common/unified/solver/cg_kernels.cpp                     | 2 --
 common/unified/solver/cgs_kernels.cpp                    | 2 --
 common/unified/solver/common_gmres_kernels.cpp           | 2 --
 common/unified/solver/fcg_kernels.cpp                    | 2 --
 common/unified/solver/gcr_kernels.cpp                    | 2 --
 common/unified/solver/gmres_kernels.cpp                  | 2 --
 common/unified/solver/ir_kernels.cpp                     | 1 -
 core/base/allocator.hpp                                  | 1 -
 core/base/array.cpp                                      | 2 --
 core/base/batch_multi_vector.cpp                         | 3 ---
 core/base/batch_multi_vector_kernels.hpp                 | 3 ---
 core/base/batch_utilities.hpp                            | 1 -
 core/base/block_operator.cpp                             | 3 ---
 core/base/combination.cpp                                | 1 -
 core/base/composition.cpp                                | 3 ---
 core/base/dense_cache.cpp                                | 1 -
 core/base/device_matrix_data.cpp                         | 2 --
 core/base/device_matrix_data_kernels.hpp                 | 6 +-----
 core/base/dispatch_helper.hpp                            | 1 -
 core/base/executor.cpp                                   | 1 -
 core/base/extended_float.hpp                             | 1 -
 core/base/index_range.hpp                                | 1 -
 core/base/index_set.cpp                                  | 3 ---
 core/base/index_set_kernels.hpp                          | 4 +---
 core/base/iterator_factory.hpp                           | 1 -
 core/base/memory.cpp                                     | 2 --
 core/base/mpi.cpp                                        | 1 -
 core/base/mtx_io.cpp                                     | 2 --
 core/base/perturbation.cpp                               | 1 -
 core/base/segmented_array.cpp                            | 1 -
 core/base/timer.cpp                                      | 3 ---
 core/base/utils.hpp                                      | 5 +----
 core/base/workspace_aliases.hpp                          | 1 -
 core/components/absolute_array_kernels.hpp               | 2 --
 core/components/addressable_pq.hpp                       | 2 --
 core/components/fill_array_kernels.hpp                   | 2 --
 core/components/format_conversion_kernels.hpp            | 2 --
 core/components/precision_conversion_kernels.hpp         | 2 --
 core/components/prefix_sum_kernels.hpp                   | 2 --
 core/components/reduce_array_kernels.hpp                 | 2 --
 core/config/config.cpp                                   | 3 ---
 core/config/config_helper.cpp                            | 5 ++---
 core/config/config_helper.hpp                            | 2 --
 core/config/dispatch.hpp                                 | 2 --
 core/config/factorization_config.cpp                     | 1 -
 core/config/multigrid_config.cpp                         | 4 +---
 core/config/parse_macro.hpp                              | 1 -
 core/config/preconditioner_config.cpp                    | 1 -
 core/config/property_tree.cpp                            | 1 -
 core/config/registry.cpp                                 | 2 --
 core/config/registry_accessor.hpp                        | 1 -
 core/config/solver_config.cpp                            | 4 ++--
 core/config/solver_config.hpp                            | 1 -
 core/config/stop_config.cpp                              | 4 ++--
 core/config/trisolver_config.hpp                         | 1 -
 core/config/type_descriptor.cpp                          | 2 --
 core/config/type_descriptor_helper.hpp                   | 1 -
 core/device_hooks/common_kernels.inc.cpp                 | 1 -
 core/device_hooks/cuda_hooks.cpp                         | 1 -
 core/device_hooks/dpcpp_hooks.cpp                        | 1 -
 core/device_hooks/hip_hooks.cpp                          | 1 -
 core/distributed/helpers.hpp                             | 2 --
 core/distributed/index_map.cpp                           | 1 -
 core/distributed/index_map_kernels.hpp                   | 5 +----
 core/distributed/matrix.cpp                              | 2 --
 core/distributed/matrix_kernels.hpp                      | 1 -
 core/distributed/partition.cpp                           | 1 -
 core/distributed/partition_helpers.cpp                   | 3 ---
 core/distributed/partition_helpers_kernels.hpp           | 1 -
 core/distributed/partition_kernels.hpp                   | 1 -
 core/distributed/preconditioner/schwarz.cpp              | 3 ---
 core/distributed/vector.cpp                              | 2 --
 core/distributed/vector_kernels.hpp                      | 1 -
 core/factorization/cholesky.cpp                          | 2 --
 core/factorization/cholesky_kernels.hpp                  | 2 --
 core/factorization/elimination_forest.cpp                | 1 -
 core/factorization/elimination_forest.hpp                | 1 -
 core/factorization/factorization.cpp                     | 2 --
 core/factorization/factorization_kernels.hpp             | 2 --
 core/factorization/ic.cpp                                | 3 ---
 core/factorization/ic_kernels.hpp                        | 6 +-----
 core/factorization/ilu.cpp                               | 3 ---
 core/factorization/ilu_kernels.hpp                       | 6 +-----
 core/factorization/lu.cpp                                | 2 --
 core/factorization/lu_kernels.hpp                        | 2 --
 core/factorization/par_ic.cpp                            | 3 ---
 core/factorization/par_ic_kernels.hpp                    | 6 +-----
 core/factorization/par_ict.cpp                           | 3 ---
 core/factorization/par_ict_kernels.hpp                   | 6 +-----
 core/factorization/par_ilu.cpp                           | 3 ---
 core/factorization/par_ilu_kernels.hpp                   | 6 +-----
 core/factorization/par_ilut.cpp                          | 3 ---
 core/factorization/par_ilut_kernels.hpp                  | 6 +-----
 core/factorization/symbolic.cpp                          | 2 --
 core/factorization/symbolic.hpp                          | 1 -
 core/log/batch_logger.cpp                                | 2 --
 core/log/convergence.cpp                                 | 2 --
 core/log/papi.cpp                                        | 2 --
 core/log/performance_hint.cpp                            | 1 -
 core/log/profiler_hook.cpp                               | 3 ---
 core/log/profiler_hook_summary.cpp                       | 1 -
 core/log/profiler_hook_summary_writer.cpp                | 1 -
 core/log/record.cpp                                      | 1 -
 core/log/stream.cpp                                      | 2 --
 core/matrix/batch_csr.cpp                                | 3 ---
 core/matrix/batch_csr_kernels.hpp                        | 5 +----
 core/matrix/batch_dense.cpp                              | 3 ---
 core/matrix/batch_dense_kernels.hpp                      | 5 +----
 core/matrix/batch_ell.cpp                                | 3 ---
 core/matrix/batch_ell_kernels.hpp                        | 5 +----
 core/matrix/batch_identity.cpp                           | 2 --
 core/matrix/coo.cpp                                      | 3 ---
 core/matrix/coo_kernels.hpp                              | 5 +----
 core/matrix/csr.cpp                                      | 2 --
 core/matrix/csr_accessor_helper.hpp                      | 1 -
 core/matrix/csr_kernels.hpp                              | 5 +----
 core/matrix/csr_lookup.hpp                               | 1 -
 core/matrix/dense.cpp                                    | 3 ---
 core/matrix/dense_kernels.hpp                            | 6 +-----
 core/matrix/diagonal.cpp                                 | 2 --
 core/matrix/diagonal_kernels.hpp                         | 5 +----
 core/matrix/ell.cpp                                      | 3 ---
 core/matrix/ell_kernels.hpp                              | 5 +----
 core/matrix/fbcsr.cpp                                    | 3 ---
 core/matrix/fbcsr_kernels.hpp                            | 5 +----
 core/matrix/fft.cpp                                      | 2 --
 core/matrix/hybrid.cpp                                   | 3 ---
 core/matrix/hybrid_kernels.hpp                           | 5 +----
 core/matrix/identity.cpp                                 | 1 -
 core/matrix/permutation.cpp                              | 2 --
 core/matrix/permutation.hpp                              | 4 +---
 core/matrix/permutation_kernels.hpp                      | 1 -
 core/matrix/row_gatherer.cpp                             | 2 --
 core/matrix/scaled_permutation.cpp                       | 2 --
 core/matrix/scaled_permutation_kernels.hpp               | 1 -
 core/matrix/sellp.cpp                                    | 2 --
 core/matrix/sellp_kernels.hpp                            | 5 +----
 core/matrix/sparsity_csr.cpp                             | 2 --
 core/matrix/sparsity_csr_kernels.hpp                     | 5 +----
 core/mpi/exception.cpp                                   | 2 --
 core/multigrid/fixed_coarsening.cpp                      | 2 --
 core/multigrid/pgm.cpp                                   | 2 --
 core/multigrid/pgm_kernels.hpp                           | 2 --
 core/preconditioner/batch_jacobi.cpp                     | 1 -
 core/preconditioner/batch_jacobi_kernels.hpp             | 5 +----
 core/preconditioner/ic.cpp                               | 2 --
 core/preconditioner/ilu.cpp                              | 2 --
 core/preconditioner/isai.cpp                             | 3 ---
 core/preconditioner/isai_kernels.hpp                     | 5 +----
 core/preconditioner/jacobi.cpp                           | 3 ---
 core/preconditioner/jacobi_kernels.hpp                   | 5 +----
 core/preconditioner/jacobi_utils.hpp                     | 1 -
 core/reorder/amd.cpp                                     | 3 ---
 core/reorder/mc64.cpp                                    | 3 ---
 core/reorder/mc64.hpp                                    | 5 +----
 core/reorder/nested_dissection.cpp                       | 2 --
 core/reorder/rcm.cpp                                     | 3 ---
 core/reorder/rcm_kernels.hpp                             | 6 +-----
 core/reorder/scaled_reordered.cpp                        | 2 --
 core/solver/batch_bicgstab.cpp                           | 2 --
 core/solver/batch_bicgstab_kernels.hpp                   | 1 -
 core/solver/batch_cg.cpp                                 | 2 --
 core/solver/batch_cg_kernels.hpp                         | 1 -
 core/solver/batch_dispatch.hpp                           | 1 -
 core/solver/bicg.cpp                                     | 2 --
 core/solver/bicg_kernels.hpp                             | 2 --
 core/solver/bicgstab.cpp                                 | 2 --
 core/solver/bicgstab_kernels.hpp                         | 2 --
 core/solver/cb_gmres.cpp                                 | 3 ---
 core/solver/cb_gmres_accessor.hpp                        | 2 --
 core/solver/cb_gmres_kernels.hpp                         | 1 -
 core/solver/cg.cpp                                       | 2 --
 core/solver/cg_kernels.hpp                               | 2 --
 core/solver/cgs.cpp                                      | 2 --
 core/solver/cgs_kernels.hpp                              | 2 --
 core/solver/common_gmres_kernels.hpp                     | 1 -
 core/solver/direct.cpp                                   | 3 ---
 core/solver/fcg.cpp                                      | 2 --
 core/solver/fcg_kernels.hpp                              | 2 --
 core/solver/gcr.cpp                                      | 2 --
 core/solver/gcr_kernels.hpp                              | 1 -
 core/solver/gmres.cpp                                    | 2 --
 core/solver/gmres_kernels.hpp                            | 1 -
 core/solver/idr.cpp                                      | 2 --
 core/solver/idr_kernels.hpp                              | 1 -
 core/solver/ir.cpp                                       | 2 --
 core/solver/ir_kernels.hpp                               | 2 --
 core/solver/lower_trs.cpp                                | 1 -
 core/solver/lower_trs_kernels.hpp                        | 2 --
 core/solver/multigrid.cpp                                | 3 ---
 core/solver/multigrid_kernels.hpp                        | 1 -
 core/solver/upper_trs.cpp                                | 1 -
 core/solver/upper_trs_kernels.hpp                        | 2 --
 core/stop/criterion.cpp                                  | 1 -
 core/stop/criterion_kernels.hpp                          | 1 -
 core/stop/residual_norm.cpp                              | 2 --
 core/synthesizer/implementation_selection.hpp            | 1 -
 core/test/accessor/block_col_major.cpp                   | 5 ++---
 core/test/accessor/index_span.cpp                        | 5 ++---
 core/test/accessor/math.cpp                              | 6 ++----
 core/test/accessor/range.cpp                             | 6 ++----
 core/test/accessor/reduced_row_major.cpp                 | 5 ++---
 core/test/accessor/reduced_row_major_ginkgo.cpp          | 2 --
 core/test/accessor/reduced_row_major_reference.cpp       | 5 ++---
 core/test/accessor/row_major.cpp                         | 5 ++---
 core/test/accessor/scaled_reduced_row_major.cpp          | 5 ++---
 .../test/accessor/scaled_reduced_row_major_reference.cpp | 5 ++---
 core/test/base/abstract_factory.cpp                      | 5 ++---
 core/test/base/allocator.cpp                             | 2 --
 core/test/base/array.cpp                                 | 7 +------
 core/test/base/batch_dim.cpp                             | 6 ++----
 core/test/base/batch_lin_op.cpp                          | 6 +-----
 core/test/base/batch_multi_vector.cpp                    | 6 +-----
 core/test/base/block_operator.cpp                        | 7 +------
 core/test/base/combination.cpp                           | 5 +----
 core/test/base/composition.cpp                           | 5 +----
 core/test/base/deferred_factory.cpp                      | 1 -
 core/test/base/dense_cache.cpp                           | 6 +-----
 core/test/base/dim.cpp                                   | 6 ++----
 core/test/base/exception.cpp                             | 5 ++---
 core/test/base/exception_helpers.cpp                     | 5 ++---
 core/test/base/executor.cpp                              | 6 ++----
 core/test/base/extended_float.cpp                        | 2 --
 core/test/base/index_range.cpp                           | 5 ++---
 core/test/base/iterator_factory.cpp                      | 3 ---
 core/test/base/lin_op.cpp                                | 6 +-----
 core/test/base/math.cpp                                  | 6 ++----
 core/test/base/matrix_assembly_data.cpp                  | 5 ++---
 core/test/base/matrix_data.cpp                           | 6 ++----
 core/test/base/mtx_io.cpp                                | 7 +------
 core/test/base/perturbation.cpp                          | 6 ++----
 core/test/base/polymorphic_object.cpp                    | 5 ++---
 core/test/base/range.cpp                                 | 6 ++----
 core/test/base/range_accessors.cpp                       | 6 +-----
 core/test/base/sanitizers.cpp                            | 1 -
 core/test/base/segmented_array.cpp                       | 5 ++---
 core/test/base/types.cpp                                 | 7 ++-----
 core/test/base/utils.cpp                                 | 5 +----
 core/test/base/version.cpp                               | 6 ++----
 core/test/components/addressable_pq.cpp                  | 4 ----
 core/test/components/disjoint_sets.cpp                   | 4 ----
 core/test/config/config.cpp                              | 6 +-----
 core/test/config/factorization.cpp                       | 3 ---
 core/test/config/multigrid.cpp                           | 3 ---
 core/test/config/preconditioner.cpp                      | 3 ---
 core/test/config/property_tree.cpp                       | 6 +-----
 core/test/config/registry.cpp                            | 6 +-----
 core/test/config/solver.cpp                              | 3 ---
 core/test/config/type_descriptor.cpp                     | 4 +---
 core/test/distributed/index_map.cpp                      | 6 +-----
 core/test/factorization/elimination_forest.cpp           | 4 ----
 core/test/factorization/par_ic.cpp                       | 6 +-----
 core/test/factorization/par_ict.cpp                      | 6 +-----
 core/test/factorization/par_ilu.cpp                      | 6 +-----
 core/test/factorization/par_ilut.cpp                     | 6 +-----
 core/test/gtest/environments.hpp                         | 3 ---
 core/test/gtest/ginkgo_main.cpp                          | 1 -
 core/test/gtest/ginkgo_mpi_main.cpp                      | 4 ----
 core/test/gtest/resources.cpp                            | 5 ++---
 core/test/log/convergence.cpp                            | 6 +-----
 core/test/log/logger.cpp                                 | 6 +-----
 core/test/log/papi.cpp                                   | 7 +------
 core/test/log/performance_hint.cpp                       | 7 +------
 core/test/log/profiler_hook.cpp                          | 8 ++------
 core/test/log/record.cpp                                 | 6 +-----
 core/test/log/stream.cpp                                 | 7 +------
 core/test/matrix/batch_csr.cpp                           | 6 +-----
 core/test/matrix/batch_dense.cpp                         | 6 +-----
 core/test/matrix/batch_ell.cpp                           | 6 +-----
 core/test/matrix/batch_identity.cpp                      | 6 +-----
 core/test/matrix/coo.cpp                                 | 4 +---
 core/test/matrix/coo_builder.cpp                         | 3 ---
 core/test/matrix/csr.cpp                                 | 6 +-----
 core/test/matrix/csr_builder.cpp                         | 3 ---
 core/test/matrix/dense.cpp                               | 6 +-----
 core/test/matrix/diagonal.cpp                            | 4 +---
 core/test/matrix/ell.cpp                                 | 4 +---
 core/test/matrix/fbcsr.cpp                               | 7 +------
 core/test/matrix/fbcsr_builder.cpp                       | 3 ---
 core/test/matrix/fbcsr_sample.hpp                        | 1 -
 core/test/matrix/hybrid.cpp                              | 4 +---
 core/test/matrix/identity.cpp                            | 6 +-----
 core/test/matrix/permutation.cpp                         | 6 +-----
 core/test/matrix/row_gatherer.cpp                        | 6 +-----
 core/test/matrix/sellp.cpp                               | 4 +---
 core/test/matrix/sparsity_csr.cpp                        | 7 +------
 core/test/mpi/base/bindings.cpp                          | 3 ---
 core/test/mpi/base/communicator.cpp                      | 2 --
 core/test/mpi/base/exception_helpers.cpp                 | 2 --
 core/test/mpi/base/polymorphic_object.cpp                | 1 -
 core/test/mpi/base/rank_mapping.cpp                      | 3 ---
 core/test/mpi/distributed/helpers.cpp                    | 5 ++---
 core/test/mpi/distributed/matrix.cpp                     | 2 --
 core/test/mpi/distributed/preconditioner/schwarz.cpp     | 2 --
 core/test/mpi/distributed/solver/multigrid.cpp           | 2 --
 core/test/multigrid/fixed_coarsening.cpp                 | 7 +------
 core/test/multigrid/pgm.cpp                              | 7 +------
 core/test/preconditioner/batch_jacobi.cpp                | 6 +-----
 core/test/preconditioner/ic.cpp                          | 3 ---
 core/test/preconditioner/ilu.cpp                         | 3 ---
 core/test/preconditioner/isai.cpp                        | 7 +------
 core/test/preconditioner/jacobi.cpp                      | 6 +-----
 core/test/reorder/amd.cpp                                | 7 +------
 core/test/reorder/nested_dissection.cpp                  | 7 +------
 core/test/reorder/rcm.cpp                                | 7 +------
 core/test/reorder/scaled_reordered.cpp                   | 6 +-----
 core/test/solver/batch_bicgstab.cpp                      | 6 +-----
 core/test/solver/batch_cg.cpp                            | 6 +-----
 core/test/solver/bicg.cpp                                | 7 +------
 core/test/solver/bicgstab.cpp                            | 6 +-----
 core/test/solver/cb_gmres.cpp                            | 7 +------
 core/test/solver/cg.cpp                                  | 7 +------
 core/test/solver/cgs.cpp                                 | 7 +------
 core/test/solver/direct.cpp                              | 7 +------
 core/test/solver/fcg.cpp                                 | 6 +-----
 core/test/solver/gcr.cpp                                 | 7 +------
 core/test/solver/gmres.cpp                               | 7 +------
 core/test/solver/idr.cpp                                 | 6 +-----
 core/test/solver/ir.cpp                                  | 7 +------
 core/test/solver/lower_trs.cpp                           | 3 ---
 core/test/solver/multigrid.cpp                           | 7 +------
 core/test/solver/upper_trs.cpp                           | 3 ---
 core/test/solver/workspace.cpp                           | 7 +------
 core/test/stop/combined.cpp                              | 6 +-----
 core/test/stop/criterion.cpp                             | 5 ++---
 core/test/stop/iteration.cpp                             | 5 ++---
 core/test/stop/stopping_status.cpp                       | 5 ++---
 core/test/stop/time.cpp                                  | 6 ++----
 core/test/utils.hpp                                      | 3 ---
 core/test/utils/array_generator.hpp                      | 1 -
 core/test/utils/array_generator_test.cpp                 | 3 ---
 core/test/utils/assertions.hpp                           | 3 ---
 core/test/utils/assertions_test.cpp                      | 3 ---
 core/test/utils/batch_helpers.hpp                        | 2 --
 core/test/utils/fb_matrix_generator.hpp                  | 2 --
 core/test/utils/fb_matrix_generator_test.cpp             | 3 ---
 core/test/utils/matrix_generator.hpp                     | 2 --
 core/test/utils/matrix_generator_test.cpp                | 3 ---
 core/test/utils/matrix_utils_test.cpp                    | 4 ----
 core/test/utils/unsort_matrix.hpp                        | 2 --
 core/test/utils/unsort_matrix_test.cpp                   | 4 ----
 core/test/utils/value_generator.hpp                      | 1 -
 core/test/utils/value_generator_test.cpp                 | 3 ---
 cuda/base/batch_multi_vector_kernels.cu                  | 3 ---
 cuda/base/batch_struct.hpp                               | 1 -
 cuda/base/config.hpp                                     | 1 -
 cuda/base/cublas_bindings.hpp                            | 2 --
 cuda/base/curand_bindings.hpp                            | 2 --
 cuda/base/cusparse_bindings.hpp                          | 2 --
 cuda/base/cusparse_block_bindings.hpp                    | 2 --
 cuda/base/cusparse_handle.hpp                            | 1 -
 cuda/base/device.cpp                                     | 5 ++---
 cuda/base/device_matrix_data_kernels.cu                  | 2 --
 cuda/base/exception.cpp                                  | 5 +----
 cuda/base/executor.cpp                                   | 4 ----
 cuda/base/index_set_kernels.cpp                          | 2 --
 cuda/base/kernel_config.hpp                              | 1 -
 cuda/base/kernel_launch.cuh                              | 1 -
 cuda/base/math.hpp                                       | 5 ++---
 cuda/base/memory.cpp                                     | 3 ---
 cuda/base/nvtx.cpp                                       | 1 -
 cuda/base/pointer_mode_guard.hpp                         | 4 +---
 cuda/base/scoped_device_id.cpp                           | 7 ++-----
 cuda/base/stream.cpp                                     | 3 ---
 cuda/base/thrust.cuh                                     | 1 -
 cuda/base/timer.cpp                                      | 3 ---
 cuda/base/types.hpp                                      | 8 ++------
 cuda/components/atomic.cuh                               | 1 -
 cuda/components/cooperative_groups.cuh                   | 4 +---
 cuda/components/diagonal_block_manipulation.cuh          | 1 -
 cuda/components/format_conversion.cuh                    | 1 -
 cuda/components/memory.cuh                               | 2 --
 cuda/components/prefix_sum.cuh                           | 1 -
 cuda/components/prefix_sum_kernels.cu                    | 4 ----
 cuda/components/reduction.cuh                            | 2 --
 cuda/components/syncfree.cuh                             | 1 -
 cuda/components/warp_blas.cuh                            | 2 --
 cuda/distributed/index_map_kernels.cu                    | 3 ---
 cuda/distributed/matrix_kernels.cu                       | 3 ---
 cuda/distributed/partition_helpers_kernels.cu            | 2 --
 cuda/distributed/partition_kernels.cu                    | 2 --
 cuda/distributed/vector_kernels.cu                       | 3 ---
 cuda/factorization/cholesky_kernels.cu                   | 4 ----
 cuda/factorization/factorization_kernels.cu              | 2 --
 cuda/factorization/ic_kernels.cu                         | 2 --
 cuda/factorization/ilu_kernels.cu                        | 2 --
 cuda/factorization/lu_kernels.cu                         | 4 ----
 cuda/factorization/par_ic_kernels.cu                     | 2 --
 cuda/factorization/par_ict_kernels.cu                    | 2 --
 cuda/factorization/par_ilu_kernels.cu                    | 2 --
 cuda/factorization/par_ilut_approx_filter_kernels.cu     | 6 +-----
 cuda/factorization/par_ilut_filter_kernels.cu            | 5 +----
 cuda/factorization/par_ilut_select_common.cu             | 1 -
 cuda/factorization/par_ilut_select_kernels.cu            | 6 +-----
 cuda/factorization/par_ilut_spgeam_kernels.cu            | 5 +----
 cuda/factorization/par_ilut_sweep_kernels.cu             | 5 +----
 cuda/matrix/batch_csr_kernels.cu                         | 3 ---
 cuda/matrix/batch_dense_kernels.cu                       | 3 ---
 cuda/matrix/batch_ell_kernels.cu                         | 3 ---
 cuda/matrix/batch_struct.hpp                             | 5 +----
 cuda/matrix/coo_kernels.cu                               | 2 --
 cuda/matrix/csr_kernels.template.cu                      | 4 ----
 cuda/matrix/dense_kernels.cu                             | 2 --
 cuda/matrix/diagonal_kernels.cu                          | 2 --
 cuda/matrix/ell_kernels.cu                               | 3 ---
 cuda/matrix/fbcsr_kernels.template.cu                    | 4 ----
 cuda/matrix/fft_kernels.cu                               | 3 ---
 cuda/matrix/sellp_kernels.cu                             | 2 --
 cuda/matrix/sparsity_csr_kernels.cu                      | 3 ---
 cuda/multigrid/pgm_kernels.cu                            | 4 ----
 cuda/preconditioner/batch_jacobi_kernels.cu              | 2 --
 cuda/preconditioner/isai_kernels.cu                      | 2 --
 cuda/preconditioner/jacobi_advanced_apply_kernels.cu     | 5 +----
 .../jacobi_advanced_apply_kernels.instantiate.cu         | 5 +----
 cuda/preconditioner/jacobi_generate_kernels.cu           | 5 +----
 .../jacobi_generate_kernels.instantiate.cu               | 5 +----
 cuda/preconditioner/jacobi_kernels.cu                    | 2 --
 cuda/preconditioner/jacobi_simple_apply_kernels.cu       | 5 +----
 .../jacobi_simple_apply_kernels.instantiate.cu           | 5 +----
 cuda/reorder/rcm_kernels.cu                              | 3 ---
 cuda/solver/batch_bicgstab_kernels.cu                    | 3 ---
 cuda/solver/batch_cg_kernels.cu                          | 3 ---
 cuda/solver/cb_gmres_kernels.cu                          | 3 ---
 cuda/solver/common_trs_kernels.cuh                       | 3 ---
 cuda/solver/idr_kernels.cu                               | 3 ---
 cuda/solver/lower_trs_kernels.cu                         | 4 ----
 cuda/solver/multigrid_kernels.cu                         | 2 --
 cuda/solver/upper_trs_kernels.cu                         | 4 ----
 cuda/stop/criterion_kernels.cu                           | 2 --
 cuda/stop/residual_norm_kernels.cu                       | 2 --
 cuda/test/base/array.cpp                                 | 6 +-----
 cuda/test/base/cuda_executor.cu                          | 6 +-----
 cuda/test/base/cuda_executor_topology.cu                 | 7 ++-----
 cuda/test/base/exception_helpers.cu                      | 8 +++-----
 cuda/test/base/index_set.cpp                             | 7 +------
 cuda/test/base/kernel_launch.cu                          | 4 ----
 cuda/test/base/lin_op.cpp                                | 1 -
 cuda/test/base/math.cu                                   | 8 ++------
 cuda/test/base/memory.cpp                                | 7 +------
 cuda/test/base/scoped_device_id.cu                       | 6 ++----
 cuda/test/components/cooperative_groups.cu               | 6 ++----
 cuda/test/components/merging.cu                          | 4 ----
 cuda/test/components/searching.cu                        | 4 ----
 cuda/test/components/sorting.cu                          | 4 ----
 cuda/test/solver/lower_trs_kernels.cu                    | 7 ++-----
 cuda/test/solver/upper_trs_kernels.cu                    | 7 ++-----
 cuda/test/utils.hpp                                      | 5 +----
 cuda/test/utils/assertions_test.cu                       | 3 ---
 devices/device.cpp                                       | 1 -
 devices/dpcpp/executor.cpp                               | 2 --
 devices/machine_topology.cpp                             | 1 -
 devices/omp/executor.cpp                                 | 2 --
 dpcpp/base/batch_multi_vector_kernels.dp.cpp             | 4 ----
 dpcpp/base/batch_struct.hpp                              | 1 -
 dpcpp/base/config.hpp                                    | 1 -
 dpcpp/base/device_matrix_data_kernels.dp.cpp             | 4 ----
 dpcpp/base/executor.dp.cpp                               | 3 ---
 dpcpp/base/helper.dp.cpp                                 | 5 ++---
 dpcpp/base/helper.hpp                                    | 3 ---
 dpcpp/base/index_set_kernels.dp.cpp                      | 2 --
 dpcpp/base/kernel_launch.dp.hpp                          | 1 -
 dpcpp/base/kernel_launch_reduction.dp.hpp                | 1 -
 dpcpp/base/onedpl.hpp                                    | 1 -
 dpcpp/base/onemkl_bindings.hpp                           | 2 --
 dpcpp/base/scoped_device_id.dp.cpp                       | 1 -
 dpcpp/base/timer.dp.cpp                                  | 2 --
 dpcpp/components/atomic.dp.hpp                           | 2 --
 dpcpp/components/cooperative_groups.dp.hpp               | 2 --
 dpcpp/components/diagonal_block_manipulation.dp.hpp      | 2 --
 dpcpp/components/format_conversion.dp.hpp                | 3 ---
 dpcpp/components/intrinsics.dp.hpp                       | 2 --
 dpcpp/components/merging.dp.hpp                          | 2 --
 dpcpp/components/prefix_sum.dp.hpp                       | 2 --
 dpcpp/components/prefix_sum_kernels.dp.cpp               | 3 ---
 dpcpp/components/reduction.dp.hpp                        | 3 ---
 dpcpp/components/searching.dp.hpp                        | 1 -
 dpcpp/components/segment_scan.dp.hpp                     | 1 -
 dpcpp/components/sorting.dp.hpp                          | 1 -
 dpcpp/components/thread_ids.dp.hpp                       | 1 -
 dpcpp/components/uninitialized_array.hpp                 | 1 -
 dpcpp/components/warp_blas.dp.hpp                        | 3 ---
 dpcpp/distributed/index_map_kernels.dp.cpp               | 1 -
 dpcpp/distributed/matrix_kernels.dp.cpp                  | 1 -
 dpcpp/distributed/partition_helpers_kernels.dp.cpp       | 2 --
 dpcpp/distributed/partition_kernels.dp.cpp               | 3 ---
 dpcpp/distributed/vector_kernels.dp.cpp                  | 1 -
 dpcpp/factorization/cholesky_kernels.dp.cpp              | 4 ----
 dpcpp/factorization/factorization_kernels.dp.cpp         | 3 ---
 dpcpp/factorization/lu_kernels.dp.cpp                    | 3 ---
 dpcpp/factorization/par_ic_kernels.dp.cpp                | 3 ---
 dpcpp/factorization/par_ict_kernels.dp.cpp               | 4 ----
 dpcpp/factorization/par_ilu_kernels.dp.cpp               | 3 ---
 dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp | 7 +------
 dpcpp/factorization/par_ilut_filter_kernel.dp.cpp        | 6 +-----
 dpcpp/factorization/par_ilut_kernels.dp.cpp              | 4 ----
 dpcpp/factorization/par_ilut_select_common.dp.cpp        | 7 ++-----
 dpcpp/factorization/par_ilut_select_kernel.dp.cpp        | 7 +------
 dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp        | 7 +------
 dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp         | 6 +-----
 dpcpp/log/batch_logger.hpp                               | 1 -
 dpcpp/matrix/batch_csr_kernels.dp.cpp                    | 4 ----
 dpcpp/matrix/batch_dense_kernels.dp.cpp                  | 4 ----
 dpcpp/matrix/batch_ell_kernels.dp.cpp                    | 4 ----
 dpcpp/matrix/batch_struct.hpp                            | 5 +----
 dpcpp/matrix/coo_kernels.dp.cpp                          | 3 ---
 dpcpp/matrix/csr_kernels.dp.cpp                          | 4 ----
 dpcpp/matrix/dense_kernels.dp.cpp                        | 3 ---
 dpcpp/matrix/diagonal_kernels.dp.cpp                     | 3 ---
 dpcpp/matrix/ell_kernels.dp.cpp                          | 4 ----
 dpcpp/matrix/fbcsr_kernels.dp.cpp                        | 3 ---
 dpcpp/matrix/fft_kernels.dp.cpp                          | 1 -
 dpcpp/matrix/sellp_kernels.dp.cpp                        | 3 ---
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp                 | 3 ---
 dpcpp/multigrid/pgm_kernels.dp.cpp                       | 5 -----
 dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp         | 2 --
 dpcpp/preconditioner/isai_kernels.dp.cpp                 | 3 ---
 .../jacobi_advanced_apply_instantiate.inc.dp.cpp         | 6 +-----
 dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp | 5 +----
 .../jacobi_generate_instantiate.inc.dp.cpp               | 6 +-----
 dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp       | 5 +----
 dpcpp/preconditioner/jacobi_kernels.dp.cpp               | 3 ---
 .../jacobi_simple_apply_instantiate.inc.dp.cpp           | 6 +-----
 dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp   | 5 +----
 dpcpp/reorder/rcm_kernels.dp.cpp                         | 2 --
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp               | 3 ---
 dpcpp/solver/batch_cg_kernels.dp.cpp                     | 3 ---
 dpcpp/solver/cb_gmres_kernels.dp.cpp                     | 4 ----
 dpcpp/solver/idr_kernels.dp.cpp                          | 7 ++-----
 dpcpp/solver/lower_trs_kernels.dp.cpp                    | 3 ---
 dpcpp/solver/multigrid_kernels.dp.cpp                    | 2 --
 dpcpp/solver/upper_trs_kernels.dp.cpp                    | 3 ---
 dpcpp/stop/batch_criteria.hpp                            | 1 -
 dpcpp/stop/criterion_kernels.dp.cpp                      | 2 --
 dpcpp/stop/residual_norm_kernels.dp.cpp                  | 3 ---
 dpcpp/synthesizer/implementation_selection.hpp           | 2 --
 dpcpp/test/base/dim3.dp.cpp                              | 2 --
 dpcpp/test/base/executor.dp.cpp                          | 7 +------
 dpcpp/test/base/kernel_launch.dp.cpp                     | 4 ----
 dpcpp/test/components/cooperative_groups.dp.cpp          | 5 -----
 dpcpp/test/matrix/fbcsr_kernels.dp.cpp                   | 2 --
 dpcpp/test/preconditioner/jacobi_kernels.dp.cpp          | 7 +------
 .../adaptiveprecision-blockjacobi.cpp                    | 5 ++---
 examples/cb-gmres/cb-gmres.cpp                           | 4 ++--
 examples/custom-matrix-format/custom-matrix-format.cpp   | 2 +-
 .../custom-stopping-criterion.cpp                        | 5 ++---
 .../external-lib-interfacing.cpp                         | 1 -
 examples/ginkgo-overhead/ginkgo-overhead.cpp             | 5 ++---
 examples/ginkgo-ranges/ginkgo-ranges.cpp                 | 3 ++-
 examples/heat-equation/heat-equation.cpp                 | 6 ++----
 .../ilu-preconditioned-solver.cpp                        | 5 ++---
 examples/inverse-iteration/inverse-iteration.cpp         | 5 ++---
 .../ir-ilu-preconditioned-solver.cpp                     | 5 ++---
 examples/iterative-refinement/iterative-refinement.cpp   | 5 ++---
 examples/kokkos-assembly/kokkos-assembly.cpp             | 3 +--
 examples/minimal-cuda-solver/minimal-cuda-solver.cpp     | 3 ++-
 .../mixed-multigrid-preconditioned-solver.cpp            | 5 ++---
 .../mixed-multigrid-solver/mixed-multigrid-solver.cpp    | 5 ++---
 examples/mixed-precision-ir/mixed-precision-ir.cpp       | 5 ++---
 .../multigrid-preconditioned-solver-customized.cpp       | 5 ++---
 .../multigrid-preconditioned-solver.cpp                  | 5 ++---
 .../nine-pt-stencil-solver/nine-pt-stencil-solver.cpp    | 3 ++-
 examples/papi-logging/papi-logging.cpp                   | 8 ++++----
 examples/par-ilu-convergence/par-ilu-convergence.cpp     | 1 -
 examples/performance-debugging/performance-debugging.cpp | 5 ++---
 examples/poisson-solver/poisson-solver.cpp               | 3 ++-
 examples/preconditioned-solver/preconditioned-solver.cpp | 5 ++---
 examples/preconditioner-export/preconditioner-export.cpp | 5 ++---
 .../reordered-preconditioned-solver.cpp                  | 1 -
 .../schroedinger-splitting/schroedinger-splitting.cpp    | 4 ++--
 examples/simple-solver-logging/simple-solver-logging.cpp | 5 ++---
 .../three-pt-stencil-solver/three-pt-stencil-solver.cpp  | 3 ++-
 extensions/test/config/json_config.cpp                   | 3 ---
 extensions/test/kokkos/kokkos_main.cpp                   | 2 --
 extensions/test/kokkos/spaces.cpp                        | 3 ---
 extensions/test/kokkos/types.cpp                         | 4 ----
 hip/base/batch_multi_vector_kernels.hip.cpp              | 3 ---
 hip/base/batch_struct.hip.hpp                            | 1 -
 hip/base/config.hip.hpp                                  | 1 -
 hip/base/device.hip.cpp                                  | 2 --
 hip/base/device_matrix_data_kernels.hip.cpp              | 2 --
 hip/base/exception.hip.cpp                               | 2 --
 hip/base/executor.hip.cpp                                | 3 ---
 hip/base/hipblas_bindings.hip.hpp                        | 1 -
 hip/base/hiprand_bindings.hip.hpp                        | 1 -
 hip/base/hipsparse_bindings.hip.hpp                      | 1 -
 hip/base/hipsparse_block_bindings.hip.hpp                | 1 -
 hip/base/index_set_kernels.hip.cpp                       | 2 --
 hip/base/kernel_launch.hip.hpp                           | 1 -
 hip/base/math.hip.hpp                                    | 5 ++---
 hip/base/memory.hip.cpp                                  | 2 --
 hip/base/pointer_mode_guard.hip.hpp                      | 1 -
 hip/base/roctx.hip.cpp                                   | 1 -
 hip/base/scoped_device_id.hip.cpp                        | 5 ++---
 hip/base/stream.hip.cpp                                  | 2 --
 hip/base/thrust.hip.hpp                                  | 1 -
 hip/base/timer.hip.cpp                                   | 2 --
 hip/base/types.hip.hpp                                   | 8 ++------
 hip/components/atomic.hip.hpp                            | 1 -
 hip/components/cooperative_groups.hip.hpp                | 1 -
 hip/components/diagonal_block_manipulation.hip.hpp       | 1 -
 hip/components/format_conversion.hip.hpp                 | 1 -
 hip/components/memory.hip.hpp                            | 2 --
 hip/components/prefix_sum.hip.hpp                        | 1 -
 hip/components/prefix_sum_kernels.hip.cpp                | 4 ----
 hip/components/reduction.hip.hpp                         | 2 --
 hip/components/syncfree.hip.hpp                          | 1 -
 hip/components/warp_blas.hip.hpp                         | 2 --
 hip/distributed/index_map_kernels.hip.cpp                | 3 ---
 hip/distributed/matrix_kernels.hip.cpp                   | 3 ---
 hip/distributed/partition_helpers_kernels.hip.cpp        | 2 --
 hip/distributed/partition_kernels.hip.cpp                | 2 --
 hip/distributed/vector_kernels.hip.cpp                   | 3 ---
 hip/factorization/cholesky_kernels.hip.cpp               | 4 ----
 hip/factorization/factorization_kernels.hip.cpp          | 2 --
 hip/factorization/ic_kernels.hip.cpp                     | 2 --
 hip/factorization/ilu_kernels.hip.cpp                    | 2 --
 hip/factorization/lu_kernels.hip.cpp                     | 4 ----
 hip/factorization/par_ic_kernels.hip.cpp                 | 2 --
 hip/factorization/par_ict_kernels.hip.cpp                | 2 --
 hip/factorization/par_ilu_kernels.hip.cpp                | 2 --
 hip/factorization/par_ilut_approx_filter_kernels.hip.cpp | 6 +-----
 hip/factorization/par_ilut_filter_kernels.hip.cpp        | 5 +----
 hip/factorization/par_ilut_select_common.hip.cpp         | 2 --
 hip/factorization/par_ilut_select_kernels.hip.cpp        | 6 +-----
 hip/factorization/par_ilut_spgeam_kernels.hip.cpp        | 5 +----
 hip/factorization/par_ilut_sweep_kernels.hip.cpp         | 5 +----
 hip/matrix/batch_csr_kernels.hip.cpp                     | 3 ---
 hip/matrix/batch_dense_kernels.hip.cpp                   | 3 ---
 hip/matrix/batch_ell_kernels.hip.cpp                     | 3 ---
 hip/matrix/batch_struct.hip.hpp                          | 5 +----
 hip/matrix/coo_kernels.hip.cpp                           | 2 --
 hip/matrix/csr_kernels.template.hip.cpp                  | 4 ----
 hip/matrix/dense_kernels.hip.cpp                         | 2 --
 hip/matrix/diagonal_kernels.hip.cpp                      | 2 --
 hip/matrix/ell_kernels.hip.cpp                           | 3 ---
 hip/matrix/fbcsr_kernels.template.hip.cpp                | 4 ----
 hip/matrix/fft_kernels.hip.cpp                           | 2 --
 hip/matrix/fft_kernels_stub.hip.cpp                      | 1 -
 hip/matrix/sellp_kernels.hip.cpp                         | 2 --
 hip/matrix/sparsity_csr_kernels.hip.cpp                  | 3 ---
 hip/multigrid/pgm_kernels.hip.cpp                        | 4 ----
 hip/preconditioner/batch_jacobi_kernels.hip.cpp          | 2 --
 hip/preconditioner/isai_kernels.hip.cpp                  | 2 --
 hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp | 5 +----
 .../jacobi_advanced_apply_kernels.instantiate.hip.cpp    | 5 +----
 hip/preconditioner/jacobi_generate_kernels.hip.cpp       | 5 +----
 .../jacobi_generate_kernels.instantiate.hip.cpp          | 5 +----
 hip/preconditioner/jacobi_kernels.hip.cpp                | 2 --
 hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp   | 5 +----
 .../jacobi_simple_apply_kernels.instantiate.hip.cpp      | 5 +----
 hip/reorder/rcm_kernels.hip.cpp                          | 3 ---
 hip/solver/batch_bicgstab_kernels.hip.cpp                | 3 ---
 hip/solver/batch_cg_kernels.hip.cpp                      | 3 ---
 hip/solver/cb_gmres_kernels.hip.cpp                      | 3 ---
 hip/solver/common_trs_kernels.hip.hpp                    | 1 -
 hip/solver/idr_kernels.hip.cpp                           | 3 ---
 hip/solver/lower_trs_kernels.hip.cpp                     | 2 --
 hip/solver/multigrid_kernels.hip.cpp                     | 2 --
 hip/solver/upper_trs_kernels.hip.cpp                     | 2 --
 hip/stop/criterion_kernels.hip.cpp                       | 2 --
 hip/stop/residual_norm_kernels.hip.cpp                   | 2 --
 hip/test/base/exception_helpers.hip.cpp                  | 5 ++---
 hip/test/base/hip_executor.hip.cpp                       | 7 +------
 hip/test/base/hip_executor_topology.hip.cpp              | 8 ++------
 hip/test/base/index_set.cpp                              | 7 +------
 hip/test/base/kernel_launch.hip.cpp                      | 4 ----
 hip/test/base/lin_op.cpp                                 | 5 ++---
 hip/test/base/math.hip.cpp                               | 9 ++-------
 hip/test/base/memory.cpp                                 | 7 +------
 hip/test/base/scoped_device_id.hip.cpp                   | 6 ++----
 hip/test/components/cooperative_groups.hip.cpp           | 7 ++-----
 hip/test/components/merging.hip.cpp                      | 5 -----
 hip/test/components/searching.hip.cpp                    | 5 -----
 hip/test/components/sorting.hip.cpp                      | 4 ----
 hip/test/matrix/fbcsr_kernels.cpp                        | 8 ++------
 hip/test/matrix/fft_kernels.hip.cpp                      | 6 ++----
 hip/test/solver/lower_trs_kernels.cpp                    | 6 ++----
 hip/test/solver/upper_trs_kernels.cpp                    | 6 ++----
 hip/test/utils.hip.hpp                                   | 5 +----
 hip/test/utils/assertions_test.cpp                       | 3 ---
 include/ginkgo/core/base/abstract_factory.hpp            | 1 -
 include/ginkgo/core/base/array.hpp                       | 1 -
 include/ginkgo/core/base/batch_dim.hpp                   | 1 -
 include/ginkgo/core/base/batch_lin_op.hpp                | 1 -
 include/ginkgo/core/base/batch_multi_vector.hpp          | 1 -
 include/ginkgo/core/base/combination.hpp                 | 1 -
 include/ginkgo/core/base/composition.hpp                 | 1 -
 include/ginkgo/core/base/dense_cache.hpp                 | 1 -
 include/ginkgo/core/base/device.hpp                      | 1 -
 include/ginkgo/core/base/dim.hpp                         | 1 -
 include/ginkgo/core/base/exception.hpp                   | 1 -
 include/ginkgo/core/base/exception_helpers.hpp           | 1 -
 include/ginkgo/core/base/executor.hpp                    | 1 -
 include/ginkgo/core/base/index_set.hpp                   | 1 -
 include/ginkgo/core/base/intrinsics.hpp                  | 1 -
 include/ginkgo/core/base/lin_op.hpp                      | 1 -
 include/ginkgo/core/base/machine_topology.hpp            | 1 -
 include/ginkgo/core/base/math.hpp                        | 1 -
 include/ginkgo/core/base/matrix_assembly_data.hpp        | 1 -
 include/ginkgo/core/base/matrix_data.hpp                 | 1 -
 include/ginkgo/core/base/mpi.hpp                         | 1 -
 include/ginkgo/core/base/mtx_io.hpp                      | 1 -
 include/ginkgo/core/base/perturbation.hpp                | 1 -
 include/ginkgo/core/base/polymorphic_object.hpp          | 1 -
 include/ginkgo/core/base/range.hpp                       | 1 -
 include/ginkgo/core/base/range_accessors.hpp             | 1 -
 include/ginkgo/core/base/segmented_array.hpp             | 1 -
 include/ginkgo/core/base/temporary_clone.hpp             | 1 -
 include/ginkgo/core/base/temporary_conversion.hpp        | 1 -
 include/ginkgo/core/base/timer.hpp                       | 1 -
 include/ginkgo/core/base/utils_helper.hpp                | 1 -
 include/ginkgo/core/base/version.hpp                     | 1 -
 include/ginkgo/core/config/config.hpp                    | 1 -
 include/ginkgo/core/config/registry.hpp                  | 1 -
 include/ginkgo/core/distributed/lin_op.hpp               | 1 -
 include/ginkgo/core/distributed/polymorphic_object.hpp   | 1 -
 include/ginkgo/core/factorization/cholesky.hpp           | 1 -
 include/ginkgo/core/factorization/ic.hpp                 | 1 -
 include/ginkgo/core/factorization/ilu.hpp                | 1 -
 include/ginkgo/core/factorization/lu.hpp                 | 1 -
 include/ginkgo/core/factorization/par_ic.hpp             | 1 -
 include/ginkgo/core/factorization/par_ict.hpp            | 1 -
 include/ginkgo/core/factorization/par_ilu.hpp            | 1 -
 include/ginkgo/core/factorization/par_ilut.hpp           | 1 -
 include/ginkgo/core/log/batch_logger.hpp                 | 1 -
 include/ginkgo/core/log/convergence.hpp                  | 1 -
 include/ginkgo/core/log/logger.hpp                       | 1 -
 include/ginkgo/core/log/papi.hpp                         | 2 +-
 include/ginkgo/core/log/performance_hint.hpp             | 1 -
 include/ginkgo/core/log/profiler_hook.hpp                | 1 -
 include/ginkgo/core/log/record.hpp                       | 1 -
 include/ginkgo/core/log/stream.hpp                       | 1 -
 include/ginkgo/core/matrix/batch_csr.hpp                 | 1 -
 include/ginkgo/core/matrix/batch_dense.hpp               | 1 -
 include/ginkgo/core/matrix/batch_ell.hpp                 | 1 -
 include/ginkgo/core/matrix/dense.hpp                     | 1 -
 include/ginkgo/core/matrix/hybrid.hpp                    | 1 -
 include/ginkgo/core/matrix/permutation.hpp               | 1 -
 include/ginkgo/core/matrix/row_gatherer.hpp              | 1 -
 include/ginkgo/core/matrix/scaled_permutation.hpp        | 1 -
 include/ginkgo/core/matrix/sparsity_csr.hpp              | 1 -
 include/ginkgo/core/multigrid/fixed_coarsening.hpp       | 1 -
 include/ginkgo/core/multigrid/multigrid_level.hpp        | 1 -
 include/ginkgo/core/multigrid/pgm.hpp                    | 1 -
 include/ginkgo/core/preconditioner/ic.hpp                | 1 -
 include/ginkgo/core/preconditioner/ilu.hpp               | 1 -
 include/ginkgo/core/preconditioner/isai.hpp              | 1 -
 include/ginkgo/core/preconditioner/utils.hpp             | 1 -
 include/ginkgo/core/reorder/amd.hpp                      | 1 -
 include/ginkgo/core/reorder/mc64.hpp                     | 1 -
 include/ginkgo/core/reorder/nested_dissection.hpp        | 1 -
 include/ginkgo/core/reorder/rcm.hpp                      | 1 -
 include/ginkgo/core/reorder/reordering_base.hpp          | 1 -
 include/ginkgo/core/solver/batch_bicgstab.hpp            | 1 -
 include/ginkgo/core/solver/batch_cg.hpp                  | 1 -
 include/ginkgo/core/solver/bicg.hpp                      | 1 -
 include/ginkgo/core/solver/bicgstab.hpp                  | 1 -
 include/ginkgo/core/solver/cb_gmres.hpp                  | 1 -
 include/ginkgo/core/solver/cg.hpp                        | 1 -
 include/ginkgo/core/solver/cgs.hpp                       | 1 -
 include/ginkgo/core/solver/fcg.hpp                       | 1 -
 include/ginkgo/core/solver/gcr.hpp                       | 1 -
 include/ginkgo/core/solver/gmres.hpp                     | 1 -
 include/ginkgo/core/solver/idr.hpp                       | 1 -
 include/ginkgo/core/solver/ir.hpp                        | 1 -
 include/ginkgo/core/solver/multigrid.hpp                 | 1 -
 include/ginkgo/core/solver/solver_base.hpp               | 1 -
 include/ginkgo/core/solver/triangular.hpp                | 1 -
 include/ginkgo/core/solver/workspace.hpp                 | 1 -
 include/ginkgo/core/stop/combined.hpp                    | 1 -
 include/ginkgo/core/stop/residual_norm.hpp               | 1 -
 include/ginkgo/core/stop/time.hpp                        | 1 -
 include/ginkgo/extensions/config/json_config.hpp         | 2 --
 include/ginkgo/extensions/kokkos/spaces.hpp              | 1 -
 include/ginkgo/extensions/kokkos/types.hpp               | 3 ---
 omp/base/batch_multi_vector_kernels.cpp                  | 3 ---
 omp/base/device_matrix_data_kernels.cpp                  | 3 ---
 omp/base/executor.cpp                                    | 1 -
 omp/base/index_set_kernels.cpp                           | 3 ---
 omp/base/kernel_launch.hpp                               | 1 -
 omp/base/kernel_launch_reduction.hpp                     | 1 -
 omp/base/scoped_device_id.cpp                            | 1 -
 omp/components/atomic.hpp                                | 1 -
 omp/components/csr_spgeam.hpp                            | 2 --
 omp/components/matrix_operations.hpp                     | 1 -
 omp/components/prefix_sum_kernels.cpp                    | 3 ---
 omp/components/sort_small.hpp                            | 1 -
 omp/distributed/index_map_kernels.cpp                    | 3 ---
 omp/distributed/matrix_kernels.cpp                       | 3 ---
 omp/distributed/partition_helpers_kernels.cpp            | 1 -
 omp/distributed/partition_kernels.cpp                    | 3 ---
 omp/distributed/vector_kernels.cpp                       | 1 -
 omp/factorization/cholesky_kernels.cpp                   | 3 ---
 omp/factorization/factorization_kernels.cpp              | 3 ---
 omp/factorization/lu_kernels.cpp                         | 3 ---
 omp/factorization/par_ic_kernels.cpp                     | 2 --
 omp/factorization/par_ict_kernels.cpp                    | 3 ---
 omp/factorization/par_ilu_kernels.cpp                    | 2 --
 omp/factorization/par_ilut_kernels.cpp                   | 4 ----
 omp/matrix/batch_csr_kernels.cpp                         | 3 ---
 omp/matrix/batch_dense_kernels.cpp                       | 3 ---
 omp/matrix/batch_ell_kernels.cpp                         | 3 ---
 omp/matrix/coo_kernels.cpp                               | 4 ----
 omp/matrix/csr_kernels.cpp                               | 4 ----
 omp/matrix/dense_kernels.cpp                             | 4 ----
 omp/matrix/diagonal_kernels.cpp                          | 2 --
 omp/matrix/ell_kernels.cpp                               | 4 ----
 omp/matrix/fbcsr_kernels.cpp                             | 4 ----
 omp/matrix/fft_kernels.cpp                               | 2 --
 omp/matrix/sellp_kernels.cpp                             | 3 ---
 omp/matrix/sparsity_csr_kernels.cpp                      | 4 ----
 omp/multigrid/pgm_kernels.cpp                            | 4 ----
 omp/preconditioner/batch_jacobi_kernels.cpp              | 1 -
 omp/preconditioner/isai_kernels.cpp                      | 4 ----
 omp/preconditioner/jacobi_kernels.cpp                    | 4 ----
 omp/reorder/rcm_kernels.cpp                              | 4 ----
 omp/solver/batch_bicgstab_kernels.cpp                    | 3 ---
 omp/solver/batch_cg_kernels.cpp                          | 3 ---
 omp/solver/cb_gmres_kernels.cpp                          | 3 ---
 omp/solver/idr_kernels.cpp                               | 4 ----
 omp/solver/lower_trs_kernels.cpp                         | 3 ---
 omp/solver/multigrid_kernels.cpp                         | 1 -
 omp/solver/upper_trs_kernels.cpp                         | 3 ---
 omp/stop/criterion_kernels.cpp                           | 1 -
 omp/stop/residual_norm_kernels.cpp                       | 2 --
 omp/test/base/index_set.cpp                              | 6 +-----
 omp/test/base/kernel_launch.cpp                          | 4 ----
 omp/test/matrix/fbcsr_kernels.cpp                        | 8 ++------
 reference/base/batch_multi_vector_kernels.cpp            | 3 ---
 reference/base/batch_struct.hpp                          | 1 -
 reference/base/device_matrix_data_kernels.cpp            | 3 ---
 reference/base/index_set_kernels.cpp                     | 3 ---
 reference/base/scoped_device_id.cpp                      | 1 -
 reference/components/convert_ptrs.hpp                    | 1 -
 reference/components/csr_spgeam.hpp                      | 2 --
 reference/components/fill_array_kernels.cpp              | 1 -
 reference/components/format_conversion_kernels.cpp       | 2 --
 reference/components/precision_conversion_kernels.cpp    | 1 -
 reference/components/reduce_array_kernels.cpp            | 1 -
 reference/distributed/index_map_kernels.cpp              | 2 --
 reference/distributed/matrix_kernels.cpp                 | 1 -
 reference/distributed/partition_helpers.hpp              | 1 -
 reference/distributed/partition_helpers_kernels.cpp      | 1 -
 reference/distributed/vector_kernels.cpp                 | 1 -
 reference/factorization/cholesky_kernels.cpp             | 3 ---
 reference/factorization/factorization_kernels.cpp        | 3 ---
 reference/factorization/ic_kernels.cpp                   | 2 --
 reference/factorization/ilu_kernels.cpp                  | 3 ---
 reference/factorization/lu_kernels.cpp                   | 3 ---
 reference/factorization/par_ic_kernels.cpp               | 2 --
 reference/factorization/par_ict_kernels.cpp              | 3 ---
 reference/factorization/par_ilu_kernels.cpp              | 2 --
 reference/factorization/par_ilut_kernels.cpp             | 3 ---
 reference/matrix/batch_csr_kernels.cpp                   | 3 ---
 reference/matrix/batch_dense_kernels.cpp                 | 3 ---
 reference/matrix/batch_ell_kernels.cpp                   | 3 ---
 reference/matrix/batch_struct.hpp                        | 5 +----
 reference/matrix/coo_kernels.cpp                         | 2 --
 reference/matrix/csr_kernels.cpp                         | 3 ---
 reference/matrix/dense_kernels.cpp                       | 3 ---
 reference/matrix/diagonal_kernels.cpp                    | 1 -
 reference/matrix/ell_kernels.cpp                         | 2 --
 reference/matrix/fbcsr_kernels.cpp                       | 3 ---
 reference/matrix/fft_kernels.cpp                         | 2 --
 reference/matrix/hybrid_kernels.cpp                      | 2 --
 reference/matrix/scaled_permutation_kernels.cpp          | 1 -
 reference/matrix/sellp_kernels.cpp                       | 2 --
 reference/matrix/sparsity_csr_kernels.cpp                | 3 ---
 reference/multigrid/pgm_kernels.cpp                      | 3 ---
 reference/preconditioner/batch_block_jacobi.hpp          | 1 -
 reference/preconditioner/batch_jacobi_kernels.cpp        | 1 -
 reference/preconditioner/isai_kernels.cpp                | 3 ---
 reference/preconditioner/jacobi_kernels.cpp              | 3 ---
 reference/reorder/rcm_kernels.cpp                        | 3 ---
 reference/solver/batch_bicgstab_kernels.cpp              | 1 -
 reference/solver/batch_cg_kernels.cpp                    | 1 -
 reference/solver/bicg_kernels.cpp                        | 1 -
 reference/solver/bicgstab_kernels.cpp                    | 2 --
 reference/solver/cb_gmres_kernels.cpp                    | 3 ---
 reference/solver/cg_kernels.cpp                          | 1 -
 reference/solver/cgs_kernels.cpp                         | 1 -
 reference/solver/common_gmres_kernels.cpp                | 2 --
 reference/solver/fcg_kernels.cpp                         | 1 -
 reference/solver/gcr_kernels.cpp                         | 1 -
 reference/solver/gmres_kernels.cpp                       | 1 -
 reference/solver/idr_kernels.cpp                         | 2 --
 reference/solver/lower_trs_kernels.cpp                   | 2 --
 reference/solver/multigrid_kernels.cpp                   | 1 -
 reference/solver/upper_trs_kernels.cpp                   | 2 --
 reference/stop/criterion_kernels.cpp                     | 1 -
 reference/stop/residual_norm_kernels.cpp                 | 2 --
 reference/test/base/array.cpp                            | 7 +------
 reference/test/base/batch_multi_vector_kernels.cpp       | 8 ++------
 reference/test/base/combination.cpp                      | 7 +------
 reference/test/base/composition.cpp                      | 7 +------
 reference/test/base/index_set.cpp                        | 7 +------
 reference/test/base/perturbation.cpp                     | 7 +------
 reference/test/base/utils.cpp                            | 4 ----
 reference/test/components/absolute_array_kernels.cpp     | 4 ----
 reference/test/components/fill_array_kernels.cpp         | 4 ----
 reference/test/components/format_conversion_kernels.cpp  | 3 ---
 .../test/components/precision_conversion_kernels.cpp     | 3 ---
 reference/test/components/prefix_sum_kernels.cpp         | 4 ----
 reference/test/components/reduce_array_kernels.cpp       | 4 ----
 reference/test/distributed/index_map_kernels.cpp         | 8 ++------
 reference/test/distributed/matrix_kernels.cpp            | 6 ++----
 reference/test/distributed/partition_helpers_kernels.cpp | 6 ++----
 reference/test/distributed/partition_kernels.cpp         | 8 ++------
 reference/test/distributed/vector_kernels.cpp            | 6 ++----
 reference/test/factorization/cholesky_kernels.cpp        | 8 ++------
 reference/test/factorization/factorization.cpp           | 7 +------
 reference/test/factorization/ic_kernels.cpp              | 7 +------
 reference/test/factorization/ilu_kernels.cpp             | 7 +------
 reference/test/factorization/lu_kernels.cpp              | 8 ++------
 reference/test/factorization/par_ic_kernels.cpp          | 8 ++------
 reference/test/factorization/par_ict_kernels.cpp         | 8 ++------
 reference/test/factorization/par_ilu_kernels.cpp         | 8 ++------
 reference/test/factorization/par_ilut_kernels.cpp        | 8 ++------
 reference/test/log/convergence.cpp                       | 6 +-----
 reference/test/log/papi.cpp                              | 6 +-----
 reference/test/matrix/batch_csr_kernels.cpp              | 8 ++------
 reference/test/matrix/batch_dense_kernels.cpp            | 8 ++------
 reference/test/matrix/batch_ell_kernels.cpp              | 8 ++------
 reference/test/matrix/coo_kernels.cpp                    | 8 ++------
 reference/test/matrix/csr_kernels.cpp                    | 8 ++------
 reference/test/matrix/dense_kernels.cpp                  | 8 ++------
 reference/test/matrix/diagonal_kernels.cpp               | 8 ++------
 reference/test/matrix/ell_kernels.cpp                    | 7 +------
 reference/test/matrix/fbcsr_kernels.cpp                  | 8 ++------
 reference/test/matrix/fft_kernels.cpp                    | 7 +------
 reference/test/matrix/hybrid_kernels.cpp                 | 8 ++------
 reference/test/matrix/identity.cpp                       | 6 +-----
 reference/test/matrix/permutation.cpp                    | 7 +------
 reference/test/matrix/scaled_permutation.cpp             | 7 +------
 reference/test/matrix/sellp_kernels.cpp                  | 7 ++-----
 reference/test/matrix/sparsity_csr.cpp                   | 6 +-----
 reference/test/matrix/sparsity_csr_kernels.cpp           | 8 ++------
 reference/test/multigrid/fixed_coarsening_kernels.cpp    | 7 +------
 reference/test/multigrid/pgm_kernels.cpp                 | 8 ++------
 reference/test/preconditioner/batch_jacobi_kernels.cpp   | 8 ++------
 reference/test/preconditioner/ic.cpp                     | 7 +------
 reference/test/preconditioner/ilu.cpp                    | 7 +------
 reference/test/preconditioner/isai_kernels.cpp           | 8 ++------
 reference/test/preconditioner/jacobi.cpp                 | 7 +------
 reference/test/preconditioner/jacobi_kernels.cpp         | 7 +------
 reference/test/reorder/mc64.cpp                          | 7 +------
 reference/test/reorder/mc64_kernels.cpp                  | 8 ++------
 reference/test/reorder/nested_dissection.cpp             | 7 ++-----
 reference/test/reorder/rcm.cpp                           | 7 +------
 reference/test/reorder/rcm_kernels.cpp                   | 7 +------
 reference/test/reorder/scaled_reordered.cpp              | 7 +------
 reference/test/solver/batch_bicgstab_kernels.cpp         | 8 ++------
 reference/test/solver/batch_cg_kernels.cpp               | 8 ++------
 reference/test/solver/bicg_kernels.cpp                   | 7 ++-----
 reference/test/solver/bicgstab_kernels.cpp               | 7 ++-----
 reference/test/solver/cb_gmres_kernels.cpp               | 7 +------
 reference/test/solver/cg_kernels.cpp                     | 7 ++-----
 reference/test/solver/cgs_kernels.cpp                    | 7 ++-----
 reference/test/solver/direct.cpp                         | 7 +------
 reference/test/solver/fcg_kernels.cpp                    | 7 ++-----
 reference/test/solver/gcr_kernels.cpp                    | 8 ++------
 reference/test/solver/gmres_kernels.cpp                  | 8 ++------
 reference/test/solver/idr_kernels.cpp                    | 6 +-----
 reference/test/solver/ir_kernels.cpp                     | 7 ++-----
 reference/test/solver/lower_trs.cpp                      | 3 ---
 reference/test/solver/lower_trs_kernels.cpp              | 6 ++----
 reference/test/solver/multigrid_kernels.cpp              | 6 +-----
 reference/test/solver/upper_trs.cpp                      | 3 ---
 reference/test/solver/upper_trs_kernels.cpp              | 6 ++----
 reference/test/stop/combined.cpp                         | 6 ++----
 reference/test/stop/criterion_kernels.cpp                | 5 +----
 reference/test/stop/iteration.cpp                        | 5 ++---
 reference/test/stop/residual_norm_kernels.cpp            | 7 +------
 reference/test/stop/time.cpp                             | 5 ++---
 reference/test/utils/assertions_test.cpp                 | 3 ---
 test/base/batch_multi_vector_kernels.cpp                 | 8 ++------
 test/base/device_matrix_data_kernels.cpp                 | 8 ++------
 test/base/executor.cpp                                   | 6 ++----
 test/base/index_range.cpp                                | 6 ++----
 test/base/kernel_launch_generic.cpp                      | 7 +------
 test/base/timer.cpp                                      | 5 +----
 test/components/absolute_array_kernels.cpp               | 4 ----
 test/components/fill_array_kernels.cpp                   | 4 ----
 test/components/format_conversion_kernels.cpp            | 3 ---
 test/components/precision_conversion_kernels.cpp         | 3 ---
 test/components/prefix_sum_kernels.cpp                   | 4 ----
 test/components/reduce_array_kernels.cpp                 | 4 ----
 test/distributed/index_map_kernels.cpp                   | 4 ----
 test/distributed/matrix_kernels.cpp                      | 4 ----
 test/distributed/partition_helper_kernels.cpp            | 2 --
 test/distributed/partition_kernels.cpp                   | 4 ----
 test/distributed/vector_kernels.cpp                      | 4 ----
 test/factorization/cholesky_kernels.cpp                  | 4 ----
 test/factorization/ic_kernels.cpp                        | 3 ---
 test/factorization/ilu_kernels.cpp                       | 3 ---
 test/factorization/lu_kernels.cpp                        | 4 ----
 test/factorization/par_ic_kernels.cpp                    | 4 ----
 test/factorization/par_ict_kernels.cpp                   | 4 ----
 test/factorization/par_ilu_kernels.cpp                   | 4 ----
 test/factorization/par_ilut_kernels.cpp                  | 4 ----
 test/log/profiler_hook.cpp                               | 3 ---
 test/matrix/batch_csr_kernels.cpp                        | 4 ----
 test/matrix/batch_dense_kernels.cpp                      | 4 ----
 test/matrix/batch_ell_kernels.cpp                        | 4 ----
 test/matrix/coo_kernels.cpp                              | 4 ----
 test/matrix/csr_kernels.cpp                              | 4 ----
 test/matrix/csr_kernels2.cpp                             | 7 +------
 test/matrix/dense_kernels.cpp                            | 4 ----
 test/matrix/diagonal_kernels.cpp                         | 4 ----
 test/matrix/ell_kernels.cpp                              | 4 ----
 test/matrix/fbcsr_kernels.cpp                            | 4 ----
 test/matrix/fft_kernels.cpp                              | 3 ---
 test/matrix/hybrid_kernels.cpp                           | 4 ----
 test/matrix/matrix.cpp                                   | 3 ---
 test/matrix/permutation_kernels.cpp                      | 3 ---
 test/matrix/scaled_permutation_kernels.cpp               | 3 ---
 test/matrix/sellp_kernels.cpp                            | 4 ----
 test/matrix/sparsity_csr_kernels.cpp                     | 4 ----
 test/mpi/matrix.cpp                                      | 4 ----
 test/mpi/multigrid/pgm.cpp                               | 4 ----
 test/mpi/partition_helpers.cpp                           | 1 -
 test/mpi/preconditioner/schwarz.cpp                      | 4 ----
 test/mpi/solver/solver.cpp                               | 3 ---
 test/mpi/vector.cpp                                      | 4 ----
 test/multigrid/fixed_coarsening_kernels.cpp              | 3 ---
 test/multigrid/pgm_kernels.cpp                           | 4 ----
 test/preconditioner/batch_jacobi_kernels.cpp             | 4 ----
 test/preconditioner/isai_kernels.cpp                     | 4 ----
 test/preconditioner/jacobi_kernels.cpp                   | 3 ---
 test/reorder/amd.cpp                                     | 3 ---
 test/reorder/mc64.cpp                                    | 2 --
 test/reorder/nested_dissection.cpp                       | 3 ---
 test/reorder/rcm.cpp                                     | 3 ---
 test/solver/batch_bicgstab_kernels.cpp                   | 4 ----
 test/solver/batch_cg_kernels.cpp                         | 4 ----
 test/solver/bicg_kernels.cpp                             | 4 ----
 test/solver/bicgstab_kernels.cpp                         | 4 ----
 test/solver/cb_gmres_kernels.cpp                         | 4 ----
 test/solver/cg_kernels.cpp                               | 4 ----
 test/solver/cgs_kernels.cpp                              | 4 ----
 test/solver/direct.cpp                                   | 3 ---
 test/solver/fcg_kernels.cpp                              | 4 ----
 test/solver/gcr_kernels.cpp                              | 4 ----
 test/solver/gmres_kernels.cpp                            | 4 ----
 test/solver/idr_kernels.cpp                              | 3 ---
 test/solver/ir_kernels.cpp                               | 4 ----
 test/solver/lower_trs_kernels.cpp                        | 3 ---
 test/solver/multigrid_kernels.cpp                        | 4 ----
 test/solver/solver.cpp                                   | 3 ---
 test/solver/upper_trs_kernels.cpp                        | 3 ---
 test/stop/combined_kernels.cpp                           | 2 --
 test/stop/criterion_kernels.cpp                          | 2 --
 test/stop/residual_norm_kernels.cpp                      | 2 --
 test/test_install/test_install.cpp                       | 5 ++---
 test/tools/resource_file_generator.cpp                   | 1 -
 test/utils/executor.hpp                                  | 7 +------
 test/utils/mpi/executor.hpp                              | 7 +------
 1132 files changed, 495 insertions(+), 3259 deletions(-)

diff --git a/accessor/accessor_helper.hpp b/accessor/accessor_helper.hpp
index 2e2eb20085c..d7c8f1513d9 100644
--- a/accessor/accessor_helper.hpp
+++ b/accessor/accessor_helper.hpp
@@ -13,7 +13,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include "index_span.hpp"
 #include "utils.hpp"
 
diff --git a/accessor/block_col_major.hpp b/accessor/block_col_major.hpp
index edd8ee15a61..6ffa7ea789b 100644
--- a/accessor/block_col_major.hpp
+++ b/accessor/block_col_major.hpp
@@ -8,7 +8,6 @@
 #include <array>
 #include <cinttypes>
 
-
 #include "accessor_helper.hpp"
 #include "range.hpp"
 #include "utils.hpp"
diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp
index 8beb2e8a8df..31d3599516d 100644
--- a/accessor/cuda_helper.hpp
+++ b/accessor/cuda_helper.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <thrust/complex.h>
 
-
 #include "block_col_major.hpp"
 #include "reduced_row_major.hpp"
 #include "row_major.hpp"
diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp
index ed1eda37775..6b76b726c10 100644
--- a/accessor/hip_helper.hpp
+++ b/accessor/hip_helper.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <thrust/complex.h>
 
-
 #include "block_col_major.hpp"
 #include "reduced_row_major.hpp"
 #include "row_major.hpp"
diff --git a/accessor/math.hpp b/accessor/math.hpp
index c2a64f66bc3..0e6cebbb992 100644
--- a/accessor/math.hpp
+++ b/accessor/math.hpp
@@ -7,7 +7,6 @@
 
 #include <type_traits>
 
-
 #include "utils.hpp"
 
 
diff --git a/accessor/range.hpp b/accessor/range.hpp
index 7667359f88b..e3e260c8781 100644
--- a/accessor/range.hpp
+++ b/accessor/range.hpp
@@ -8,7 +8,6 @@
 
 #include <utility>
 
-
 #include "utils.hpp"
 
 
diff --git a/accessor/reduced_row_major.hpp b/accessor/reduced_row_major.hpp
index dd5981fac6a..a9ed30f7b2f 100644
--- a/accessor/reduced_row_major.hpp
+++ b/accessor/reduced_row_major.hpp
@@ -12,7 +12,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include "accessor_helper.hpp"
 #include "index_span.hpp"
 #include "range.hpp"
diff --git a/accessor/reduced_row_major_reference.hpp b/accessor/reduced_row_major_reference.hpp
index 5d75146b457..34bb4c14b14 100644
--- a/accessor/reduced_row_major_reference.hpp
+++ b/accessor/reduced_row_major_reference.hpp
@@ -9,7 +9,6 @@
 #include <cmath>
 #include <type_traits>
 
-
 #include "math.hpp"
 #include "reference_helper.hpp"
 #include "utils.hpp"
diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp
index 2fce0630ba4..a3a77352f8f 100644
--- a/accessor/reference_helper.hpp
+++ b/accessor/reference_helper.hpp
@@ -9,7 +9,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include "utils.hpp"
 
 
diff --git a/accessor/row_major.hpp b/accessor/row_major.hpp
index 500138a8ad6..c18f73524bd 100644
--- a/accessor/row_major.hpp
+++ b/accessor/row_major.hpp
@@ -8,7 +8,6 @@
 #include <array>
 #include <cinttypes>
 
-
 #include "accessor_helper.hpp"
 #include "range.hpp"
 #include "utils.hpp"
diff --git a/accessor/scaled_reduced_row_major.hpp b/accessor/scaled_reduced_row_major.hpp
index f7873640262..9d9f986b0fe 100644
--- a/accessor/scaled_reduced_row_major.hpp
+++ b/accessor/scaled_reduced_row_major.hpp
@@ -11,7 +11,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include "accessor_helper.hpp"
 #include "index_span.hpp"
 #include "range.hpp"
diff --git a/accessor/scaled_reduced_row_major_reference.hpp b/accessor/scaled_reduced_row_major_reference.hpp
index 6c18ea970e0..861dbd9a9bf 100644
--- a/accessor/scaled_reduced_row_major_reference.hpp
+++ b/accessor/scaled_reduced_row_major_reference.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "math.hpp"
 #include "reference_helper.hpp"
 #include "utils.hpp"
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 0f4dc57e84e..57e0152d824 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -2,13 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/blas/blas_common.hpp"
 #include "benchmark/utils/general.hpp"
diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
index 3a9e123f2e9..c930c8ba5ef 100644
--- a/benchmark/blas/blas_common.hpp
+++ b/benchmark/blas/blas_common.hpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <chrono>
 #include <cstdlib>
@@ -13,6 +10,7 @@
 #include <iostream>
 #include <typeinfo>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/iteration_control.hpp"
diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp
index 054faf50bdc..a4be6c502c1 100644
--- a/benchmark/blas/distributed/multi_vector.cpp
+++ b/benchmark/blas/distributed/multi_vector.cpp
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 #define GKO_BENCHMARK_DISTRIBUTED
 
diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
index 5f3212e27ab..17d2ac48e47 100644
--- a/benchmark/conversion/conversion.cpp
+++ b/benchmark/conversion/conversion.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <chrono>
 #include <cstdlib>
@@ -14,6 +11,7 @@
 #include <iostream>
 #include <typeinfo>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp
index b77abc8f1e5..dc1cb3cc08a 100644
--- a/benchmark/matrix_generator/matrix_generator.cpp
+++ b/benchmark/matrix_generator/matrix_generator.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <exception>
 #include <fstream>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index 7228e56d9b4..8eb847f42f2 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -2,17 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cmath>
 #include <exception>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/runner.hpp"
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 5c7f95bfb6b..3c737d67d7b 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -2,15 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <cstdlib>
 #include <exception>
 #include <iomanip>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
index 18feedc3faa..196bae5331b 100644
--- a/benchmark/solver/distributed/solver.cpp
+++ b/benchmark/solver/distributed/solver.cpp
@@ -2,14 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <exception>
 #include <iostream>
 #include <set>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 #define GKO_BENCHMARK_DISTRIBUTED
 
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index d1049b538c7..94956cadd21 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <chrono>
 #include <cmath>
@@ -17,6 +14,7 @@
 #include <sstream>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/solver/solver_common.hpp"
 #include "benchmark/utils/general_matrix.hpp"
diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp
index f5267359068..30f3b5a80fe 100644
--- a/benchmark/sparse_blas/operations.cpp
+++ b/benchmark/sparse_blas/operations.cpp
@@ -2,14 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "benchmark/sparse_blas/operations.hpp"
+
 #include <map>
 #include <unordered_set>
 
-
 #include <gflags/gflags.h>
 
-
-#include "benchmark/sparse_blas/operations.hpp"
 #include "core/base/array_access.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/symbolic.hpp"
diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp
index 74e217b3605..900ae8037fb 100644
--- a/benchmark/sparse_blas/operations.hpp
+++ b/benchmark/sparse_blas/operations.hpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <tuple>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/json.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index becd1643f44..3897689ca11 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <exception>
 #include <functional>
@@ -15,6 +12,7 @@
 #include <random>
 #include <typeinfo>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/sparse_blas/operations.hpp"
 #include "benchmark/utils/general_matrix.hpp"
diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp
index 5d12d23857a..2c2e0f57b0e 100644
--- a/benchmark/spmv/distributed/spmv.cpp
+++ b/benchmark/spmv/distributed/spmv.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <chrono>
 #include <cstdlib>
@@ -14,6 +11,8 @@
 #include <iostream>
 #include <typeinfo>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 #define GKO_BENCHMARK_DISTRIBUTED
 
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 5eef78546e9..960921257e3 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -2,12 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/spmv/spmv_common.hpp"
 #include "benchmark/utils/formats.hpp"
diff --git a/benchmark/tools/matrix.cpp b/benchmark/tools/matrix.cpp
index c57602baa8a..8bde597797e 100644
--- a/benchmark/tools/matrix.cpp
+++ b/benchmark/tools/matrix.cpp
@@ -6,11 +6,9 @@
 #include <iostream>
 #include <vector>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 
-
 #include "core/utils/matrix_utils.hpp"
 
 
diff --git a/benchmark/tools/mtx_to_binary.cpp b/benchmark/tools/mtx_to_binary.cpp
index c9d61050a7c..f5ea82804c2 100644
--- a/benchmark/tools/mtx_to_binary.cpp
+++ b/benchmark/tools/mtx_to_binary.cpp
@@ -6,7 +6,6 @@
 #include <ios>
 #include <limits>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 
diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index a404f9151ea..4683d6086e1 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -2,16 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <memory>
 
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/sparselib_linops.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/utils/cuda_timer.cpp b/benchmark/utils/cuda_timer.cpp
index 9fb5c3889fe..02e4d0016b8 100644
--- a/benchmark/utils/cuda_timer.cpp
+++ b/benchmark/utils/cuda_timer.cpp
@@ -5,7 +5,6 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-
 #include "benchmark/utils/timer_impl.hpp"
 
 
diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp
index 7722a20dcf1..f91de85db2a 100644
--- a/benchmark/utils/dpcpp_linops.dp.cpp
+++ b/benchmark/utils/dpcpp_linops.dp.cpp
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <memory>
 
-
 #include <oneapi/mkl.hpp>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/sparselib_linops.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/utils/dpcpp_timer.dp.cpp b/benchmark/utils/dpcpp_timer.dp.cpp
index bd97593ccb8..c986f2d8fa0 100644
--- a/benchmark/utils/dpcpp_timer.dp.cpp
+++ b/benchmark/utils/dpcpp_timer.dp.cpp
@@ -4,10 +4,8 @@
 
 #include <iostream>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "benchmark/utils/timer_impl.hpp"
 
 
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index cc609c6a087..13f2cee1056 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -6,16 +6,13 @@
 #define GKO_BENCHMARK_UTILS_FORMATS_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <map>
 #include <string>
 
-
 #include <gflags/gflags.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/sparselib_linops.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index e6137ca6f28..5ae34fa00ab 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -6,9 +6,6 @@
 #define GKO_BENCHMARK_UTILS_GENERAL_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <array>
 #include <fstream>
@@ -24,13 +21,12 @@
 #include <utility>
 #include <vector>
 
-
 #include <gflags/gflags.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include <ginkgo/core/base/memory.hpp>
 
-
 #include "benchmark/utils/json.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp
index a1f448e8bab..043a09d9994 100644
--- a/benchmark/utils/general_matrix.hpp
+++ b/benchmark/utils/general_matrix.hpp
@@ -6,11 +6,9 @@
 #define GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <gflags/gflags.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/generator.hpp"
diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp
index 6b173651aa3..9ec22a33d1b 100644
--- a/benchmark/utils/generator.hpp
+++ b/benchmark/utils/generator.hpp
@@ -8,7 +8,6 @@
 
 #include <string>
 
-
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/loggers.hpp"
diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp
index f0d7edb45c3..b507a0c441b 100644
--- a/benchmark/utils/hip_linops.hip.cpp
+++ b/benchmark/utils/hip_linops.hip.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <memory>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/sparselib_linops.hpp"
 #include "benchmark/utils/types.hpp"
diff --git a/benchmark/utils/hip_timer.hip.cpp b/benchmark/utils/hip_timer.hip.cpp
index 6f114e1d66b..dfc5e8e2b25 100644
--- a/benchmark/utils/hip_timer.hip.cpp
+++ b/benchmark/utils/hip_timer.hip.cpp
@@ -4,7 +4,6 @@
 
 #include <hip/hip_runtime.h>
 
-
 #include "benchmark/utils/timer_impl.hpp"
 
 
diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp
index ff379ad1dd0..f70d0c88719 100644
--- a/benchmark/utils/iteration_control.hpp
+++ b/benchmark/utils/iteration_control.hpp
@@ -6,13 +6,11 @@
 #define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <memory>
 #include <string>
 #include <utility>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/timer.hpp"
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index 917e7dd5f3d..65d086beecb 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -6,15 +6,13 @@
 #define GKO_BENCHMARK_UTILS_LOGGERS_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <chrono>
 #include <cmath>
 #include <mutex>
 #include <regex>
 #include <unordered_map>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "core/distributed/helpers.hpp"
diff --git a/benchmark/utils/mpi_timer.cpp b/benchmark/utils/mpi_timer.cpp
index 0c4e3cff35b..6ff8510f900 100644
--- a/benchmark/utils/mpi_timer.cpp
+++ b/benchmark/utils/mpi_timer.cpp
@@ -4,7 +4,6 @@
 
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "benchmark/utils/timer_impl.hpp"
 
 
diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp
index 9a54ae8cacf..02b52d9e070 100644
--- a/benchmark/utils/overhead_linop.hpp
+++ b/benchmark/utils/overhead_linop.hpp
@@ -10,12 +10,10 @@
 #include <memory>
 #include <vector>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp
index 26dd257bd04..63fd22708e6 100644
--- a/benchmark/utils/preconditioners.hpp
+++ b/benchmark/utils/preconditioners.hpp
@@ -6,15 +6,12 @@
 #define GKO_BENCHMARK_UTILS_PRECONDITIONERS_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <map>
 #include <string>
 
-
 #include <gflags/gflags.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/overhead_linop.hpp"
diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp
index a306cd9ac29..2fd1be7874d 100644
--- a/benchmark/utils/runner.hpp
+++ b/benchmark/utils/runner.hpp
@@ -6,13 +6,11 @@
 #define GKO_BENCHMARK_UTILS_RUNNER_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <iomanip>
 #include <iostream>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/general.hpp"
 
diff --git a/benchmark/utils/sparselib_linops.hpp b/benchmark/utils/sparselib_linops.hpp
index 80f2115713f..3bdb909b03d 100644
--- a/benchmark/utils/sparselib_linops.hpp
+++ b/benchmark/utils/sparselib_linops.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 
diff --git a/benchmark/utils/timer.hpp b/benchmark/utils/timer.hpp
index 4ad9bbd12a9..27004202107 100644
--- a/benchmark/utils/timer.hpp
+++ b/benchmark/utils/timer.hpp
@@ -6,14 +6,11 @@
 #define GKO_BENCHMARK_UTILS_TIMER_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <memory>
 
-
 #include <gflags/gflags.h>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include "benchmark/utils/timer_impl.hpp"
 
diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp
index 1f5fe426df2..4f15c600b12 100644
--- a/benchmark/utils/timer_impl.hpp
+++ b/benchmark/utils/timer_impl.hpp
@@ -6,12 +6,11 @@
 #define GKO_BENCHMARK_UTILS_TIMER_IMPL_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <chrono>
 #include <memory>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 class MpiWrappedTimer;
 
diff --git a/benchmark/utils/tuning_variables.cpp b/benchmark/utils/tuning_variables.cpp
index facf6d07b9a..1fba6a52924 100644
--- a/benchmark/utils/tuning_variables.cpp
+++ b/benchmark/utils/tuning_variables.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/types.hpp>
-
-
 #include "benchmark/utils/tuning_variables.hpp"
 
+#include <ginkgo/core/base/types.hpp>
+
 
 namespace gko {
 
diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp
index 03a9ab1d70c..de7a8a0e45e 100644
--- a/benchmark/utils/types.hpp
+++ b/benchmark/utils/types.hpp
@@ -8,7 +8,6 @@
 
 #include <complex>
 
-
 #include <ginkgo/core/base/math.hpp>
 
 
diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp
index bfc93e827d4..aba569577d6 100644
--- a/cmake/openmpi_test.cpp
+++ b/cmake/openmpi_test.cpp
@@ -4,7 +4,6 @@
 
 #include <cstdio>
 
-
 #include <mpi.h>
 
 
diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp
index 02aaebc9f3d..365b308850c 100644
--- a/common/cuda_hip/base/thrust.hpp
+++ b/common/cuda_hip/base/thrust.hpp
@@ -8,7 +8,6 @@
 
 #include <thrust/execution_policy.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
diff --git a/common/unified/base/device_matrix_data_kernels.cpp b/common/unified/base/device_matrix_data_kernels.cpp
index a3f0162c3c8..d801b47fcd5 100644
--- a/common/unified/base/device_matrix_data_kernels.cpp
+++ b/common/unified/base/device_matrix_data_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/fill_array_kernels.hpp"
 
diff --git a/common/unified/base/index_set_kernels.cpp b/common/unified/base/index_set_kernels.cpp
index cb8cc72b345..86aff129f00 100644
--- a/common/unified/base/index_set_kernels.cpp
+++ b/common/unified/base/index_set_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 5ca25ecb1e3..fad327ae3b1 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/common/unified/components/absolute_array_kernels.cpp b/common/unified/components/absolute_array_kernels.cpp
index 9ed032f6a6f..c9ab364353c 100644
--- a/common/unified/components/absolute_array_kernels.cpp
+++ b/common/unified/components/absolute_array_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/absolute_array_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp
index 4586083f821..d78a6e9f346 100644
--- a/common/unified/components/fill_array_kernels.cpp
+++ b/common/unified/components/fill_array_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/fill_array_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/components/format_conversion_kernels.cpp b/common/unified/components/format_conversion_kernels.cpp
index b2e48e32a6b..0f54cb04879 100644
--- a/common/unified/components/format_conversion_kernels.cpp
+++ b/common/unified/components/format_conversion_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/components/format_conversion_kernels.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/fill_array_kernels.hpp"
 
diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp
index 10f051c3a75..0402d9bef68 100644
--- a/common/unified/components/precision_conversion_kernels.cpp
+++ b/common/unified/components/precision_conversion_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/precision_conversion_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp
index 7bf1974ccbd..bc8da6fa311 100644
--- a/common/unified/components/reduce_array_kernels.cpp
+++ b/common/unified/components/reduce_array_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/components/reduce_array_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 
diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp
index ede7bd8be27..3a53157e721 100644
--- a/common/unified/distributed/partition_helpers_kernels.cpp
+++ b/common/unified/distributed/partition_helpers_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_helpers_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/base/array_access.hpp"
diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp
index b76a4c690e4..8d6f23101ee 100644
--- a/common/unified/distributed/partition_kernels.cpp
+++ b/common/unified/distributed/partition_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/base/array_access.hpp"
diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp
index 71277937e20..ce13d7500ab 100644
--- a/common/unified/matrix/coo_kernels.cpp
+++ b/common/unified/matrix/coo_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp
index 761aefebb82..5236c1c9da9 100644
--- a/common/unified/matrix/csr_kernels.cpp
+++ b/common/unified/matrix/csr_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp
index 05966ede9d3..f5b3cc03059 100644
--- a/common/unified/matrix/dense_kernels.template.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -4,11 +4,9 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/base/array_access.hpp"
diff --git a/common/unified/matrix/diagonal_kernels.cpp b/common/unified/matrix/diagonal_kernels.cpp
index d1b2dcdb086..dae037a5134 100644
--- a/common/unified/matrix/diagonal_kernels.cpp
+++ b/common/unified/matrix/diagonal_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp
index 64b0d093591..6d23e08b68b 100644
--- a/common/unified/matrix/ell_kernels.cpp
+++ b/common/unified/matrix/ell_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/base/array_access.hpp"
diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp
index 25338bd0b12..8a21a2415f7 100644
--- a/common/unified/matrix/hybrid_kernels.cpp
+++ b/common/unified/matrix/hybrid_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/hybrid_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp
index 7a6b882c754..a1ba9ab54ad 100644
--- a/common/unified/matrix/permutation_kernels.cpp
+++ b/common/unified/matrix/permutation_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/permutation_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp
index d658f00ca4b..3eaab65e8e6 100644
--- a/common/unified/matrix/scaled_permutation_kernels.cpp
+++ b/common/unified/matrix/scaled_permutation_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/scaled_permutation_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp
index af8e463f78c..93b71ff43f2 100644
--- a/common/unified/matrix/sellp_kernels.cpp
+++ b/common/unified/matrix/sellp_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp b/common/unified/matrix/sparsity_csr_kernels.cpp
index 8e54a14becb..c5a9c79a89b 100644
--- a/common/unified/matrix/sparsity_csr_kernels.cpp
+++ b/common/unified/matrix/sparsity_csr_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 3c163996565..9ba144cba2e 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/base/array_access.hpp"
diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp
index bbc393ddab2..b8c19c24f79 100644
--- a/common/unified/preconditioner/jacobi_kernels.cpp
+++ b/common/unified/preconditioner/jacobi_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp
index 60738c5618c..7d15718c05d 100644
--- a/common/unified/solver/bicg_kernels.cpp
+++ b/common/unified/solver/bicg_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/bicg_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp
index 58a6148c138..b696815f0d4 100644
--- a/common/unified/solver/bicgstab_kernels.cpp
+++ b/common/unified/solver/bicgstab_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/bicgstab_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp
index 37f8c885987..822dddf1c3b 100644
--- a/common/unified/solver/cg_kernels.cpp
+++ b/common/unified/solver/cg_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/cg_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp
index a20a3faf3c8..0618b8f8208 100644
--- a/common/unified/solver/cgs_kernels.cpp
+++ b/common/unified/solver/cgs_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/cgs_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp
index 8773cce4e66..0e6ba18bb64 100644
--- a/common/unified/solver/common_gmres_kernels.cpp
+++ b/common/unified/solver/common_gmres_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/common_gmres_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/solver/cb_gmres_kernels.hpp"
 
diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp
index cbe23526c09..7853d97c358 100644
--- a/common/unified/solver/fcg_kernels.cpp
+++ b/common/unified/solver/fcg_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/fcg_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp
index 57422ce9954..0c9e825228a 100644
--- a/common/unified/solver/gcr_kernels.cpp
+++ b/common/unified/solver/gcr_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/gcr_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_solver.hpp"
 
 
diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp
index 5c0cd52bdcf..3997963f8d7 100644
--- a/common/unified/solver/gmres_kernels.cpp
+++ b/common/unified/solver/gmres_kernels.cpp
@@ -4,11 +4,9 @@
 
 #include "core/solver/gmres_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/common/unified/solver/ir_kernels.cpp b/common/unified/solver/ir_kernels.cpp
index 1e95206557e..96f0731f039 100644
--- a/common/unified/solver/ir_kernels.cpp
+++ b/common/unified/solver/ir_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/ir_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/core/base/allocator.hpp b/core/base/allocator.hpp
index 0bfbd0158c3..e45c4aef813 100644
--- a/core/base/allocator.hpp
+++ b/core/base/allocator.hpp
@@ -15,7 +15,6 @@
 #include <unordered_set>
 #include <vector>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/core/base/array.cpp b/core/base/array.cpp
index f529e3cf9d2..a41f7c07e55 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/base/array.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/precision_conversion_kernels.hpp"
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index 960158654f2..f4485377f25 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/base/batch_multi_vector.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "core/base/batch_multi_vector_kernels.hpp"
 
 
diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp
index 9a3618c06fb..45a9bf7e2d3 100644
--- a/core/base/batch_multi_vector_kernels.hpp
+++ b/core/base/batch_multi_vector_kernels.hpp
@@ -7,12 +7,9 @@
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
-
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index 877ed926101..6964eeee544 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp
index 43ac79c3c0e..f53375301a8 100644
--- a/core/base/block_operator.cpp
+++ b/core/base/block_operator.cpp
@@ -4,14 +4,11 @@
 
 #include "ginkgo/core/base/block_operator.hpp"
 
-
 #include <utility>
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 
 
diff --git a/core/base/combination.cpp b/core/base/combination.cpp
index 324fa8d4ddf..3b30b77d38c 100644
--- a/core/base/combination.cpp
+++ b/core/base/combination.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/combination.hpp"
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index 515fb425633..82c8152300b 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -4,15 +4,12 @@
 
 #include "ginkgo/core/base/composition.hpp"
 
-
 #include <algorithm>
 #include <iterator>
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 
 
diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp
index 50e1abc3977..6adbb6107c9 100644
--- a/core/base/dense_cache.cpp
+++ b/core/base/dense_cache.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/dense_cache.hpp"
 
-
 #include <ginkgo/core/matrix/dense.hpp>
 
 
diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index 085054cbd69..a2e5d6e7044 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/base/device_matrix_data.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 
-
 #include "core/base/device_matrix_data_kernels.hpp"
 
 
diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp
index 2c7d2a81225..bcaeebdf0cb 100644
--- a/core/base/device_matrix_data_kernels.hpp
+++ b/core/base/device_matrix_data_kernels.hpp
@@ -6,16 +6,12 @@
 #define GKO_CORE_BASE_DEVICE_MATRIX_DATA_KERNELS_HPP_
 
 
-#include <ginkgo/core/base/device_matrix_data.hpp>
-
-
 #include <memory>
 
-
+#include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp
index 36c664d80ff..169b907775b 100644
--- a/core/base/dispatch_helper.hpp
+++ b/core/base/dispatch_helper.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/core/base/executor.cpp b/core/base/executor.cpp
index 1fb1703c56f..65019efe94c 100644
--- a/core/base/executor.cpp
+++ b/core/base/executor.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp
index e2d4e01a7d4..2dc60afd329 100644
--- a/core/base/extended_float.hpp
+++ b/core/base/extended_float.hpp
@@ -9,7 +9,6 @@
 #include <limits>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/core/base/index_range.hpp b/core/base/index_range.hpp
index 2330cc5b43c..ca972363b4a 100644
--- a/core/base/index_range.hpp
+++ b/core/base/index_range.hpp
@@ -10,7 +10,6 @@
 #include <iterator>
 #include <type_traits>
 
-
 #include "core/base/iterator_range.hpp"
 
 
diff --git a/core/base/index_set.cpp b/core/base/index_set.cpp
index b27d3803448..715916aa37a 100644
--- a/core/base/index_set.cpp
+++ b/core/base/index_set.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/base/index_set.hpp"
 
-
 #include <algorithm>
 #include <iostream>
 #include <mutex>
 #include <vector>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/index_set_kernels.hpp"
 
diff --git a/core/base/index_set_kernels.hpp b/core/base/index_set_kernels.hpp
index 63170d0e853..7e742ea062f 100644
--- a/core/base/index_set_kernels.hpp
+++ b/core/base/index_set_kernels.hpp
@@ -6,11 +6,9 @@
 #define GKO_CORE_BASE_INDEX_SET_KERNELS_HPP_
 
 
-#include <ginkgo/core/base/index_set.hpp>
-
-
 #include <memory>
 
+#include <ginkgo/core/base/index_set.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index 366f1b3bf60..3d224836b1a 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -13,7 +13,6 @@
 #include <tuple>
 #include <utility>
 
-
 #include "core/base/copy_assignable.hpp"
 
 
diff --git a/core/base/memory.cpp b/core/base/memory.cpp
index 0b3e0ce833b..a9a07a74e6a 100644
--- a/core/base/memory.cpp
+++ b/core/base/memory.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/base/memory.hpp"
 
-
 #include <new>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/core/base/mpi.cpp b/core/base/mpi.cpp
index 0a703675158..652ef8662e5 100644
--- a/core/base/mpi.cpp
+++ b/core/base/mpi.cpp
@@ -10,7 +10,6 @@
 
 #include <string>
 
-
 #include <mpi.h>
 
 
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index e2f2dbf5d9b..c264a073f31 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/mtx_io.hpp"
 
-
 #include <algorithm>
 #include <cctype>
 #include <cstring>
@@ -14,7 +13,6 @@
 #include <string>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index 94a4975cfa0..686c54e5b2d 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/perturbation.hpp"
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp
index cc31abb7686..4a88d42128f 100644
--- a/core/base/segmented_array.cpp
+++ b/core/base/segmented_array.cpp
@@ -4,7 +4,6 @@
 
 #include <ginkgo/core/base/segmented_array.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
diff --git a/core/base/timer.cpp b/core/base/timer.cpp
index abd5fbf61cd..9050f00fa17 100644
--- a/core/base/timer.cpp
+++ b/core/base/timer.cpp
@@ -4,16 +4,13 @@
 
 #include "ginkgo/core/base/timer.hpp"
 
-
 #include <chrono>
 #include <memory>
 #include <utility>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "cuda/base/device.hpp"
 #include "dpcpp/base/device.hpp"
 #include "hip/base/device.hpp"
diff --git a/core/base/utils.hpp b/core/base/utils.hpp
index fee0a300c16..061c6e303ed 100644
--- a/core/base/utils.hpp
+++ b/core/base/utils.hpp
@@ -6,15 +6,12 @@
 #define GKO_CORE_BASE_UTILS_HPP_
 
 
-#include <ginkgo/core/base/utils.hpp>
-
-
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/polymorphic_object.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
diff --git a/core/base/workspace_aliases.hpp b/core/base/workspace_aliases.hpp
index af1391300f4..ddea34a71e9 100644
--- a/core/base/workspace_aliases.hpp
+++ b/core/base/workspace_aliases.hpp
@@ -11,7 +11,6 @@
 #include <algorithm>
 #include <cstdlib>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp
index 18f32ddc3e6..7617883cd1c 100644
--- a/core/components/absolute_array_kernels.hpp
+++ b/core/components/absolute_array_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/components/addressable_pq.hpp b/core/components/addressable_pq.hpp
index d4e1f20fc80..e5b5a3e0fbe 100644
--- a/core/components/addressable_pq.hpp
+++ b/core/components/addressable_pq.hpp
@@ -9,11 +9,9 @@
 #include <algorithm>
 #include <vector>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp
index 4a6d8c6a3d1..2608cabe409 100644
--- a/core/components/fill_array_kernels.hpp
+++ b/core/components/fill_array_kernels.hpp
@@ -8,11 +8,9 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp
index 2a72da998f9..10be3a10232 100644
--- a/core/components/format_conversion_kernels.hpp
+++ b/core/components/format_conversion_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp
index 0839530a92c..8443a657502 100644
--- a/core/components/precision_conversion_kernels.hpp
+++ b/core/components/precision_conversion_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp
index e43e2cf042c..8b68b54e29f 100644
--- a/core/components/prefix_sum_kernels.hpp
+++ b/core/components/prefix_sum_kernels.hpp
@@ -8,11 +8,9 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp
index 2d02906ebdc..b124e6ec2e3 100644
--- a/core/components/reduce_array_kernels.hpp
+++ b/core/components/reduce_array_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/config/config.cpp b/core/config/config.cpp
index 87dd49b6c03..adb47e5ef75 100644
--- a/core/config/config.cpp
+++ b/core/config/config.cpp
@@ -4,14 +4,11 @@
 
 #include "ginkgo/core/config/config.hpp"
 
-
 #include <map>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 
diff --git a/core/config/config_helper.cpp b/core/config/config_helper.cpp
index c12143ff8d6..30b33063413 100644
--- a/core/config/config_helper.cpp
+++ b/core/config/config_helper.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <type_traits>
+#include "core/config/config_helper.hpp"
 
+#include <type_traits>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
-#include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/config/stop_config.hpp"
 
diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
index 0866cf9695a..f84e6799bf7 100644
--- a/core/config/config_helper.hpp
+++ b/core/config/config_helper.hpp
@@ -9,7 +9,6 @@
 #include <string>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -18,7 +17,6 @@
 #include <ginkgo/core/solver/solver_base.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 
-
 #include "core/config/registry_accessor.hpp"
 
 
diff --git a/core/config/dispatch.hpp b/core/config/dispatch.hpp
index 5bf5dc3273e..0138665aac2 100644
--- a/core/config/dispatch.hpp
+++ b/core/config/dispatch.hpp
@@ -9,7 +9,6 @@
 #include <complex>
 #include <string>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
@@ -17,7 +16,6 @@
 #include <ginkgo/core/solver/solver_base.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/type_descriptor_helper.hpp"
 
diff --git a/core/config/factorization_config.cpp b/core/config/factorization_config.cpp
index df6439d1297..259d32cb872 100644
--- a/core/config/factorization_config.cpp
+++ b/core/config/factorization_config.cpp
@@ -14,7 +14,6 @@
 #include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/factorization/par_ilut.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 #include "core/config/parse_macro.hpp"
diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp
index 553e6ca033d..83be1a1742b 100644
--- a/core/config/multigrid_config.cpp
+++ b/core/config/multigrid_config.cpp
@@ -2,10 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "ginkgo/core/multigrid/pgm.hpp"
-
-
 #include "core/config/parse_macro.hpp"
+#include "ginkgo/core/multigrid/pgm.hpp"
 
 
 namespace gko {
diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp
index cbc9438fbb7..800b42f9493 100644
--- a/core/config/parse_macro.hpp
+++ b/core/config/parse_macro.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 #include "core/config/type_descriptor_helper.hpp"
diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index e5f8ee94ea6..cba54cb3356 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -13,7 +13,6 @@
 #include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 #include "core/config/parse_macro.hpp"
diff --git a/core/config/property_tree.cpp b/core/config/property_tree.cpp
index 1ab33712953..3f6826bf634 100644
--- a/core/config/property_tree.cpp
+++ b/core/config/property_tree.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/config/property_tree.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/core/config/registry.cpp b/core/config/registry.cpp
index 8b8bdbcaf0d..1718de5fed2 100644
--- a/core/config/registry.cpp
+++ b/core/config/registry.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/config/registry.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 
-
 #include "core/config/config_helper.hpp"
 
 
diff --git a/core/config/registry_accessor.hpp b/core/config/registry_accessor.hpp
index 002e6245811..5b007632f0c 100644
--- a/core/config/registry_accessor.hpp
+++ b/core/config/registry_accessor.hpp
@@ -8,7 +8,6 @@
 
 #include <string>
 
-
 #include <ginkgo/core/config/registry.hpp>
 
 
diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp
index 27c06f7f895..b35a639b8e7 100644
--- a/core/config/solver_config.cpp
+++ b/core/config/solver_config.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/config/solver_config.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
@@ -19,11 +21,9 @@
 #include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 #include "core/config/parse_macro.hpp"
-#include "core/config/solver_config.hpp"
 
 
 namespace gko {
diff --git a/core/config/solver_config.hpp b/core/config/solver_config.hpp
index 3c820541f2c..e5f51ff85f4 100644
--- a/core/config/solver_config.hpp
+++ b/core/config/solver_config.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 
diff --git a/core/config/stop_config.cpp b/core/config/stop_config.cpp
index 2270cf5f84c..4623eb768fc 100644
--- a/core/config/stop_config.cpp
+++ b/core/config/stop_config.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/config/stop_config.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
@@ -12,11 +14,9 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 #include "core/config/registry_accessor.hpp"
-#include "core/config/stop_config.hpp"
 #include "core/config/type_descriptor_helper.hpp"
 
 
diff --git a/core/config/trisolver_config.hpp b/core/config/trisolver_config.hpp
index 301109cab6b..8d7f6fb680d 100644
--- a/core/config/trisolver_config.hpp
+++ b/core/config/trisolver_config.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 
diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index cbc29c5088a..93ec1d3f929 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/config/type_descriptor.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/config/type_descriptor_helper.hpp"
 
 
diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp
index 3917e317773..0edc4376f1a 100644
--- a/core/config/type_descriptor_helper.hpp
+++ b/core/config/type_descriptor_helper.hpp
@@ -8,7 +8,6 @@
 
 #include <string>
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/property_tree.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index f26b221a799..f5dc92ce16e 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -5,7 +5,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/index_set_kernels.hpp"
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index 15bb5d05735..abda9e4e0f6 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -4,7 +4,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/memory.hpp>
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 4be09fc60a2..6cd86581998 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -5,7 +5,6 @@
 #include <memory>
 #include <string>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/memory.hpp>
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index a90691e1af4..573fb37b8f0 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -5,7 +5,6 @@
 #include <memory>
 #include <string>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/memory.hpp>
diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp
index 9edf8282ed9..5536dbe32f0 100644
--- a/core/distributed/helpers.hpp
+++ b/core/distributed/helpers.hpp
@@ -8,13 +8,11 @@
 
 #include <memory>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 
 
diff --git a/core/distributed/index_map.cpp b/core/distributed/index_map.cpp
index e24d8141b4d..9f0ed8137ba 100644
--- a/core/distributed/index_map.cpp
+++ b/core/distributed/index_map.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/distributed/index_map.hpp"
 
-
 #include "core/distributed/index_map_kernels.hpp"
 
 
diff --git a/core/distributed/index_map_kernels.hpp b/core/distributed/index_map_kernels.hpp
index c808a4b8d19..4694ba6cc10 100644
--- a/core/distributed/index_map_kernels.hpp
+++ b/core/distributed/index_map_kernels.hpp
@@ -6,14 +6,11 @@
 #define GKO_CORE_DISTRIBUTED_INDEX_MAP_KERNELS_HPP_
 
 
-#include <ginkgo/core/distributed/index_map.hpp>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/segmented_array.hpp>
+#include <ginkgo/core/distributed/index_map.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 #include "core/base/segmented_array.hpp"
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 2d2d1304769..8eee020a3e6 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,13 +4,11 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/distributed/matrix_kernels.hpp"
 
 
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index a424c49c442..f24e8c9945e 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp
index 5e6903de872..763986f3a86 100644
--- a/core/distributed/partition.cpp
+++ b/core/distributed/partition.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/distributed/partition.hpp"
 
-
 #include "core/base/array_access.hpp"
 #include "core/distributed/partition_kernels.hpp"
 
diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp
index 1a55daf8134..75c3d14f971 100644
--- a/core/distributed/partition_helpers.cpp
+++ b/core/distributed/partition_helpers.cpp
@@ -4,13 +4,10 @@
 
 #include "ginkgo/core/distributed/partition_helpers.hpp"
 
-
 #include <numeric>
 
-
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
 
diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp
index 9c87629b60c..0262c3007ee 100644
--- a/core/distributed/partition_helpers_kernels.hpp
+++ b/core/distributed/partition_helpers_kernels.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp
index fd7e214dbbd..b1df933e5c8 100644
--- a/core/distributed/partition_kernels.hpp
+++ b/core/distributed/partition_kernels.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 2def0a0f85c..7235038847d 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/distributed/preconditioner/schwarz.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/distributed/helpers.hpp"
 
diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp
index 2e57fcf7451..ae7ab182a85 100644
--- a/core/distributed/vector.cpp
+++ b/core/distributed/vector.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/distributed/vector.hpp"
 
-
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/distributed/vector_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 
diff --git a/core/distributed/vector_kernels.hpp b/core/distributed/vector_kernels.hpp
index 1d5fcb6a51e..c288b8918a1 100644
--- a/core/distributed/vector_kernels.hpp
+++ b/core/distributed/vector_kernels.hpp
@@ -13,7 +13,6 @@
 #include <ginkgo/core/distributed/partition.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index 12456df4abc..81627ad229b 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -4,14 +4,12 @@
 
 #include "ginkgo/core/factorization/cholesky.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp
index ff758d988db..db889ce1162 100644
--- a/core/factorization/cholesky_kernels.hpp
+++ b/core/factorization/cholesky_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 #include "core/factorization/elimination_forest.hpp"
 
diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp
index 138db0f6350..1dc8ff060a0 100644
--- a/core/factorization/elimination_forest.cpp
+++ b/core/factorization/elimination_forest.cpp
@@ -4,7 +4,6 @@
 
 #include "core/factorization/elimination_forest.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp
index 7ab7f9c715b..5307a90384c 100644
--- a/core/factorization/elimination_forest.hpp
+++ b/core/factorization/elimination_forest.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/disjoint_sets.hpp"
 
 
diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 597fc7b48f4..1df1f49aa13 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/factorization/factorization.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/factorization/factorization_kernels.hpp"
 
diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp
index 0dc0f9fc16e..bab3dd16bd2 100644
--- a/core/factorization/factorization_kernels.hpp
+++ b/core/factorization/factorization_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 67fb3df5b46..2257e6256e4 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/factorization/ic.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/config/config_helper.hpp"
 #include "core/factorization/factorization_kernels.hpp"
diff --git a/core/factorization/ic_kernels.hpp b/core/factorization/ic_kernels.hpp
index 29d400ad183..187e6cf0e6d 100644
--- a/core/factorization/ic_kernels.hpp
+++ b/core/factorization/ic_kernels.hpp
@@ -6,16 +6,12 @@
 #define GKO_CORE_FACTORIZATION_IC_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/ic.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 15f3cef1831..41df4065979 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -4,16 +4,13 @@
 
 #include "ginkgo/core/factorization/ilu.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/config/config_helper.hpp"
 #include "core/factorization/factorization_kernels.hpp"
diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp
index 562d6e0901d..2371c17fda4 100644
--- a/core/factorization/ilu_kernels.hpp
+++ b/core/factorization/ilu_kernels.hpp
@@ -6,17 +6,13 @@
 #define GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/ilu.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/ilu.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index 8ab1ddfc37f..fb9cab4154a 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -4,14 +4,12 @@
 
 #include "ginkgo/core/factorization/lu.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp
index 601f424087f..f497398cb90 100644
--- a/core/factorization/lu_kernels.hpp
+++ b/core/factorization/lu_kernels.hpp
@@ -8,12 +8,10 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp
index c21f66934aa..f4a4afd23d6 100644
--- a/core/factorization/par_ic.cpp
+++ b/core/factorization/par_ic.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/factorization/par_ic.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp
index 47cfc8c37a2..59d2d97ffce 100644
--- a/core/factorization/par_ic_kernels.hpp
+++ b/core/factorization/par_ic_kernels.hpp
@@ -6,18 +6,14 @@
 #define GKO_CORE_FACTORIZATION_PAR_IC_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/par_ic.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
index 54176d79545..a0e8a628ca8 100644
--- a/core/factorization/par_ict.cpp
+++ b/core/factorization/par_ict.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/factorization/par_ict.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp
index 0b0c2c6bcd2..25172c0d649 100644
--- a/core/factorization/par_ict_kernels.hpp
+++ b/core/factorization/par_ict_kernels.hpp
@@ -6,18 +6,14 @@
 #define GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/par_ict.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/par_ict.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index f69947adcac..68c0c0c4fc6 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/factorization/par_ilu.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/config/config_helper.hpp"
 #include "core/factorization/factorization_kernels.hpp"
diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp
index 943cd1cf9bc..16d20859c3e 100644
--- a/core/factorization/par_ilu_kernels.hpp
+++ b/core/factorization/par_ilu_kernels.hpp
@@ -6,17 +6,13 @@
 #define GKO_CORE_FACTORIZATION_PAR_ILU_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/par_ilu.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
index ff4b5b2a83e..42e3cc03130 100644
--- a/core/factorization/par_ilut.cpp
+++ b/core/factorization/par_ilut.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/factorization/par_ilut.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp
index 880acc7a4c2..2d8ac7b4f88 100644
--- a/core/factorization/par_ilut_kernels.hpp
+++ b/core/factorization/par_ilut_kernels.hpp
@@ -6,18 +6,14 @@
 #define GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_
 
 
-#include <ginkgo/core/factorization/par_ilut.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/par_ilut.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp
index c55bfb3e759..23f6b94cc14 100644
--- a/core/factorization/symbolic.cpp
+++ b/core/factorization/symbolic.cpp
@@ -4,14 +4,12 @@
 
 #include "core/factorization/symbolic.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp
index c98c623c92d..096d8c998bc 100644
--- a/core/factorization/symbolic.hpp
+++ b/core/factorization/symbolic.hpp
@@ -4,7 +4,6 @@
 
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/factorization/elimination_forest.hpp"
 
 
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index 532cae64c28..286803c0ae1 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/log/batch_logger.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/workspace_aliases.hpp"
 
 
diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
index 16c89e08ffc..7cfa764dfd1 100644
--- a/core/log/convergence.cpp
+++ b/core/log/convergence.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/log/convergence.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 #include "core/distributed/helpers.hpp"
 
diff --git a/core/log/papi.cpp b/core/log/papi.cpp
index 83a9bd3b93c..5ced377ca38 100644
--- a/core/log/papi.cpp
+++ b/core/log/papi.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/log/papi.hpp"
 
-
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/distributed/helpers.hpp"
 
 
diff --git a/core/log/performance_hint.cpp b/core/log/performance_hint.cpp
index 3b0a720aa93..5f497c33fe0 100644
--- a/core/log/performance_hint.cpp
+++ b/core/log/performance_hint.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/log/performance_hint.hpp"
 
-
 #include <iomanip>
 #include <sstream>
 
diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp
index 87ea8f42d02..7cb4f807919 100644
--- a/core/log/profiler_hook.cpp
+++ b/core/log/profiler_hook.cpp
@@ -4,18 +4,15 @@
 
 #include "ginkgo/core/log/profiler_hook.hpp"
 
-
 #include <memory>
 #include <mutex>
 #include <sstream>
 
-
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 
-
 #include "core/log/profiler_hook.hpp"
 
 
diff --git a/core/log/profiler_hook_summary.cpp b/core/log/profiler_hook_summary.cpp
index c7d2f3ea95b..02641bdfd29 100644
--- a/core/log/profiler_hook_summary.cpp
+++ b/core/log/profiler_hook_summary.cpp
@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <vector>
 
-
 #include "core/log/profiler_hook.hpp"
 
 
diff --git a/core/log/profiler_hook_summary_writer.cpp b/core/log/profiler_hook_summary_writer.cpp
index 4139f5938c9..dd39687ffe4 100644
--- a/core/log/profiler_hook_summary_writer.cpp
+++ b/core/log/profiler_hook_summary_writer.cpp
@@ -5,7 +5,6 @@
 #include <iomanip>
 #include <numeric>
 
-
 #include <ginkgo/core/log/profiler_hook.hpp>
 
 
diff --git a/core/log/record.cpp b/core/log/record.cpp
index 6d995cd348c..0d810c05fa0 100644
--- a/core/log/record.cpp
+++ b/core/log/record.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/log/record.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
diff --git a/core/log/stream.cpp b/core/log/stream.cpp
index 033575c9b54..5e510d409e2 100644
--- a/core/log/stream.cpp
+++ b/core/log/stream.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/log/stream.hpp"
 
-
 #include <iomanip>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp
index 8e4b1434f8e..1b1dc22a6c4 100644
--- a/core/matrix/batch_csr.cpp
+++ b/core/matrix/batch_csr.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/batch_csr.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/matrix/batch_csr_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
diff --git a/core/matrix/batch_csr_kernels.hpp b/core/matrix/batch_csr_kernels.hpp
index d2066389bba..2ee4e2100a2 100644
--- a/core/matrix/batch_csr_kernels.hpp
+++ b/core/matrix/batch_csr_kernels.hpp
@@ -6,14 +6,11 @@
 #define GKO_CORE_MATRIX_BATCH_CSR_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
-
+#include <ginkgo/core/matrix/batch_csr.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index a2eb017cf7c..6390a4c7ad0 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/batch_dense.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/matrix/batch_dense_kernels.hpp"
 
 
diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp
index 6c5f4a02242..13b88f9f4b2 100644
--- a/core/matrix/batch_dense_kernels.hpp
+++ b/core/matrix/batch_dense_kernels.hpp
@@ -6,12 +6,9 @@
 #define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
-
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 5c3da632643..3722c41de60 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/batch_ell.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
-
 #include "core/matrix/batch_ell_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp
index c5d7f8cb857..77707c89a8b 100644
--- a/core/matrix/batch_ell_kernels.hpp
+++ b/core/matrix/batch_ell_kernels.hpp
@@ -6,13 +6,10 @@
 #define GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
-
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp
index 480f0a10474..2220120d00b 100644
--- a/core/matrix/batch_identity.cpp
+++ b/core/matrix/batch_identity.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/batch_identity.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_dim.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index eb8b33c0cf1..1368dc261c3 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/coo.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/absolute_array_kernels.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp
index eece195ab74..a2cc44b74d9 100644
--- a/core/matrix/coo_kernels.hpp
+++ b/core/matrix/coo_kernels.hpp
@@ -6,15 +6,12 @@
 #define GKO_CORE_MATRIX_COO_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/coo.hpp>
-
-
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 8dad86568fb..e50732a3be9 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/matrix/csr.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -22,7 +21,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/absolute_array_kernels.hpp"
diff --git a/core/matrix/csr_accessor_helper.hpp b/core/matrix/csr_accessor_helper.hpp
index bfbca1d5cb5..2187439920e 100644
--- a/core/matrix/csr_accessor_helper.hpp
+++ b/core/matrix/csr_accessor_helper.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/index_span.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/utils.hpp"
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 0cebe435e4f..6013e014c8a 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -6,13 +6,11 @@
 #define GKO_CORE_MATRIX_CSR_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/csr.hpp>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
@@ -20,7 +18,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 #include "core/matrix/csr_lookup.hpp"
 
diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp
index 6de3265ff21..a7b687c3618 100644
--- a/core/matrix/csr_lookup.hpp
+++ b/core/matrix/csr_lookup.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/intrinsics.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index eb52c574db9..171ff007b4a 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/dense.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -28,7 +26,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/dispatch_helper.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 63999b12f82..7422b431aa0 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -6,17 +6,13 @@
 #define GKO_CORE_MATRIX_DENSE_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/dense.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp
index 08b1e00e340..1a442ffc789 100644
--- a/core/matrix/diagonal.cpp
+++ b/core/matrix/diagonal.cpp
@@ -4,13 +4,11 @@
 
 #include "ginkgo/core/matrix/diagonal.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/absolute_array_kernels.hpp"
 #include "core/matrix/diagonal_kernels.hpp"
 
diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp
index 930144491f4..630c76e43ad 100644
--- a/core/matrix/diagonal_kernels.hpp
+++ b/core/matrix/diagonal_kernels.hpp
@@ -6,13 +6,10 @@
 #define GKO_CORE_MATRIX_DIAGONAL_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/diagonal.hpp>
-
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index f6433fe156a..87b74c7f417 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/matrix/ell.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
index f31164702d9..7b88507d650 100644
--- a/core/matrix/ell_kernels.hpp
+++ b/core/matrix/ell_kernels.hpp
@@ -6,13 +6,10 @@
 #define GKO_CORE_MATRIX_ELL_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/ell.hpp>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
-
+#include <ginkgo/core/matrix/ell.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 1ea00d741bd..a48e32be088 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/matrix/fbcsr.hpp"
 
-
 #include <limits>
 #include <map>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -22,7 +20,6 @@
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/components/absolute_array_kernels.hpp"
diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp
index c5f1ee6fb03..7a644d48d78 100644
--- a/core/matrix/fbcsr_kernels.hpp
+++ b/core/matrix/fbcsr_kernels.hpp
@@ -6,17 +6,14 @@
 #define GKO_CORE_MATRIX_FBCSR_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/matrix/fft.cpp b/core/matrix/fft.cpp
index 1ec69ce3338..cd6f20c1edc 100644
--- a/core/matrix/fft.cpp
+++ b/core/matrix/fft.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/matrix/fft.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/matrix/fft_kernels.hpp"
 
 
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index c30c60ce0fb..d450a0dfc35 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/matrix/hybrid.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -16,7 +14,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/absolute_array_kernels.hpp"
diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp
index 9460a521c90..85ff74bfab5 100644
--- a/core/matrix/hybrid_kernels.hpp
+++ b/core/matrix/hybrid_kernels.hpp
@@ -6,11 +6,8 @@
 #define GKO_CORE_MATRIX_HYBRID_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/hybrid.hpp>
-
-
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/hybrid.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp
index a58601f31f0..7e035be82a3 100644
--- a/core/matrix/identity.cpp
+++ b/core/matrix/identity.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/matrix/identity.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/utils.hpp>
diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp
index 76f5d7c8005..0fe7ba2b2ce 100644
--- a/core/matrix/permutation.cpp
+++ b/core/matrix/permutation.cpp
@@ -4,14 +4,12 @@
 
 #include "ginkgo/core/matrix/permutation.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 #include "core/matrix/permutation_kernels.hpp"
 
diff --git a/core/matrix/permutation.hpp b/core/matrix/permutation.hpp
index 6e674f3db79..6ae375d63cf 100644
--- a/core/matrix/permutation.hpp
+++ b/core/matrix/permutation.hpp
@@ -6,10 +6,8 @@
 #define GKO_CORE_MATRIX_PERMUTATION_HPP_
 
 
-#include <ginkgo/core/matrix/permutation.hpp>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
 
 
 namespace gko {
diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp
index 65b207f5fdf..9a1d269d610 100644
--- a/core/matrix/permutation_kernels.hpp
+++ b/core/matrix/permutation_kernels.hpp
@@ -18,7 +18,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 #include "core/matrix/csr_lookup.hpp"
 
diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp
index 72a6cbe2808..fecc60a0ca9 100644
--- a/core/matrix/row_gatherer.cpp
+++ b/core/matrix/row_gatherer.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/matrix/row_gatherer.hpp"
 
-
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 
 
diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp
index c948c6071ad..0f295d6b5be 100644
--- a/core/matrix/scaled_permutation.cpp
+++ b/core/matrix/scaled_permutation.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/matrix/scaled_permutation.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 
-
 #include "core/matrix/scaled_permutation_kernels.hpp"
 
 
diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp
index 1cc664002a3..696c53a387f 100644
--- a/core/matrix/scaled_permutation_kernels.hpp
+++ b/core/matrix/scaled_permutation_kernels.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 39e2c706b19..a4787e758bf 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/matrix/sellp.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp
index ce5cea36a84..fb4706039fb 100644
--- a/core/matrix/sellp_kernels.hpp
+++ b/core/matrix/sellp_kernels.hpp
@@ -6,13 +6,10 @@
 #define GKO_CORE_MATRIX_SELLP_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/sellp.hpp>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
-
+#include <ginkgo/core/matrix/sellp.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 2ec463613b0..9b8ea04da52 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/matrix/sparsity_csr.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp
index 869f43e1a7c..e07bb980dce 100644
--- a/core/matrix/sparsity_csr_kernels.hpp
+++ b/core/matrix/sparsity_csr_kernels.hpp
@@ -6,12 +6,9 @@
 #define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_
 
 
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/mpi/exception.cpp b/core/mpi/exception.cpp
index 8ffd6b0f133..d4d66012c14 100644
--- a/core/mpi/exception.cpp
+++ b/core/mpi/exception.cpp
@@ -5,10 +5,8 @@
 #include <array>
 #include <string>
 
-
 #include <mpi.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 
 
diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp
index e7024d334ad..1cbdd557fb4 100644
--- a/core/multigrid/fixed_coarsening.cpp
+++ b/core/multigrid/fixed_coarsening.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/multigrid/fixed_coarsening.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -17,7 +16,6 @@
 #include <ginkgo/core/matrix/row_gatherer.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index f0393794d94..9f1f5b50ba6 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/multigrid/pgm.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -22,7 +21,6 @@
 #include <ginkgo/core/matrix/row_gatherer.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 #include "core/base/iterator_factory.hpp"
 #include "core/base/utils.hpp"
diff --git a/core/multigrid/pgm_kernels.hpp b/core/multigrid/pgm_kernels.hpp
index 4118507ac7d..a7a0a4aa099 100644
--- a/core/multigrid/pgm_kernels.hpp
+++ b/core/multigrid/pgm_kernels.hpp
@@ -8,14 +8,12 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp
index 3f18a32123f..f92ccd18cfc 100644
--- a/core/preconditioner/batch_jacobi.cpp
+++ b/core/preconditioner/batch_jacobi.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/preconditioner/batch_jacobi.hpp"
 
-
 #include "core/matrix/batch_csr_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
diff --git a/core/preconditioner/batch_jacobi_kernels.hpp b/core/preconditioner/batch_jacobi_kernels.hpp
index c37db81d72f..784ab2d1f6e 100644
--- a/core/preconditioner/batch_jacobi_kernels.hpp
+++ b/core/preconditioner/batch_jacobi_kernels.hpp
@@ -6,14 +6,11 @@
 #define GKO_CORE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
 
 
-#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
-
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
-
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp
index 37eb0cb5b3f..691795ad60b 100644
--- a/core/preconditioner/ic.cpp
+++ b/core/preconditioner/ic.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/preconditioner/ic.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 
diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp
index 00422300172..d6f49e49588 100644
--- a/core/preconditioner/ilu.cpp
+++ b/core/preconditioner/ilu.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/preconditioner/ilu.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
@@ -14,7 +13,6 @@
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
 
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
index f825e2f5c82..9684f1bdb27 100644
--- a/core/preconditioner/isai.cpp
+++ b/core/preconditioner/isai.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/preconditioner/isai.hpp"
 
-
 #include <functional>
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/utils.hpp>
@@ -22,7 +20,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/utils.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/preconditioner/isai_kernels.hpp b/core/preconditioner/isai_kernels.hpp
index 9c3f89d7b5e..d1897251916 100644
--- a/core/preconditioner/isai_kernels.hpp
+++ b/core/preconditioner/isai_kernels.hpp
@@ -6,11 +6,8 @@
 #define GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_
 
 
-#include <ginkgo/core/preconditioner/isai.hpp>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
-
+#include <ginkgo/core/preconditioner/isai.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index 8081f31712a..f6d5b042a23 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/preconditioner/jacobi.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -19,7 +17,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/base/utils.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp
index ee5227a6c0b..e29791e0a6e 100644
--- a/core/preconditioner/jacobi_kernels.hpp
+++ b/core/preconditioner/jacobi_kernels.hpp
@@ -6,11 +6,8 @@
 #define GKO_CORE_PRECONDITIONER_JACOBI_KERNELS_HPP_
 
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
-
+#include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp
index 8fb80f0ad94..e159fd15776 100644
--- a/core/preconditioner/jacobi_utils.hpp
+++ b/core/preconditioner/jacobi_utils.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/extended_float.hpp"
 
 
diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp
index 7cb24c39ea0..3828bce9197 100644
--- a/core/reorder/amd.cpp
+++ b/core/reorder/amd.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/reorder/amd.hpp"
 
-
 #include <cstddef>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -15,7 +13,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp
index e47969c0b71..97dd37b90fc 100644
--- a/core/reorder/mc64.cpp
+++ b/core/reorder/mc64.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/reorder/mc64.hpp"
 
-
 #include <chrono>
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -20,7 +18,6 @@
 #include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/addressable_pq.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/core/reorder/mc64.hpp b/core/reorder/mc64.hpp
index e6b34614f3c..97456f93464 100644
--- a/core/reorder/mc64.hpp
+++ b/core/reorder/mc64.hpp
@@ -6,12 +6,9 @@
 #define GKO_CORE_REORDER_MC64_HPP_
 
 
-#include <ginkgo/core/reorder/mc64.hpp>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
-
+#include <ginkgo/core/reorder/mc64.hpp>
 
 #include "core/components/addressable_pq.hpp"
 
diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp
index bf9c8ba7a3d..e14af9ffbfc 100644
--- a/core/reorder/nested_dissection.cpp
+++ b/core/reorder/nested_dissection.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/reorder/nested_dissection.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp
index f3a16cc92a6..1acf4d97f1f 100644
--- a/core/reorder/rcm.cpp
+++ b/core/reorder/rcm.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/reorder/rcm.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/matrix/csr_kernels.hpp"
 #include "core/reorder/rcm_kernels.hpp"
 
diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp
index 77e1ce68ff0..a89b2732cb0 100644
--- a/core/reorder/rcm_kernels.hpp
+++ b/core/reorder/rcm_kernels.hpp
@@ -6,19 +6,15 @@
 #define GKO_CORE_REORDER_RCM_KERNELS_HPP_
 
 
-#include <ginkgo/core/reorder/rcm.hpp>
-
-
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
-
+#include <ginkgo/core/reorder/rcm.hpp>
 
 #include "core/base/kernel_declaration.hpp"
 
diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp
index cf246ea3194..264122c0b8f 100644
--- a/core/reorder/scaled_reordered.cpp
+++ b/core/reorder/scaled_reordered.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/reorder/scaled_reordered.hpp"
 
-
 #include <utility>
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
 
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index 9621f058097..c22c712b411 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/solver/batch_bicgstab.hpp"
 
-
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 43f55f1356d..1eed30aba5a 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/stop/batch_stop_enum.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index d2fe4a5f00d..0ab1ca8564f 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/solver/batch_cg.hpp"
 
-
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
 
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index d2c64460be2..6fdb595862e 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/stop/batch_stop_enum.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 1cafda169c2..8a142a5224a 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -17,7 +17,6 @@
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 #include <ginkgo/core/stop/batch_stop_enum.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 
diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
index 51ba251aecd..c379cb8df08 100644
--- a/core/solver/bicg.cpp
+++ b/core/solver/bicg.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/bicg.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/solver/bicg_kernels.hpp"
 #include "core/solver/solver_boilerplate.hpp"
diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp
index 712df21e90c..5e94d8ca350 100644
--- a/core/solver/bicg_kernels.hpp
+++ b/core/solver/bicg_kernels.hpp
@@ -8,14 +8,12 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp
index e1f2f1cb77e..c254b417765 100644
--- a/core/solver/bicgstab.cpp
+++ b/core/solver/bicgstab.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/bicgstab.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/bicgstab_kernels.hpp"
diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp
index 8160381d4f3..e3bfbdcdcb6 100644
--- a/core/solver/bicgstab_kernels.hpp
+++ b/core/solver/bicgstab_kernels.hpp
@@ -8,14 +8,12 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp
index 812c6c222ce..274948531ab 100644
--- a/core/solver/cb_gmres.cpp
+++ b/core/solver/cb_gmres.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/solver/cb_gmres.hpp"
 
-
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/config/solver_config.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp
index e216171a6f5..64a7c9a46e5 100644
--- a/core/solver/cb_gmres_accessor.hpp
+++ b/core/solver/cb_gmres_accessor.hpp
@@ -12,7 +12,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -20,7 +19,6 @@
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp
index 3e5d8c89f25..29a84f25ba1 100644
--- a/core/solver/cb_gmres_kernels.hpp
+++ b/core/solver/cb_gmres_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
 #include "core/base/extended_float.hpp"
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index a8e534588a0..20487b4cd0d 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/cg.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/cg_kernels.hpp"
diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp
index 127126317d7..bec5f04d0e5 100644
--- a/core/solver/cg_kernels.hpp
+++ b/core/solver/cg_kernels.hpp
@@ -8,14 +8,12 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp
index 9d6a575fdbf..19f625228a3 100644
--- a/core/solver/cgs.cpp
+++ b/core/solver/cgs.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/cgs.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/cgs_kernels.hpp"
diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp
index 5d64a7a0ed1..d64aeedb549 100644
--- a/core/solver/cgs_kernels.hpp
+++ b/core/solver/cgs_kernels.hpp
@@ -8,14 +8,12 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp
index 9174cc2c4e4..0209284c446 100644
--- a/core/solver/common_gmres_kernels.hpp
+++ b/core/solver/common_gmres_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp
index 717fd71698f..c999fdea4fc 100644
--- a/core/solver/direct.cpp
+++ b/core/solver/direct.cpp
@@ -4,15 +4,12 @@
 
 #include "ginkgo/core/solver/direct.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/factorization/factorization.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/config_helper.hpp"
 
 
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index dee37467c46..c4f79854c0a 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/fcg.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/fcg_kernels.hpp"
diff --git a/core/solver/fcg_kernels.hpp b/core/solver/fcg_kernels.hpp
index 4eda3e631c8..bb646055906 100644
--- a/core/solver/fcg_kernels.hpp
+++ b/core/solver/fcg_kernels.hpp
@@ -8,13 +8,11 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp
index cb2b55a3460..d5131632dc3 100644
--- a/core/solver/gcr.cpp
+++ b/core/solver/gcr.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/gcr.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -15,7 +14,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/gcr_kernels.hpp"
diff --git a/core/solver/gcr_kernels.hpp b/core/solver/gcr_kernels.hpp
index 6d4e827b4f9..d33f31db571 100644
--- a/core/solver/gcr_kernels.hpp
+++ b/core/solver/gcr_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index b0ad6baa01e..cd3d88a5c02 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/gmres.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -16,7 +15,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/common_gmres_kernels.hpp"
diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp
index 3cc5d457edb..196b0de3ab0 100644
--- a/core/solver/gmres_kernels.hpp
+++ b/core/solver/gmres_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp
index 4bc56562d3b..c6d89b84ea6 100644
--- a/core/solver/idr.cpp
+++ b/core/solver/idr.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/solver/idr.hpp"
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/solver_config.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/idr_kernels.hpp"
diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp
index 02d9fa88511..3d579bd01af 100644
--- a/core/solver/idr_kernels.hpp
+++ b/core/solver/idr_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp
index 3a6b0b1d2d0..75efac351f9 100644
--- a/core/solver/ir.cpp
+++ b/core/solver/ir.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/solver/ir.hpp"
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/distributed/helpers.hpp"
 #include "core/solver/ir_kernels.hpp"
diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp
index f6d0f79242d..a411c9f375d 100644
--- a/core/solver/ir_kernels.hpp
+++ b/core/solver/ir_kernels.hpp
@@ -8,13 +8,11 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp
index e36ec98f8fb..e8230625ab3 100644
--- a/core/solver/lower_trs.cpp
+++ b/core/solver/lower_trs.cpp
@@ -13,7 +13,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/trisolver_config.hpp"
 #include "core/solver/lower_trs_kernels.hpp"
diff --git a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp
index ce13f4b4f14..8bb0031e801 100644
--- a/core/solver/lower_trs_kernels.hpp
+++ b/core/solver/lower_trs_kernels.hpp
@@ -8,13 +8,11 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index d7fc1d3c997..6a8b5ee151b 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/solver/multigrid.hpp"
 
-
 #include <complex>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -27,7 +25,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/config/config_helper.hpp"
diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp
index 2e123cdbfec..73c660cbefb 100644
--- a/core/solver/multigrid_kernels.hpp
+++ b/core/solver/multigrid_kernels.hpp
@@ -12,7 +12,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp
index 5a854bddf1e..be6fcc71275 100644
--- a/core/solver/upper_trs.cpp
+++ b/core/solver/upper_trs.cpp
@@ -13,7 +13,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/trisolver_config.hpp"
 #include "core/solver/upper_trs_kernels.hpp"
diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp
index e002b68f92d..d409aa7db09 100644
--- a/core/solver/upper_trs_kernels.hpp
+++ b/core/solver/upper_trs_kernels.hpp
@@ -8,13 +8,11 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp
index 02f04876f9f..1684e4bdeab 100644
--- a/core/stop/criterion.cpp
+++ b/core/stop/criterion.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/stop/criterion.hpp"
 
-
 #include "core/stop/criterion_kernels.hpp"
 
 
diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp
index 242a2ee7f52..62e4135ee37 100644
--- a/core/stop/criterion_kernels.hpp
+++ b/core/stop/criterion_kernels.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/base/kernel_declaration.hpp"
 
 
diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index 824ab87ec0f..adf7da3e2e6 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/stop/residual_norm.hpp"
 
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/distributed/helpers.hpp"
diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp
index 5e8796bb6b4..d34949ca3e3 100644
--- a/core/synthesizer/implementation_selection.hpp
+++ b/core/synthesizer/implementation_selection.hpp
@@ -8,7 +8,6 @@
 
 #include <utility>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
diff --git a/core/test/accessor/block_col_major.cpp b/core/test/accessor/block_col_major.cpp
index 327343f8eb1..2b214416220 100644
--- a/core/test/accessor/block_col_major.cpp
+++ b/core/test/accessor/block_col_major.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/block_col_major.hpp"
+
 #include <array>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
-#include "accessor/block_col_major.hpp"
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
 
diff --git a/core/test/accessor/index_span.cpp b/core/test/accessor/index_span.cpp
index 37cb1f36612..368b1d1c3ae 100644
--- a/core/test/accessor/index_span.cpp
+++ b/core/test/accessor/index_span.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <gtest/gtest.h>
-
-
 #include "accessor/index_span.hpp"
 
+#include <gtest/gtest.h>
+
 
 namespace {
 
diff --git a/core/test/accessor/math.cpp b/core/test/accessor/math.cpp
index f15644be93f..32bacac9043 100644
--- a/core/test/accessor/math.cpp
+++ b/core/test/accessor/math.cpp
@@ -2,16 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/math.hpp"
+
 #include <cinttypes>
 #include <complex>
 
-
 #include <gtest/gtest.h>
 
 
-#include "accessor/math.hpp"
-
-
 namespace {
 
 
diff --git a/core/test/accessor/range.cpp b/core/test/accessor/range.cpp
index a4fe79ec68e..d03f0ae6d1d 100644
--- a/core/test/accessor/range.cpp
+++ b/core/test/accessor/range.cpp
@@ -2,16 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/range.hpp"
+
 #include <array>
 #include <cstddef>
 
-
 #include <gtest/gtest.h>
 
 
-#include "accessor/range.hpp"
-
-
 namespace {
 
 
diff --git a/core/test/accessor/reduced_row_major.cpp b/core/test/accessor/reduced_row_major.cpp
index 636ba7a6d17..a270e90fb2a 100644
--- a/core/test/accessor/reduced_row_major.cpp
+++ b/core/test/accessor/reduced_row_major.cpp
@@ -2,19 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/reduced_row_major.hpp"
+
 #include <array>
 #include <cmath>
 #include <limits>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
-#include "accessor/reduced_row_major.hpp"
 #include "accessor/utils.hpp"
 
 
diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp
index 1b484ce3e8c..7acad0b9638 100644
--- a/core/test/accessor/reduced_row_major_ginkgo.cpp
+++ b/core/test/accessor/reduced_row_major_ginkgo.cpp
@@ -8,10 +8,8 @@
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
diff --git a/core/test/accessor/reduced_row_major_reference.cpp b/core/test/accessor/reduced_row_major_reference.cpp
index a6da6277b1d..58f249c5275 100644
--- a/core/test/accessor/reduced_row_major_reference.cpp
+++ b/core/test/accessor/reduced_row_major_reference.cpp
@@ -2,18 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/reduced_row_major_reference.hpp"
+
 #include <cinttypes>
 #include <complex>
 #include <limits>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/math.hpp"
-#include "accessor/reduced_row_major_reference.hpp"
 #include "accessor/utils.hpp"
 
 
diff --git a/core/test/accessor/row_major.cpp b/core/test/accessor/row_major.cpp
index 4902c29b9af..68f4e295006 100644
--- a/core/test/accessor/row_major.cpp
+++ b/core/test/accessor/row_major.cpp
@@ -2,17 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/row_major.hpp"
+
 #include <array>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
-#include "accessor/row_major.hpp"
 
 
 namespace {
diff --git a/core/test/accessor/scaled_reduced_row_major.cpp b/core/test/accessor/scaled_reduced_row_major.cpp
index eacd196985d..5a7b2bb2fe1 100644
--- a/core/test/accessor/scaled_reduced_row_major.cpp
+++ b/core/test/accessor/scaled_reduced_row_major.cpp
@@ -2,19 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/scaled_reduced_row_major.hpp"
+
 #include <array>
 #include <cinttypes>
 #include <limits>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
-#include "accessor/scaled_reduced_row_major.hpp"
 
 
 namespace {
diff --git a/core/test/accessor/scaled_reduced_row_major_reference.cpp b/core/test/accessor/scaled_reduced_row_major_reference.cpp
index 281ae9a6735..a91fe81b428 100644
--- a/core/test/accessor/scaled_reduced_row_major_reference.cpp
+++ b/core/test/accessor/scaled_reduced_row_major_reference.cpp
@@ -2,18 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "accessor/scaled_reduced_row_major_reference.hpp"
+
 #include <cinttypes>
 #include <complex>
 #include <limits>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/math.hpp"
-#include "accessor/scaled_reduced_row_major_reference.hpp"
 #include "accessor/utils.hpp"
 
 
diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp
index afee76c0ed2..26127301b67 100644
--- a/core/test/base/abstract_factory.cpp
+++ b/core/test/base/abstract_factory.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/abstract_factory.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/abstract_factory.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/allocator.cpp b/core/test/base/allocator.cpp
index 69ae72cd398..2dce9babec8 100644
--- a/core/test/base/allocator.cpp
+++ b/core/test/base/allocator.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/allocator.hpp"
 
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
 
diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 59b1ef177f3..71816f690ce 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -2,19 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/array.hpp>
-
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp
index 989e153bb75..a8d324ba1d5 100644
--- a/core/test/base/batch_dim.cpp
+++ b/core/test/base/batch_dim.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_dim.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/batch_dim.hpp>
+
 
 TEST(BatchDim, ConstructsCorrectUniformObject)
 {
diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp
index 6b7a40de3c0..865d7f10314 100644
--- a/core/test/base/batch_lin_op.cpp
+++ b/core/test/base/batch_lin_op.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_lin_op.hpp>
-
-
 #include <complex>
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp
index 748a52d3227..3798f30ce65 100644
--- a/core/test/base/batch_multi_vector.cpp
+++ b/core/test/base/batch_multi_vector.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/core/test/base/block_operator.cpp b/core/test/base/block_operator.cpp
index 54ac3649d9e..2d2bddb357f 100644
--- a/core/test/base/block_operator.cpp
+++ b/core/test/base/block_operator.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/block_operator.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/block_operator.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp
index 1be8324e95b..73c30ffe11c 100644
--- a/core/test/base/combination.cpp
+++ b/core/test/base/combination.cpp
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/combination.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/combination.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp
index 686ff23678a..122755b8f92 100644
--- a/core/test/base/composition.cpp
+++ b/core/test/base/composition.cpp
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/composition.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/composition.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/base/deferred_factory.cpp b/core/test/base/deferred_factory.cpp
index 79bd9672fbd..a1c02103cf8 100644
--- a/core/test/base/deferred_factory.cpp
+++ b/core/test/base/deferred_factory.cpp
@@ -4,7 +4,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp
index 35eeaab64f3..526187610a4 100644
--- a/core/test/base/dense_cache.cpp
+++ b/core/test/base/dense_cache.cpp
@@ -2,15 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/dense_cache.hpp>
-
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/dense_cache.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/dim.cpp b/core/test/base/dim.cpp
index 2ea3f65dcf7..168f2bccc01 100644
--- a/core/test/base/dim.cpp
+++ b/core/test/base/dim.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/dim.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/dim.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/exception.cpp b/core/test/base/exception.cpp
index 45f629351f4..ec5d4bf5763 100644
--- a/core/test/base/exception.cpp
+++ b/core/test/base/exception.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/exception_helpers.cpp b/core/test/base/exception_helpers.cpp
index 228389c738a..50f81707ead 100644
--- a/core/test/base/exception_helpers.cpp
+++ b/core/test/base/exception_helpers.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index dc4ea5aad63..64a11929983 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -2,12 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <thread>
 #include <type_traits>
 
+#include <ginkgo/core/base/executor.hpp>
+
 
 #if defined(__unix__) || defined(__APPLE__)
 #include <utmpx.h>
@@ -16,7 +15,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/memory.hpp>
 
diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp
index af31de1ae3c..6148c7c350a 100644
--- a/core/test/base/extended_float.cpp
+++ b/core/test/base/extended_float.cpp
@@ -4,11 +4,9 @@
 
 #include "core/base/extended_float.hpp"
 
-
 #include <bitset>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
 
diff --git a/core/test/base/index_range.cpp b/core/test/base/index_range.cpp
index 8fef94e407f..9845638d446 100644
--- a/core/test/base/index_range.cpp
+++ b/core/test/base/index_range.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <gtest/gtest.h>
-
-
 #include "core/base/index_range.hpp"
 
+#include <gtest/gtest.h>
+
 
 TEST(IRange, KnowsItsProperties)
 {
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index d113f3198f4..42ddff343c0 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -4,16 +4,13 @@
 
 #include "core/base/iterator_factory.hpp"
 
-
 #include <algorithm>
 #include <complex>
 #include <numeric>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp
index 5ffdc640b6b..2496c612193 100644
--- a/core/test/base/lin_op.cpp
+++ b/core/test/base/lin_op.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/lin_op.hpp>
-
-
 #include <complex>
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 
diff --git a/core/test/base/math.cpp b/core/test/base/math.cpp
index 33aed51d71d..c947f5c0d1b 100644
--- a/core/test/base/math.cpp
+++ b/core/test/base/math.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/math.hpp>
-
-
 #include <cmath>
 #include <complex>
 #include <limits>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/math.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/matrix_assembly_data.cpp b/core/test/base/matrix_assembly_data.cpp
index e65f9079946..d3ed8057659 100644
--- a/core/test/base/matrix_assembly_data.cpp
+++ b/core/test/base/matrix_assembly_data.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/matrix_assembly_data.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/matrix_assembly_data.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/matrix_data.cpp b/core/test/base/matrix_data.cpp
index cbc09e99dec..aa63d03dfc2 100644
--- a/core/test/base/matrix_data.cpp
+++ b/core/test/base/matrix_data.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/matrix_data.hpp>
-
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/matrix_data.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index 3743f7d05b8..66b6766b2d3 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/mtx_io.hpp>
-
-
 #include <cstring>
 #include <sstream>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/perturbation.cpp b/core/test/base/perturbation.cpp
index 578c54555aa..e79d52659da 100644
--- a/core/test/base/perturbation.cpp
+++ b/core/test/base/perturbation.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/perturbation.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/perturbation.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/polymorphic_object.cpp b/core/test/base/polymorphic_object.cpp
index 8c6a0a23ef4..ff41a47913b 100644
--- a/core/test/base/polymorphic_object.cpp
+++ b/core/test/base/polymorphic_object.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/polymorphic_object.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/polymorphic_object.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/range.cpp b/core/test/base/range.cpp
index c6a81806d86..76535038b97 100644
--- a/core/test/base/range.cpp
+++ b/core/test/base/range.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/range.hpp>
-
-
 #include <array>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/range.hpp>
+
 
 namespace {
 
diff --git a/core/test/base/range_accessors.cpp b/core/test/base/range_accessors.cpp
index b9758a39d19..51335c98da7 100644
--- a/core/test/base/range_accessors.cpp
+++ b/core/test/base/range_accessors.cpp
@@ -2,15 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/range_accessors.hpp>
-
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/sanitizers.cpp b/core/test/base/sanitizers.cpp
index 44792ecc502..bdd02b09575 100644
--- a/core/test/base/sanitizers.cpp
+++ b/core/test/base/sanitizers.cpp
@@ -6,7 +6,6 @@
 #include <memory>
 #include <thread>
 
-
 #include <gtest/gtest.h>
 
 
diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp
index 0c004223b12..2741990036f 100644
--- a/core/test/base/segmented_array.cpp
+++ b/core/test/base/segmented_array.cpp
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/segmented_array.hpp>
-
+#include "core/base/segmented_array.hpp"
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/segmented_array.hpp>
 
-#include "core/base/segmented_array.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/base/types.cpp b/core/test/base/types.cpp
index e537eba9bc7..507b86cd056 100644
--- a/core/test/base/types.cpp
+++ b/core/test/base/types.cpp
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/types.hpp>
-
+#include "core/base/types.hpp"
 
 #include <array>
 #include <cstdint>
 #include <stdexcept>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
-#include "core/base/types.hpp"
+#include <ginkgo/core/base/types.hpp>
 
 
 namespace {
diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp
index f6f7ff2b046..1ad4705b824 100644
--- a/core/test/base/utils.cpp
+++ b/core/test/base/utils.cpp
@@ -2,13 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/utils.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/utils.hpp>
 
 
 namespace {
diff --git a/core/test/base/version.cpp b/core/test/base/version.cpp
index a08fb308e51..2e65f25ea1a 100644
--- a/core/test/base/version.cpp
+++ b/core/test/base/version.cpp
@@ -2,14 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/version.hpp>
-
-
 #include <sstream>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/version.hpp>
+
 
 namespace {
 
diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp
index 834440b45fb..6301cd44fb4 100644
--- a/core/test/components/addressable_pq.cpp
+++ b/core/test/components/addressable_pq.cpp
@@ -4,17 +4,13 @@
 
 #include "core/components/addressable_pq.hpp"
 
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/components/disjoint_sets.cpp b/core/test/components/disjoint_sets.cpp
index e23fb75dcb9..a014d47cbb1 100644
--- a/core/test/components/disjoint_sets.cpp
+++ b/core/test/components/disjoint_sets.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/disjoint_sets.hpp"
 
-
 #include <algorithm>
 #include <bitset>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/config/config.cpp b/core/test/config/config.cpp
index 163f6936de2..d5fed0f90c3 100644
--- a/core/test/config/config.cpp
+++ b/core/test/config/config.cpp
@@ -2,13 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/config.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/bicg.hpp>
 #include <ginkgo/core/solver/cg.hpp>
@@ -16,7 +13,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 405641265a2..f5a4b19d3d9 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -4,10 +4,8 @@
 
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/factorization/cholesky.hpp>
@@ -22,7 +20,6 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/config/multigrid.cpp b/core/test/config/multigrid.cpp
index 5c13d0525cf..5eb8e622088 100644
--- a/core/test/config/multigrid.cpp
+++ b/core/test/config/multigrid.cpp
@@ -4,10 +4,8 @@
 
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/multigrid/fixed_coarsening.hpp>
@@ -16,7 +14,6 @@
 #include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp
index d404a97b4b8..b11ea3b6705 100644
--- a/core/test/config/preconditioner.cpp
+++ b/core/test/config/preconditioner.cpp
@@ -4,10 +4,8 @@
 
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/preconditioner/ic.hpp>
@@ -18,7 +16,6 @@
 #include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/config/property_tree.cpp b/core/test/config/property_tree.cpp
index a552a6c08d8..8fe49aa6926 100644
--- a/core/test/config/property_tree.cpp
+++ b/core/test/config/property_tree.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/property_tree.hpp>
-
-
 #include <cstdint>
 #include <exception>
 #include <limits>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/config/property_tree.hpp>
 
 
 using namespace gko::config;
diff --git a/core/test/config/registry.cpp b/core/test/config/registry.cpp
index e6fc8eef671..a8d1acb1cf8 100644
--- a/core/test/config/registry.cpp
+++ b/core/test/config/registry.cpp
@@ -2,21 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/registry.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp
index b40c4dc1781..8a2f025d00a 100644
--- a/core/test/config/solver.cpp
+++ b/core/test/config/solver.cpp
@@ -4,10 +4,8 @@
 
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -25,7 +23,6 @@
 #include <ginkgo/core/solver/triangular.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/config/config_helper.hpp"
 #include "core/config/registry_accessor.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp
index a387ebe44b7..ff519e88101 100644
--- a/core/test/config/type_descriptor.cpp
+++ b/core/test/config/type_descriptor.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/config/type_descriptor.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/config/type_descriptor.hpp>
 
 #include "core/config/type_descriptor_helper.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/distributed/index_map.cpp b/core/test/distributed/index_map.cpp
index 521a2e2d094..8602bb025f5 100644
--- a/core/test/distributed/index_map.cpp
+++ b/core/test/distributed/index_map.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/index_map.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/distributed/index_map.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp
index 3cbe5f3ae7b..292b366f50e 100644
--- a/core/test/factorization/elimination_forest.cpp
+++ b/core/test/factorization/elimination_forest.cpp
@@ -4,17 +4,13 @@
 
 #include "core/factorization/elimination_forest.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
 
diff --git a/core/test/factorization/par_ic.cpp b/core/test/factorization/par_ic.cpp
index c580b5ea139..d6de0f9fc98 100644
--- a/core/test/factorization/par_ic.cpp
+++ b/core/test/factorization/par_ic.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ic.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/factorization/par_ic.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/factorization/par_ict.cpp b/core/test/factorization/par_ict.cpp
index 10e0dd7b99c..07eec8db549 100644
--- a/core/test/factorization/par_ict.cpp
+++ b/core/test/factorization/par_ict.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ict.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/factorization/par_ict.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp
index f3904093024..a0b8f37e3d4 100644
--- a/core/test/factorization/par_ilu.cpp
+++ b/core/test/factorization/par_ilu.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilu.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/factorization/par_ilu.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp
index 4d5e8ea88d8..ad466e62407 100644
--- a/core/test/factorization/par_ilut.cpp
+++ b/core/test/factorization/par_ilut.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilut.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/factorization/par_ilut.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp
index 269845c3e9c..01250c41929 100644
--- a/core/test/gtest/environments.hpp
+++ b/core/test/gtest/environments.hpp
@@ -10,15 +10,12 @@
 #include <regex>
 #include <sstream>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "core/test/gtest/resources.hpp"
 
 
diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp
index d0e5560be84..7ba3d80c52c 100644
--- a/core/test/gtest/ginkgo_main.cpp
+++ b/core/test/gtest/ginkgo_main.cpp
@@ -4,7 +4,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include "core/test/gtest/environments.hpp"
 
 
diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp
index a94cdc710a5..07a1c2c343d 100644
--- a/core/test/gtest/ginkgo_mpi_main.cpp
+++ b/core/test/gtest/ginkgo_mpi_main.cpp
@@ -16,16 +16,12 @@
 #include <string>
 #include <vector>
 
-
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "core/test/gtest/environments.hpp"
 
 
diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp
index be7d37efd9e..62dee2b014f 100644
--- a/core/test/gtest/resources.cpp
+++ b/core/test/gtest/resources.cpp
@@ -2,14 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/test/gtest/resources.hpp"
+
 #include <algorithm>
 #include <regex>
 #include <sstream>
 
 
-#include "core/test/gtest/resources.hpp"
-
-
 #ifdef GKO_COMPILING_OMP
 #include <omp.h>
 #endif
diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp
index 944549346ef..8fff0c17b8e 100644
--- a/core/test/log/convergence.cpp
+++ b/core/test/log/convergence.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/convergence.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/convergence.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp
index 18315442559..b065db66768 100644
--- a/core/test/log/logger.cpp
+++ b/core/test/log/logger.cpp
@@ -8,16 +8,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 // clang-format on
 
 
-#include <ginkgo/core/log/logger.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/log/convergence.hpp>
+#include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/log/record.hpp>
 #include <ginkgo/core/log/stream.hpp>
 
diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp
index bab3d8f3ff7..8278120cc49 100644
--- a/core/test/log/papi.cpp
+++ b/core/test/log/papi.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/papi.hpp>
-
-
 #include <stdexcept>
 
-
 #include <gtest/gtest.h>
 #include <papi.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/papi.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/log/performance_hint.cpp b/core/test/log/performance_hint.cpp
index 6bef7ca24c3..eaac858e378 100644
--- a/core/test/log/performance_hint.cpp
+++ b/core/test/log/performance_hint.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/performance_hint.hpp>
-
-
 #include <iomanip>
 #include <sstream>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/log/performance_hint.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp
index 4ace584670f..40bd6394475 100644
--- a/core/test/log/profiler_hook.cpp
+++ b/core/test/log/profiler_hook.cpp
@@ -2,22 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/profiler_hook.hpp>
-
+#include "core/log/profiler_hook.hpp"
 
 #include <chrono>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/profiler_hook.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
-#include "core/log/profiler_hook.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp
index f8595c0c0b9..b81bd7b899e 100644
--- a/core/test/log/record.cpp
+++ b/core/test/log/record.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/record.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/log/record.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils/assertions.hpp"
 
 
diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp
index 7ac1dca2cfe..995a9975b89 100644
--- a/core/test/log/stream.cpp
+++ b/core/test/log/stream.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/stream.hpp>
-
-
 #include <iomanip>
 #include <sstream>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/stream.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/matrix/batch_csr.cpp b/core/test/matrix/batch_csr.cpp
index bae7d08c837..57cae53d646 100644
--- a/core/test/matrix/batch_csr.cpp
+++ b/core/test/matrix/batch_csr.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp
index 892d9a36b1f..334df5c0e93 100644
--- a/core/test/matrix/batch_dense.cpp
+++ b/core/test/matrix/batch_dense.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index b455364933e..11f6381a43d 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/core/test/matrix/batch_identity.cpp b/core/test/matrix/batch_identity.cpp
index fdde07b6919..dd7a3675110 100644
--- a/core/test/matrix/batch_identity.cpp
+++ b/core/test/matrix/batch_identity.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_identity.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp
index b4365138860..ffb8d5aee9f 100644
--- a/core/test/matrix/coo.cpp
+++ b/core/test/matrix/coo.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/coo.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/matrix/coo.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp
index 1c0bf020d79..9bfae5cf3af 100644
--- a/core/test/matrix/coo_builder.cpp
+++ b/core/test/matrix/coo_builder.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/coo_builder.hpp"
 
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index 6dc4477c101..4bbdc63851a 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/csr.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
-
+#include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp
index e28e17999b8..a06437bed12 100644
--- a/core/test/matrix/csr_builder.cpp
+++ b/core/test/matrix/csr_builder.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/csr_builder.hpp"
 
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp
index 9a2564b2ae8..e7158a15aed 100644
--- a/core/test/matrix/dense.cpp
+++ b/core/test/matrix/dense.cpp
@@ -2,15 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/dense.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
-
+#include <ginkgo/core/matrix/dense.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp
index 3eb7ab66091..de03a9350bb 100644
--- a/core/test/matrix/diagonal.cpp
+++ b/core/test/matrix/diagonal.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/diagonal.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp
index add0a5ad677..bcc2b591a50 100644
--- a/core/test/matrix/ell.cpp
+++ b/core/test/matrix/ell.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/ell.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/matrix/ell.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp
index 8e2c4a5808b..3d3d4ee738d 100644
--- a/core/test/matrix/fbcsr.cpp
+++ b/core/test/matrix/fbcsr.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
-
-
 #include <algorithm>
 #include <limits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/types.hpp>
-
+#include <ginkgo/core/matrix/fbcsr.hpp>
 
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
diff --git a/core/test/matrix/fbcsr_builder.cpp b/core/test/matrix/fbcsr_builder.cpp
index 3f1e915319e..d91a0c7b70a 100644
--- a/core/test/matrix/fbcsr_builder.cpp
+++ b/core/test/matrix/fbcsr_builder.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/fbcsr_builder.hpp"
 
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/matrix/fbcsr_sample.hpp b/core/test/matrix/fbcsr_sample.hpp
index 618cc2a2456..d84cbd31f19 100644
--- a/core/test/matrix/fbcsr_sample.hpp
+++ b/core/test/matrix/fbcsr_sample.hpp
@@ -13,7 +13,6 @@
 #include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp
index 10b5603c75c..d1a69312755 100644
--- a/core/test/matrix/hybrid.cpp
+++ b/core/test/matrix/hybrid.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/hybrid.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/matrix/hybrid.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp
index 69370df07c5..bcf9c036992 100644
--- a/core/test/matrix/identity.cpp
+++ b/core/test/matrix/identity.cpp
@@ -7,15 +7,11 @@
 // clang-format on
 
 
-#include <ginkgo/core/matrix/identity.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/identity.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
index 1412e2924af..edb1532696b 100644
--- a/core/test/matrix/permutation.cpp
+++ b/core/test/matrix/permutation.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/permutation.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/permutation.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp
index e8c15e454d2..801f639c206 100644
--- a/core/test/matrix/row_gatherer.cpp
+++ b/core/test/matrix/row_gatherer.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/row_gatherer.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/row_gatherer.hpp>
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp
index 0160a329ddf..123d7bae773 100644
--- a/core/test/matrix/sellp.cpp
+++ b/core/test/matrix/sellp.cpp
@@ -2,11 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sellp.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/matrix/sellp.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp
index dca3ef8b5d6..e929f960f1e 100644
--- a/core/test/matrix/sparsity_csr.cpp
+++ b/core/test/matrix/sparsity_csr.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
-
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp
index fc715d37782..d3ecf359908 100644
--- a/core/test/mpi/base/bindings.cpp
+++ b/core/test/mpi/base/bindings.cpp
@@ -4,17 +4,14 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/mpi/base/communicator.cpp b/core/test/mpi/base/communicator.cpp
index eaf05e148b7..a010b92b935 100644
--- a/core/test/mpi/base/communicator.cpp
+++ b/core/test/mpi/base/communicator.cpp
@@ -4,10 +4,8 @@
 
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 
diff --git a/core/test/mpi/base/exception_helpers.cpp b/core/test/mpi/base/exception_helpers.cpp
index 04a13a3422b..a8b74b04ba5 100644
--- a/core/test/mpi/base/exception_helpers.cpp
+++ b/core/test/mpi/base/exception_helpers.cpp
@@ -4,10 +4,8 @@
 
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/core/test/mpi/base/polymorphic_object.cpp b/core/test/mpi/base/polymorphic_object.cpp
index cd55b7c533f..0c00f2af468 100644
--- a/core/test/mpi/base/polymorphic_object.cpp
+++ b/core/test/mpi/base/polymorphic_object.cpp
@@ -4,7 +4,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/distributed/polymorphic_object.hpp>
 
 
diff --git a/core/test/mpi/base/rank_mapping.cpp b/core/test/mpi/base/rank_mapping.cpp
index 6588ce5abcd..97999163035 100644
--- a/core/test/mpi/base/rank_mapping.cpp
+++ b/core/test/mpi/base/rank_mapping.cpp
@@ -4,13 +4,10 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/mpi/distributed/helpers.cpp b/core/test/mpi/distributed/helpers.cpp
index 69c11f7e0ae..a2349ce395d 100644
--- a/core/test/mpi/distributed/helpers.cpp
+++ b/core/test/mpi/distributed/helpers.cpp
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <gtest/gtest.h>
+#include "core/distributed/helpers.hpp"
 
+#include <gtest/gtest.h>
 
 #include <ginkgo/core/matrix/csr.hpp>
 
-
-#include "core/distributed/helpers.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp
index 48a7d3e2f17..4062393564c 100644
--- a/core/test/mpi/distributed/matrix.cpp
+++ b/core/test/mpi/distributed/matrix.cpp
@@ -4,7 +4,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
@@ -17,7 +16,6 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp
index 457303e8285..c6c0dc00650 100644
--- a/core/test/mpi/distributed/preconditioner/schwarz.cpp
+++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp
@@ -4,13 +4,11 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/mpi/distributed/solver/multigrid.cpp b/core/test/mpi/distributed/solver/multigrid.cpp
index c654edff30c..c8ab6f6d284 100644
--- a/core/test/mpi/distributed/solver/multigrid.cpp
+++ b/core/test/mpi/distributed/solver/multigrid.cpp
@@ -4,7 +4,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
@@ -13,7 +12,6 @@
 #include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp
index 7e27a6a4eed..5cab7282b5d 100644
--- a/core/test/multigrid/fixed_coarsening.cpp
+++ b/core/test/multigrid/fixed_coarsening.cpp
@@ -2,17 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/fixed_coarsening.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/multigrid/fixed_coarsening.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/multigrid/pgm.cpp b/core/test/multigrid/pgm.cpp
index ca1c409753a..7798e97f5d6 100644
--- a/core/test/multigrid/pgm.cpp
+++ b/core/test/multigrid/pgm.cpp
@@ -2,17 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/pgm.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/multigrid/pgm.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/preconditioner/batch_jacobi.cpp b/core/test/preconditioner/batch_jacobi.cpp
index 08ccedb2f3f..f9c5f5fe124 100644
--- a/core/test/preconditioner/batch_jacobi.cpp
+++ b/core/test/preconditioner/batch_jacobi.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
diff --git a/core/test/preconditioner/ic.cpp b/core/test/preconditioner/ic.cpp
index d290dbfd463..fc02e800052 100644
--- a/core/test/preconditioner/ic.cpp
+++ b/core/test/preconditioner/ic.cpp
@@ -10,15 +10,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp
index f3e38702399..08806a4e92c 100644
--- a/core/test/preconditioner/ilu.cpp
+++ b/core/test/preconditioner/ilu.cpp
@@ -10,15 +10,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp
index ff597b83bf1..b5e7400d0e8 100644
--- a/core/test/preconditioner/isai.cpp
+++ b/core/test/preconditioner/isai.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/isai.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp
index 44b53f520c4..8813b4c3c4d 100644
--- a/core/test/preconditioner/jacobi.cpp
+++ b/core/test/preconditioner/jacobi.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
-
+#include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/reorder/amd.cpp b/core/test/reorder/amd.cpp
index e1ae5360aee..9eecf3777e1 100644
--- a/core/test/reorder/amd.cpp
+++ b/core/test/reorder/amd.cpp
@@ -2,19 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/amd.hpp>
-
-
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
-
+#include <ginkgo/core/reorder/amd.hpp>
 
 #include "core/factorization/symbolic.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/reorder/nested_dissection.cpp b/core/test/reorder/nested_dissection.cpp
index 88b39cd4e87..fc6d7e3a06a 100644
--- a/core/test/reorder/nested_dissection.cpp
+++ b/core/test/reorder/nested_dissection.cpp
@@ -2,17 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/nested_dissection.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/reorder/nested_dissection.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/reorder/rcm.cpp b/core/test/reorder/rcm.cpp
index 544628c191a..e1ca032b64f 100644
--- a/core/test/reorder/rcm.cpp
+++ b/core/test/reorder/rcm.cpp
@@ -2,17 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/rcm.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/reorder/rcm.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/reorder/scaled_reordered.cpp b/core/test/reorder/scaled_reordered.cpp
index 7bc8452e907..8a4c12ca232 100644
--- a/core/test/reorder/scaled_reordered.cpp
+++ b/core/test/reorder/scaled_reordered.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/scaled_reordered.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/reorder/rcm.hpp>
+#include <ginkgo/core/reorder/scaled_reordered.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
diff --git a/core/test/solver/batch_bicgstab.cpp b/core/test/solver/batch_bicgstab.cpp
index 9ff775e7d37..cd9446d07b2 100644
--- a/core/test/solver/batch_bicgstab.cpp
+++ b/core/test/solver/batch_bicgstab.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_bicgstab.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
-
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
 
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/solver/batch_cg.cpp b/core/test/solver/batch_cg.cpp
index f890f26c7ae..1e97c765f8a 100644
--- a/core/test/solver/batch_cg.cpp
+++ b/core/test/solver/batch_cg.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_cg.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
-
+#include <ginkgo/core/solver/batch_cg.hpp>
 
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp
index d983808139f..e5a40e0c4f8 100644
--- a/core/test/solver/bicg.cpp
+++ b/core/test/solver/bicg.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicg.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/bicg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp
index 6dc92670f52..f8b8d3c7b05 100644
--- a/core/test/solver/bicgstab.cpp
+++ b/core/test/solver/bicgstab.cpp
@@ -2,20 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicgstab.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp
index 19996fe0275..21600ed2b70 100644
--- a/core/test/solver/cb_gmres.cpp
+++ b/core/test/solver/cb_gmres.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cb_gmres.hpp>
-
-
 #include <tuple>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/cb_gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp
index 3261188285b..cbf637de302 100644
--- a/core/test/solver/cg.cpp
+++ b/core/test/solver/cg.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cg.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp
index e76c40ab9f8..5dc80892a1b 100644
--- a/core/test/solver/cgs.cpp
+++ b/core/test/solver/cgs.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cgs.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/cgs.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp
index a7df3e68a92..d895892a8be 100644
--- a/core/test/solver/direct.cpp
+++ b/core/test/solver/direct.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/direct.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
-
+#include <ginkgo/core/solver/direct.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp
index ec1b4ded76f..2898a5f5c46 100644
--- a/core/test/solver/fcg.cpp
+++ b/core/test/solver/fcg.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/fcg.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/fcg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp
index 29ef0e1f578..2d7b5ea7974 100644
--- a/core/test/solver/gcr.cpp
+++ b/core/test/solver/gcr.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gcr.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gcr.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp
index a2bbd523bce..5d9c9e3c40e 100644
--- a/core/test/solver/gmres.cpp
+++ b/core/test/solver/gmres.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gmres.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp
index 1fe6c651a23..9eb79356046 100644
--- a/core/test/solver/idr.cpp
+++ b/core/test/solver/idr.cpp
@@ -2,20 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/idr.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/idr.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp
index e3d54d9c894..1137862a395 100644
--- a/core/test/solver/ir.cpp
+++ b/core/test/solver/ir.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/ir.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/log/profiler_hook.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp
index a218072bdb0..dfcb564ca12 100644
--- a/core/test/solver/lower_trs.cpp
+++ b/core/test/solver/lower_trs.cpp
@@ -4,14 +4,11 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp
index 6d38016099e..8cb545f6cb2 100644
--- a/core/test/solver/multigrid.cpp
+++ b/core/test/solver/multigrid.cpp
@@ -2,24 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/multigrid.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/solver/direct.hpp>
 #include <ginkgo/core/solver/ir.hpp>
+#include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp
index 425d869156c..2e84cb81e10 100644
--- a/core/test/solver/upper_trs.cpp
+++ b/core/test/solver/upper_trs.cpp
@@ -4,14 +4,11 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/solver/workspace.cpp b/core/test/solver/workspace.cpp
index 1c8996aeb65..3126cc67501 100644
--- a/core/test/solver/workspace.cpp
+++ b/core/test/solver/workspace.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/workspace.hpp>
-
-
 #include <typeinfo>
 
-
 #include <gtest/gtest-death-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/solver/workspace.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/core/test/stop/combined.cpp b/core/test/stop/combined.cpp
index 401cd63fb34..2995414a7b0 100644
--- a/core/test/stop/combined.cpp
+++ b/core/test/stop/combined.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/combined.hpp>
-
-
 #include <thread>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
diff --git a/core/test/stop/criterion.cpp b/core/test/stop/criterion.cpp
index 700f1829dfb..ce555d01969 100644
--- a/core/test/stop/criterion.cpp
+++ b/core/test/stop/criterion.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/criterion.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/stop/criterion.hpp>
+
 
 namespace {
 
diff --git a/core/test/stop/iteration.cpp b/core/test/stop/iteration.cpp
index de36e2107b4..e538885e5d6 100644
--- a/core/test/stop/iteration.cpp
+++ b/core/test/stop/iteration.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/iteration.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/stop/iteration.hpp>
+
 
 namespace {
 
diff --git a/core/test/stop/stopping_status.cpp b/core/test/stop/stopping_status.cpp
index 4e6046568a8..46b3ce86c42 100644
--- a/core/test/stop/stopping_status.cpp
+++ b/core/test/stop/stopping_status.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/stop/stopping_status.hpp>
+
 
 namespace {
 
diff --git a/core/test/stop/time.cpp b/core/test/stop/time.cpp
index fb08055b2b3..60a22b79ad3 100644
--- a/core/test/stop/time.cpp
+++ b/core/test/stop/time.cpp
@@ -2,15 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/time.hpp>
-
-
 #include <chrono>
 #include <thread>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/stop/time.hpp>
+
 
 namespace {
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index c6ce7c273d1..43ded30cde5 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -13,16 +13,13 @@
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/test/utils/array_generator.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/core/test/utils/array_generator.hpp b/core/test/utils/array_generator.hpp
index a70751a12de..83b9018939e 100644
--- a/core/test/utils/array_generator.hpp
+++ b/core/test/utils/array_generator.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
-
 #include "core/test/utils/value_generator.hpp"
 
 
diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp
index 21e617e2937..ae66e4686da 100644
--- a/core/test/utils/array_generator_test.cpp
+++ b/core/test/utils/array_generator_test.cpp
@@ -4,14 +4,11 @@
 
 #include "core/test/utils/array_generator.hpp"
 
-
 #include <cmath>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 namespace {
diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp
index e1194ba72a5..7bdc71ea94e 100644
--- a/core/test/utils/assertions.hpp
+++ b/core/test/utils/assertions.hpp
@@ -17,10 +17,8 @@
 #include <type_traits>
 #include <typeinfo>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -28,7 +26,6 @@
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/base/extended_float.hpp"
 
diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp
index b9129ea52b6..73900397fbe 100644
--- a/core/test/utils/assertions_test.cpp
+++ b/core/test/utils/assertions_test.cpp
@@ -4,13 +4,10 @@
 
 #include "core/test/utils/assertions.hpp"
 
-
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index 94f4163a387..eff6626de31 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -9,14 +9,12 @@
 #include <random>
 #include <vector>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/utils/matrix_utils.hpp"
diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp
index 0b41151d807..034dd95fce1 100644
--- a/core/test/utils/fb_matrix_generator.hpp
+++ b/core/test/utils/fb_matrix_generator.hpp
@@ -11,13 +11,11 @@
 #include <type_traits>
 #include <vector>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/fbcsr.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp
index 6dd93b55c58..ccbb0aa477f 100644
--- a/core/test/utils/fb_matrix_generator_test.cpp
+++ b/core/test/utils/fb_matrix_generator_test.cpp
@@ -4,16 +4,13 @@
 
 #include "core/test/utils/fb_matrix_generator.hpp"
 
-
 #include <algorithm>
 #include <cmath>
 #include <iostream>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include "accessor/block_col_major.hpp"
 #include "core/base/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 33eb6dd0b49..56ff38c520d 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -13,13 +13,11 @@
 #include <type_traits>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils/value_generator.hpp"
 
 
diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp
index 9a59c999c9d..43756bc1709 100644
--- a/core/test/utils/matrix_generator_test.cpp
+++ b/core/test/utils/matrix_generator_test.cpp
@@ -4,14 +4,11 @@
 
 #include "core/test/utils/matrix_generator.hpp"
 
-
 #include <cmath>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/base/utils.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp
index 58463eb361a..3c67571e1b2 100644
--- a/core/test/utils/matrix_utils_test.cpp
+++ b/core/test/utils/matrix_utils_test.cpp
@@ -4,18 +4,14 @@
 
 #include "core/utils/matrix_utils.hpp"
 
-
 #include <cmath>
 #include <random>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 
diff --git a/core/test/utils/unsort_matrix.hpp b/core/test/utils/unsort_matrix.hpp
index 1b1a403bee2..b721597b634 100644
--- a/core/test/utils/unsort_matrix.hpp
+++ b/core/test/utils/unsort_matrix.hpp
@@ -9,13 +9,11 @@
 #include <algorithm>
 #include <random>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/iterator_factory.hpp"
 
 
diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp
index d402b0381cb..5d2f88f982a 100644
--- a/core/test/utils/unsort_matrix_test.cpp
+++ b/core/test/utils/unsort_matrix_test.cpp
@@ -4,15 +4,12 @@
 
 #include "core/test/utils/unsort_matrix.hpp"
 
-
 #include <cmath>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
@@ -20,7 +17,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp
index 0c6b7140b8b..f18f2170c96 100644
--- a/core/test/utils/value_generator.hpp
+++ b/core/test/utils/value_generator.hpp
@@ -9,7 +9,6 @@
 #include <random>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 
 
diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp
index 4f905ce3516..633565a66ef 100644
--- a/core/test/utils/value_generator_test.cpp
+++ b/core/test/utils/value_generator_test.cpp
@@ -4,15 +4,12 @@
 
 #include "core/test/utils/value_generator.hpp"
 
-
 #include <cmath>
 #include <random>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
index dcaafd5a46c..704192d0bff 100644
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/base/batch_multi_vector_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp
index 5251c594d42..9f07b6b4532 100644
--- a/cuda/base/batch_struct.hpp
+++ b/cuda/base/batch_struct.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp
index 44c304bde5d..1ff249066bd 100644
--- a/cuda/base/config.hpp
+++ b/cuda/base/config.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "cuda/base/math.hpp"
 
 
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index c1cdf1f996e..bc8da5851d5 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -8,10 +8,8 @@
 
 #include <cublas_v2.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
 
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index 10e09f4a356..8d31ac2e90e 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -8,10 +8,8 @@
 
 #include <curand.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
 
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index c18e1d7e9a6..bca0a80a37b 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -9,10 +9,8 @@
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 
 
diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp
index c3db763f0da..484401460ec 100644
--- a/cuda/base/cusparse_block_bindings.hpp
+++ b/cuda/base/cusparse_block_bindings.hpp
@@ -9,10 +9,8 @@
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 
diff --git a/cuda/base/cusparse_handle.hpp b/cuda/base/cusparse_handle.hpp
index 118aa976bab..39f1876a275 100644
--- a/cuda/base/cusparse_handle.hpp
+++ b/cuda/base/cusparse_handle.hpp
@@ -9,7 +9,6 @@
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp
index d7a9808ab2d..eb6f25695ad 100644
--- a/cuda/base/device.cpp
+++ b/cuda/base/device.cpp
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <cuda_runtime.h>
+#include "cuda/base/device.hpp"
 
+#include <cuda_runtime.h>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
-#include "cuda/base/device.hpp"
 #include "cuda/base/scoped_device_id.hpp"
 
 
diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu
index 554abe8bc37..678c121016c 100644
--- a/cuda/base/device_matrix_data_kernels.cu
+++ b/cuda/base/device_matrix_data_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -13,7 +12,6 @@
 #include <thrust/sort.h>
 #include <thrust/tuple.h>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
 
diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp
index 13557e3da50..7bb7fae5bd5 100644
--- a/cuda/base/exception.cpp
+++ b/cuda/base/exception.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/base/exception.hpp"
 
-
 #include <string>
 
-
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
+#include <cublas_v2.h>
 #include <cufft.h>
 #include <curand.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index c41bc6a72c6..1b1410ca8bb 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -4,22 +4,18 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <iostream>
 #include <stdexcept>
 #include <thread>
 
-
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/memory.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/cusparse_handle.hpp"
diff --git a/cuda/base/index_set_kernels.cpp b/cuda/base/index_set_kernels.cpp
index 8655836a414..2041833e4c2 100644
--- a/cuda/base/index_set_kernels.cpp
+++ b/cuda/base/index_set_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/cuda/base/kernel_config.hpp b/cuda/base/kernel_config.hpp
index f077290b4c5..f0821a42976 100644
--- a/cuda/base/kernel_config.hpp
+++ b/cuda/base/kernel_config.hpp
@@ -8,7 +8,6 @@
 
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
index 0d4bc4eebd5..4b1d5ac05c3 100644
--- a/cuda/base/kernel_launch.cuh
+++ b/cuda/base/kernel_launch.cuh
@@ -10,7 +10,6 @@
 
 #include <thrust/tuple.h>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp
index d86a85a083e..d9fa5165cf6 100644
--- a/cuda/base/math.hpp
+++ b/cuda/base/math.hpp
@@ -6,11 +6,10 @@
 #define GKO_CUDA_BASE_MATH_HPP_
 
 
-#include <ginkgo/core/base/math.hpp>
-
-
 #include <thrust/complex.h>
 
+#include <ginkgo/core/base/math.hpp>
+
 
 namespace gko {
 
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index 7949b07f78f..e9c1658907c 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -4,14 +4,11 @@
 
 #include "ginkgo/core/base/memory.hpp"
 
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/scoped_device_id.hpp"
 
 
diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp
index 6daa8afc2ca..e456cde5be0 100644
--- a/cuda/base/nvtx.cpp
+++ b/cuda/base/nvtx.cpp
@@ -4,7 +4,6 @@
 
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/config.hpp>
 
 
diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp
index 03327fb4dfe..39af6100c46 100644
--- a/cuda/base/pointer_mode_guard.hpp
+++ b/cuda/base/pointer_mode_guard.hpp
@@ -8,12 +8,10 @@
 
 #include <exception>
 
-
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
+#include <cublas_v2.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 
diff --git a/cuda/base/scoped_device_id.cpp b/cuda/base/scoped_device_id.cpp
index a10e8d8913b..5851a1fe16b 100644
--- a/cuda/base/scoped_device_id.cpp
+++ b/cuda/base/scoped_device_id.cpp
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "cuda/base/scoped_device_id.hpp"
+
 #include <exception>
 #include <utility>
 
-
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "cuda/base/scoped_device_id.hpp"
-
-
 namespace gko {
 namespace detail {
 
diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp
index 703c9958ecd..c6f846c3f68 100644
--- a/cuda/base/stream.cpp
+++ b/cuda/base/stream.cpp
@@ -4,13 +4,10 @@
 
 #include "ginkgo/core/base/stream.hpp"
 
-
 #include <cuda_runtime.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/scoped_device_id.hpp"
 
 
diff --git a/cuda/base/thrust.cuh b/cuda/base/thrust.cuh
index 35e858a2555..5d5d58e0f33 100644
--- a/cuda/base/thrust.cuh
+++ b/cuda/base/thrust.cuh
@@ -9,7 +9,6 @@
 #include <thrust/execution_policy.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/cuda/base/timer.cpp b/cuda/base/timer.cpp
index 01b96c19536..f9559bffb95 100644
--- a/cuda/base/timer.cpp
+++ b/cuda/base/timer.cpp
@@ -4,14 +4,11 @@
 
 #include "ginkgo/core/base/timer.hpp"
 
-
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/scoped_device_id.hpp"
 
 
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 561612f2869..7252f7d673d 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -6,20 +6,16 @@
 #define GKO_CUDA_BASE_TYPES_HPP_
 
 
-#include <ginkgo/core/base/types.hpp>
-
-
 #include <type_traits>
 
-
-#include <cublas_v2.h>
 #include <cuda.h>
+#include <cublas_v2.h>
 #include <cuda_fp16.h>
 #include <cusparse.h>
 #include <thrust/complex.h>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/types.hpp>
 
 
 namespace gko {
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
index 1964f0ae196..a9d63677267 100644
--- a/cuda/components/atomic.cuh
+++ b/cuda/components/atomic.cuh
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
 
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index 70643a3b16a..c4ceca9e409 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
-#include <cooperative_groups.h>
 #include <cuda.h>
-
+#include <cooperative_groups.h>
 
 #include "common/cuda_hip/base/config.hpp"
 
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
index a8f27d3a81f..7f19555ace5 100644
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ b/cuda/components/diagonal_block_manipulation.cuh
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
index f0ef007c53c..6690368cc4f 100644
--- a/cuda/components/format_conversion.cuh
+++ b/cuda/components/format_conversion.cuh
@@ -9,7 +9,6 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh
index 97e5d67c23a..7dd0ba13ba4 100644
--- a/cuda/components/memory.cuh
+++ b/cuda/components/memory.cuh
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 
 
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
index 2f6f145e304..6693bbfc326 100644
--- a/cuda/components/prefix_sum.cuh
+++ b/cuda/components/prefix_sum.cuh
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu
index d330ce0a2b0..60b406ff894 100644
--- a/cuda/components/prefix_sum_kernels.cu
+++ b/cuda/components/prefix_sum_kernels.cu
@@ -4,18 +4,14 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <limits>
 
-
 #include <thrust/scan.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 
-
 #include "cuda/base/thrust.cuh"
 
 
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index 250c560d44b..1e4b7cb447c 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -8,11 +8,9 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
index 0d5c0d11f43..7d519891065 100644
--- a/cuda/components/syncfree.cuh
+++ b/cuda/components/syncfree.cuh
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/memory.hpp"
diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh
index fa5e3d3ae3b..8e0042cfdad 100644
--- a/cuda/components/warp_blas.cuh
+++ b/cuda/components/warp_blas.cuh
@@ -9,10 +9,8 @@
 #include <cassert>
 #include <type_traits>
 
-
 #include <ginkgo/config.hpp>
 
-
 #include "cuda/base/math.hpp"
 #include "cuda/components/reduction.cuh"
 
diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu
index a5d838e901f..42e8f118301 100644
--- a/cuda/distributed/index_map_kernels.cu
+++ b/cuda/distributed/index_map_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -18,10 +17,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/unique.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/thrust.cuh"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/searching.cuh"
diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu
index 3ad815d7090..1cb939d40e7 100644
--- a/cuda/distributed/matrix_kernels.cu
+++ b/cuda/distributed/matrix_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -17,10 +16,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/unique.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/thrust.cuh"
 #include "cuda/components/atomic.cuh"
 
diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu
index b478477ce18..738d478d99a 100644
--- a/cuda/distributed/partition_helpers_kernels.cu
+++ b/cuda/distributed/partition_helpers_kernels.cu
@@ -4,13 +4,11 @@
 
 #include "core/distributed/partition_helpers_kernels.hpp"
 
-
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include "cuda/base/thrust.cuh"
 
 
diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu
index de6c5bc6c02..050d6d285d6 100644
--- a/cuda/distributed/partition_kernels.cu
+++ b/cuda/distributed/partition_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
@@ -12,7 +11,6 @@
 #include <thrust/scan.h>
 #include <thrust/sort.h>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "cuda/base/thrust.cuh"
diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu
index ca9c419239b..60388150da4 100644
--- a/cuda/distributed/vector_kernels.cu
+++ b/cuda/distributed/vector_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -12,10 +11,8 @@
 #include <thrust/scatter.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/base/thrust.cuh"
 
 
diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu
index e05b0803dc2..7d5fe2c3d08 100644
--- a/cuda/factorization/cholesky_kernels.cu
+++ b/cuda/factorization/cholesky_kernels.cu
@@ -4,11 +4,9 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/sequence.h>
@@ -16,10 +14,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
index 309ded37d34..fcabf3676e6 100644
--- a/cuda/factorization/factorization_kernels.cu
+++ b/cuda/factorization/factorization_kernels.cu
@@ -4,10 +4,8 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu
index 9d55856f139..3a4b4a55411 100644
--- a/cuda/factorization/ic_kernels.cu
+++ b/cuda/factorization/ic_kernels.cu
@@ -4,10 +4,8 @@
 
 #include "core/factorization/ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu
index acebec6e94c..6096e89ef4b 100644
--- a/cuda/factorization/ilu_kernels.cu
+++ b/cuda/factorization/ilu_kernels.cu
@@ -4,10 +4,8 @@
 
 #include "core/factorization/ilu_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
index 9c3069f62cf..57ed7ac8531 100644
--- a/cuda/factorization/lu_kernels.cu
+++ b/cuda/factorization/lu_kernels.cu
@@ -4,19 +4,15 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <thrust/copy.h>
 #include <thrust/iterator/transform_output_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
index f493cb11fd1..473272fe1fb 100644
--- a/cuda/factorization/par_ic_kernels.cu
+++ b/cuda/factorization/par_ic_kernels.cu
@@ -4,12 +4,10 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
index d958f81d2f4..fb7a0b0370a 100644
--- a/cuda/factorization/par_ict_kernels.cu
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -4,14 +4,12 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 755723e7d4c..1f023892afb 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -4,10 +4,8 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/coo.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/memory.hpp"
diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
index ae544939e17..51127ffd43b 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernels.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu
@@ -2,23 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu
index 4a24c5f305b..e15c7ec4cf6 100644
--- a/cuda/factorization/par_ilut_filter_kernels.cu
+++ b/cuda/factorization/par_ilut_filter_kernels.cu
@@ -2,21 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
index bbba93595c8..3f910f4884e 100644
--- a/cuda/factorization/par_ilut_select_common.cu
+++ b/cuda/factorization/par_ilut_select_common.cu
@@ -4,7 +4,6 @@
 
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu
index 6a7bd53c1c4..ac37e3a7595 100644
--- a/cuda/factorization/par_ilut_select_kernels.cu
+++ b/cuda/factorization/par_ilut_select_kernels.cu
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/intrinsics.cuh"
diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
index 0a751c2f48f..83ec9c974b8 100644
--- a/cuda/factorization/par_ilut_spgeam_kernels.cu
+++ b/cuda/factorization/par_ilut_spgeam_kernels.cu
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu
index 5924ebe328d..8bdf6c9380a 100644
--- a/cuda/factorization/par_ilut_sweep_kernels.cu
+++ b/cuda/factorization/par_ilut_sweep_kernels.cu
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
index 6ec20480405..4fc5137646c 100644
--- a/cuda/matrix/batch_csr_kernels.cu
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
index 673b08e5db1..e28d4f91670 100644
--- a/cuda/matrix/batch_dense_kernels.cu
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 8f0160bd154..90caf963200 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 5e9c803c9f6..5845fb2235e 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -6,15 +6,12 @@
 #define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
 
 
-#include "core/matrix/batch_struct.hpp"
-
-
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
index f138d0b934e..1536e88345e 100644
--- a/cuda/matrix/coo_kernels.cu
+++ b/cuda/matrix/coo_kernels.cu
@@ -4,14 +4,12 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index a0a7e4e97b8..600f4ffb5a3 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -4,10 +4,8 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -16,7 +14,6 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -26,7 +23,6 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index b117c39107b..b2114f936e7 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -16,7 +15,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu
index e362ff0462b..78c0babe3a0 100644
--- a/cuda/matrix/diagonal_kernels.cu
+++ b/cuda/matrix/diagonal_kernels.cu
@@ -4,11 +4,9 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 105122ec4a9..5c81fa7c994 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -4,17 +4,14 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <array>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "common/cuda_hip/base/config.hpp"
diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu
index ad36c84216e..120a81c247c 100644
--- a/cuda/matrix/fbcsr_kernels.template.cu
+++ b/cuda/matrix/fbcsr_kernels.template.cu
@@ -4,10 +4,8 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -16,14 +14,12 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu
index d02f1c63c70..80e938fbbff 100644
--- a/cuda/matrix/fft_kernels.cu
+++ b/cuda/matrix/fft_kernels.cu
@@ -4,13 +4,10 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <array>
 
-
 #include <cufft.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index d6c20075ef4..07f5d5d8ec0 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -4,14 +4,12 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 311e4d3782c..17a1e004935 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -4,13 +4,10 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "common/cuda_hip/base/config.hpp"
diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu
index 75c3dd911ad..399d8a06c1b 100644
--- a/cuda/multigrid/pgm_kernels.cu
+++ b/cuda/multigrid/pgm_kernels.cu
@@ -4,21 +4,17 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
 
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 67c41634637..1bc39df9781 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -4,12 +4,10 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
index d0dd516466a..8867bf643b0 100644
--- a/cuda/preconditioner/isai_kernels.cu
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -4,12 +4,10 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
index 6150ea5b12d..74c7dea9b6b 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
@@ -2,13 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index 10ede90da7e..e0b9145a0f7 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -2,17 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
index f1e8320611b..651dcec611a 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.cu
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index 129c50625f4..c12df449e42 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -2,18 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
index bce2ff23303..783de652733 100644
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -4,10 +4,8 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
index d510aab6963..5cac209b8b2 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
@@ -2,12 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 15f6dc138ad..45af2ec668f 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -2,17 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu
index 72322016fba..8308cf88e60 100644
--- a/cuda/reorder/rcm_kernels.cu
+++ b/cuda/reorder/rcm_kernels.cu
@@ -4,7 +4,6 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -16,7 +15,6 @@
 #include <thrust/sort.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -24,7 +22,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/thrust.cuh"
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 58e1a6b7b0d..b6ae74a5064 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 398e831eb09..5425bd9cd9c 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
index 3dbefadf22a..8b1a28d5581 100644
--- a/cuda/solver/cb_gmres_kernels.cu
+++ b/cuda/solver/cb_gmres_kernels.cu
@@ -4,16 +4,13 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 992974e95ef..a205f155487 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -10,16 +10,13 @@
 #include <iostream>
 #include <memory>
 
-
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index f7e89c9d9d8..34aac3751d6 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -4,15 +4,12 @@
 
 #include "core/solver/idr_kernels.hpp"
 
-
 #include <ctime>
 #include <random>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/randlib_bindings.hpp"
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 002cc0140cb..898ffb92552 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -4,19 +4,15 @@
 
 #include "core/solver/lower_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
index 1d31130623a..6001d42614d 100644
--- a/cuda/solver/multigrid_kernels.cu
+++ b/cuda/solver/multigrid_kernels.cu
@@ -4,13 +4,11 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index e1e01538f79..b1f9e43ed2c 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -4,19 +4,15 @@
 
 #include "core/solver/upper_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <cuda.h>
 #include <cusparse.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu
index e54b5d140f2..20538e87304 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/cuda/stop/criterion_kernels.cu
@@ -4,12 +4,10 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index 7146d0cbf04..d59f937b918 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -4,12 +4,10 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/math.hpp"
diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp
index 6e63b13ff7c..edb6b71676a 100644
--- a/cuda/test/base/array.cpp
+++ b/cuda/test/base/array.cpp
@@ -2,15 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/array.hpp>
-
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "cuda/test/utils.hpp"
 
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index 012b5017dc3..8eb3dbd19fe 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
 #include "common/cuda_hip/base/executor.hpp.inc"
diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu
index 2a6d5e9b528..790fc0be1f1 100644
--- a/cuda/test/base/cuda_executor_topology.cu
+++ b/cuda/test/base/cuda_executor_topology.cu
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 #include <thread>
 #include <type_traits>
 
+#include <ginkgo/core/base/executor.hpp>
+
 
 #if defined(__unix__) || defined(__APPLE__)
 #include <numa.h>
@@ -18,11 +17,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/base/exception_helpers.cu b/cuda/test/base/exception_helpers.cu
index 7e85601328a..7ee7ca0e8f0 100644
--- a/cuda/test/base/exception_helpers.cu
+++ b/cuda/test/base/exception_helpers.cu
@@ -2,18 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
+#include <cublas_v2.h>
 #include <cufft.h>
 #include <curand.h>
 #include <cusparse.h>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+
 
 namespace {
 
diff --git a/cuda/test/base/index_set.cpp b/cuda/test/base/index_set.cpp
index 797bc3f1f44..0e75f3dd140 100644
--- a/cuda/test/base/index_set.cpp
+++ b/cuda/test/base/index_set.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/index_set.hpp>
-
-
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/range.hpp>
 
-
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index da52b3ffc87..ddf14f7baf9 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -4,20 +4,16 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/base/array_access.hpp"
diff --git a/cuda/test/base/lin_op.cpp b/cuda/test/base/lin_op.cpp
index dd703ec07fa..87cd8ee32bc 100644
--- a/cuda/test/base/lin_op.cpp
+++ b/cuda/test/base/lin_op.cpp
@@ -4,7 +4,6 @@
 
 #include <ginkgo/core/base/lin_op.hpp>
 
-
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index 944e7642223..e3c1d78ed39 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -2,23 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/math.hpp>
-
+#include "cuda/base/math.hpp"
 
 #include <cmath>
 #include <complex>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp
index f1657639ff0..345616c0588 100644
--- a/cuda/test/base/memory.cpp
+++ b/cuda/test/base/memory.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/memory.hpp>
-
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/base/memory.hpp>
 
 #include "cuda/test/utils.hpp"
 
diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu
index 5c2e496b64b..0ac4b21e207 100644
--- a/cuda/test/base/scoped_device_id.cu
+++ b/cuda/test/base/scoped_device_id.cu
@@ -2,16 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <cuda_runtime.h>
+#include "cuda/base/scoped_device_id.hpp"
 
+#include <cuda_runtime.h>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
-#include "cuda/base/scoped_device_id.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index c9d9e6bf124..df3cef86bb8 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -2,18 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <memory>
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
+#include <memory>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu
index 37b032eb794..2788767b078 100644
--- a/cuda/test/components/merging.cu
+++ b/cuda/test/components/merging.cu
@@ -4,20 +4,16 @@
 
 #include "cuda/components/merging.cuh"
 
-
 #include <algorithm>
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu
index ffe00c247c0..afe7fb4b442 100644
--- a/cuda/test/components/searching.cu
+++ b/cuda/test/components/searching.cu
@@ -4,19 +4,15 @@
 
 #include "cuda/components/searching.cuh"
 
-
 #include <memory>
 #include <numeric>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu
index 19c7daab782..e1524ce0078 100644
--- a/cuda/test/components/sorting.cu
+++ b/cuda/test/components/sorting.cu
@@ -4,18 +4,14 @@
 
 #include "cuda/components/sorting.cuh"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/solver/lower_trs_kernels.cu b/cuda/test/solver/lower_trs_kernels.cu
index 00d4f371ac5..67eeaf51847 100644
--- a/cuda/test/solver/lower_trs_kernels.cu
+++ b/cuda/test/solver/lower_trs_kernels.cu
@@ -2,24 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/lower_trs_kernels.hpp"
+
 #include <memory>
 #include <random>
 
-
 #include <cuda.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
-#include "core/solver/lower_trs_kernels.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/solver/upper_trs_kernels.cu b/cuda/test/solver/upper_trs_kernels.cu
index de2368be6ab..3ad061e2bc5 100644
--- a/cuda/test/solver/upper_trs_kernels.cu
+++ b/cuda/test/solver/upper_trs_kernels.cu
@@ -2,24 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/upper_trs_kernels.hpp"
+
 #include <memory>
 #include <random>
 
-
 #include <cuda.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
-#include "core/solver/upper_trs_kernels.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp
index a398deedd7b..17dd1fd8722 100644
--- a/cuda/test/utils.hpp
+++ b/cuda/test/utils.hpp
@@ -6,14 +6,11 @@
 #define GKO_CUDA_TEST_UTILS_HPP_
 
 
-#include "core/test/utils.hpp"
-
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "core/test/gtest/resources.hpp"
+#include "core/test/utils.hpp"
 #include "cuda/base/device.hpp"
 
 
diff --git a/cuda/test/utils/assertions_test.cu b/cuda/test/utils/assertions_test.cu
index 482744a893b..65b4cdc75a7 100644
--- a/cuda/test/utils/assertions_test.cu
+++ b/cuda/test/utils/assertions_test.cu
@@ -4,14 +4,11 @@
 
 #include "core/test/utils/assertions.hpp"
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "cuda/test/utils.hpp"
 
 
diff --git a/devices/device.cpp b/devices/device.cpp
index 5a036f491c1..cac4be8aa7b 100644
--- a/devices/device.cpp
+++ b/devices/device.cpp
@@ -5,7 +5,6 @@
 #include <memory>
 #include <mutex>
 
-
 #include <ginkgo/core/base/device.hpp>
 
 
diff --git a/devices/dpcpp/executor.cpp b/devices/dpcpp/executor.cpp
index 435d9426374..aaca50d3931 100644
--- a/devices/dpcpp/executor.cpp
+++ b/devices/dpcpp/executor.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <cstdlib>
 #include <cstring>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/devices/machine_topology.cpp b/devices/machine_topology.cpp
index af881af6df4..406580ef7d1 100644
--- a/devices/machine_topology.cpp
+++ b/devices/machine_topology.cpp
@@ -6,7 +6,6 @@
 #include <memory>
 #include <mutex>
 
-
 #include <ginkgo/core/base/machine_topology.hpp>
 
 
diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp
index 448d7b68d63..54b9c9c36be 100644
--- a/devices/omp/executor.cpp
+++ b/devices/omp/executor.cpp
@@ -4,11 +4,9 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <cstdlib>
 #include <cstring>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index bb84d945745..8f607725bc8 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -4,19 +4,15 @@
 
 #include "core/base/batch_multi_vector_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp
index e183e11dcf8..9c20a8574ef 100644
--- a/dpcpp/base/batch_struct.hpp
+++ b/dpcpp/base/batch_struct.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 
diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp
index 03a419bf260..12330c1b992 100644
--- a/dpcpp/base/config.hpp
+++ b/dpcpp/base/config.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
-
 #include "core/base/types.hpp"
 
 
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index a735470d5ba..f39615613fe 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -2,16 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <oneapi/dpl/algorithm>
 
-
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "dpcpp/base/onedpl.hpp"
 
 
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 159ee7eb533..29f0810d9d9 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <algorithm>
 #include <cctype>
 #include <iostream>
 #include <map>
 #include <string>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp
index 7e0f3f9ce8c..f4ae9f0560d 100644
--- a/dpcpp/base/helper.dp.cpp
+++ b/dpcpp/base/helper.dp.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
-
-
 #include "dpcpp/base/helper.hpp"
 
+#include <CL/sycl.hpp>
+
 
 namespace gko {
 namespace kernels {
diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 78b933a2e32..b8cf1a8451c 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -8,14 +8,11 @@
 
 #include <utility>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 
diff --git a/dpcpp/base/index_set_kernels.dp.cpp b/dpcpp/base/index_set_kernels.dp.cpp
index abd4caaa482..8f6c46d2405 100644
--- a/dpcpp/base/index_set_kernels.dp.cpp
+++ b/dpcpp/base/index_set_kernels.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 38928adf531..7aa117692f7 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -10,7 +10,6 @@
 
 #include <tuple>
 
-
 #include <CL/sycl.hpp>
 
 
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index a92eb89737f..83436966ecb 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -10,7 +10,6 @@
 
 #include <algorithm>
 
-
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp
index 8ea971f4602..213d4296700 100644
--- a/dpcpp/base/onedpl.hpp
+++ b/dpcpp/base/onedpl.hpp
@@ -8,7 +8,6 @@
 
 #include <oneapi/dpl/execution>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp
index 784c53b87bb..004c296553c 100644
--- a/dpcpp/base/onemkl_bindings.hpp
+++ b/dpcpp/base/onemkl_bindings.hpp
@@ -8,11 +8,9 @@
 
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/base/scoped_device_id.dp.cpp b/dpcpp/base/scoped_device_id.dp.cpp
index 97c8b6714d4..161c5a26003 100644
--- a/dpcpp/base/scoped_device_id.dp.cpp
+++ b/dpcpp/base/scoped_device_id.dp.cpp
@@ -5,7 +5,6 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 
-
 #include "core/base/noop_scoped_device_id_guard.hpp"
 
 
diff --git a/dpcpp/base/timer.dp.cpp b/dpcpp/base/timer.dp.cpp
index da347b14ddf..ed21e1b79a5 100644
--- a/dpcpp/base/timer.dp.cpp
+++ b/dpcpp/base/timer.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/base/timer.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 3126e9a6ab1..8168421a488 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/dpct.hpp"
 
 
diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp
index 879f0c25d25..c758cf42710 100644
--- a/dpcpp/components/cooperative_groups.dp.hpp
+++ b/dpcpp/components/cooperative_groups.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/config.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
 
diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp
index 3e19efebec2..626a225c4fa 100644
--- a/dpcpp/components/diagonal_block_manipulation.dp.hpp
+++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/components/format_conversion.dp.hpp b/dpcpp/components/format_conversion.dp.hpp
index a9a3ac408a3..17cf55389df 100644
--- a/dpcpp/components/format_conversion.dp.hpp
+++ b/dpcpp/components/format_conversion.dp.hpp
@@ -8,14 +8,11 @@
 
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
index 7230f5124b0..369a3dff8b9 100644
--- a/dpcpp/components/intrinsics.dp.hpp
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "dpcpp/base/dpct.hpp"
 
 
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
index f700364769e..8d2f96e70bf 100644
--- a/dpcpp/components/merging.dp.hpp
+++ b/dpcpp/components/merging.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
index 18287b82fe7..b1ae9da32bb 100644
--- a/dpcpp/components/prefix_sum.dp.hpp
+++ b/dpcpp/components/prefix_sum.dp.hpp
@@ -8,10 +8,8 @@
 
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp
index c8a663e20d4..a47f45e9565 100644
--- a/dpcpp/components/prefix_sum_kernels.dp.cpp
+++ b/dpcpp/components/prefix_sum_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/types.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/prefix_sum.dp.hpp"
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 1bdaa7dbb10..aed8166d601 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -8,15 +8,12 @@
 
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index 903492599bc..b4cbd1bb726 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -8,7 +8,6 @@
 
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
diff --git a/dpcpp/components/segment_scan.dp.hpp b/dpcpp/components/segment_scan.dp.hpp
index 23b2f0a15d0..b6c26523f30 100644
--- a/dpcpp/components/segment_scan.dp.hpp
+++ b/dpcpp/components/segment_scan.dp.hpp
@@ -8,7 +8,6 @@
 
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/components/sorting.dp.hpp b/dpcpp/components/sorting.dp.hpp
index 7b7ddacb221..e616903721c 100644
--- a/dpcpp/components/sorting.dp.hpp
+++ b/dpcpp/components/sorting.dp.hpp
@@ -8,7 +8,6 @@
 
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 5a1b98e4e05..09f7b24c6ee 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -8,7 +8,6 @@
 
 #include <CL/sycl.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
 
diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp
index 019a3cb6644..1d25cbf3837 100644
--- a/dpcpp/components/uninitialized_array.hpp
+++ b/dpcpp/components/uninitialized_array.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "dpcpp/base/dpct.hpp"
 
 
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
index 0f2c4644dea..dabc812930f 100644
--- a/dpcpp/components/warp_blas.dp.hpp
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -9,13 +9,10 @@
 #include <cassert>
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/config.hpp>
 
-
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
diff --git a/dpcpp/distributed/index_map_kernels.dp.cpp b/dpcpp/distributed/index_map_kernels.dp.cpp
index 84424976778..cf1b28140e1 100644
--- a/dpcpp/distributed/index_map_kernels.dp.cpp
+++ b/dpcpp/distributed/index_map_kernels.dp.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 
 
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 5649cb579c9..47adaaeca59 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
index c7a94baad54..28a0cfd5997 100644
--- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
@@ -2,12 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/execution>
 #include <oneapi/dpl/iterator>
 
-
 #include "core/distributed/partition_helpers_kernels.hpp"
 
 
diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp
index 5eeb2f85178..175ea3ac050 100644
--- a/dpcpp/distributed/partition_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_kernels.dp.cpp
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <oneapi/dpl/algorithm>
 #include <oneapi/dpl/iterator>
 
-
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "dpcpp/base/onedpl.hpp"
diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp
index 2f7769d37c3..fdc5dd2e52d 100644
--- a/dpcpp/distributed/vector_kernels.dp.cpp
+++ b/dpcpp/distributed/vector_kernels.dp.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp
index b69f50e8dfb..b381e6989e4 100644
--- a/dpcpp/factorization/cholesky_kernels.dp.cpp
+++ b/dpcpp/factorization/cholesky_kernels.dp.cpp
@@ -4,17 +4,13 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/factorization/elimination_forest.hpp"
 
 
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 374e966e46d..1d9912b4f12 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index c4a471b8c4b..a891b5b7b2f 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -4,14 +4,11 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 3f43a488abc..5428460fac5 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -4,15 +4,12 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index c65fd094955..fb99b662dec 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -4,20 +4,16 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp
index 61c059e8c7d..abfd2d72238 100644
--- a/dpcpp/factorization/par_ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/matrix/coo.hpp>
 
-
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index 9f3a3062ad6..776ffba3fb1 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -2,24 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 273e21e47fd..5ce9df8a0a9 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -2,20 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/dpcpp/factorization/par_ilut_kernels.dp.cpp b/dpcpp/factorization/par_ilut_kernels.dp.cpp
index cfde68b298b..5c9d4c6d769 100644
--- a/dpcpp/factorization/par_ilut_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilut_kernels.dp.cpp
@@ -4,22 +4,18 @@
 
 #include "core/factorization/par_ilut_kernels.hpp"
 
-
 #include <algorithm>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
diff --git a/dpcpp/factorization/par_ilut_select_common.dp.cpp b/dpcpp/factorization/par_ilut_select_common.dp.cpp
index 1ee22bcef2a..acf383f84a0 100644
--- a/dpcpp/factorization/par_ilut_select_common.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_common.dp.cpp
@@ -2,16 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
+#include "dpcpp/factorization/par_ilut_select_common.dp.hpp"
 
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -20,7 +18,6 @@
 #include "dpcpp/components/searching.dp.hpp"
 #include "dpcpp/components/sorting.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/factorization/par_ilut_select_common.dp.hpp"
 
 
 namespace gko {
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index 8b5e6f36d10..589f8267f21 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index 6ba0c7987cd..246228763bf 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <limits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index 9501a35f3c9..601e5dc12d3 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -2,20 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/dpcpp/log/batch_logger.hpp b/dpcpp/log/batch_logger.hpp
index 309c624d6fc..c6ba9044db4 100644
--- a/dpcpp/log/batch_logger.hpp
+++ b/dpcpp/log/batch_logger.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 31ef1e2e1e1..9feb824a3aa 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -4,17 +4,13 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "dpcpp/base/batch_struct.hpp"
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index 34efd9525fb..a9f6afce0f5 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -4,19 +4,15 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 9db98da7108..2cb40dc35eb 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -4,17 +4,13 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "dpcpp/base/batch_struct.hpp"
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
index e504afdbc81..77b9eb6b3d5 100644
--- a/dpcpp/matrix/batch_struct.hpp
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -6,14 +6,11 @@
 #define GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_
 
 
-#include "core/matrix/batch_struct.hpp"
-
-
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index ba7b5013253..595af92b33b 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -4,17 +4,14 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/matrix/dense_kernels.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index e276c7520c1..7e5d0229c86 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -21,7 +18,6 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/base/utils.hpp"
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 1192b893010..04f3229eaed 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -4,11 +4,9 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -19,7 +17,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index e1ee7ac8b17..2b63138abbe 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 55ec656ae25..a97cb602d52 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <array>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp
index 6a2b43a4165..bf858be51e3 100644
--- a/dpcpp/matrix/fbcsr_kernels.dp.cpp
+++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp
@@ -4,17 +4,14 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 
 
diff --git a/dpcpp/matrix/fft_kernels.dp.cpp b/dpcpp/matrix/fft_kernels.dp.cpp
index 713f0d99a0b..83c085e8d15 100644
--- a/dpcpp/matrix/fft_kernels.dp.cpp
+++ b/dpcpp/matrix/fft_kernels.dp.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp
index 83078369371..9c0fe717e8a 100644
--- a/dpcpp/matrix/sellp_kernels.dp.cpp
+++ b/dpcpp/matrix/sellp_kernels.dp.cpp
@@ -4,17 +4,14 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index f355216eb08..66c57ac5b35 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index 3241c8b1ed1..a9148c54ff4 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <oneapi/dpl/algorithm>
 
-
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "dpcpp/base/onedpl.hpp"
 
 
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
index 752ae1d41de..e66e7141a47 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index c4bc933a4d7..4082035ff9f 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -4,15 +4,12 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "dpcpp/base/config.hpp"
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index 0b54a14693c..e8c086ec0a6 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 2098b7057e7..0e26989808e 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -2,13 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
 
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index c23e9101d1a..d957ea2c5be 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
index d6c3d4ce14c..62ff7fdbb51 100644
--- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
@@ -2,14 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
 
diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
index 12b2251c7a5..886f96e88e3 100644
--- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index aade299f05b..c088ae8e986 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index ceed1affd14..25701c6dc55 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -2,12 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp
index 95a8fa38b80..350b4c90a6d 100644
--- a/dpcpp/reorder/rcm_kernels.dp.cpp
+++ b/dpcpp/reorder/rcm_kernels.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index aab068d103e..344e4af56b9 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 02c40424a35..0787afa6fd3 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_cg.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index dbf4bdfadcb..7ab010ba29f 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -4,19 +4,15 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index fdd924ad4d6..d59ada362f9 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -2,21 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/solver/idr_kernels.hpp"
+#include <oneapi/dpl/random>
 
+#include "core/solver/idr_kernels.hpp"
 
 #include <ctime>
 #include <random>
 
-
 #include <CL/sycl.hpp>
-#include <oneapi/dpl/random>
-
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp
index 6c4f8fa8537..449bfe5cfcf 100644
--- a/dpcpp/solver/lower_trs_kernels.dp.cpp
+++ b/dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/lower_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/solver/multigrid_kernels.dp.cpp b/dpcpp/solver/multigrid_kernels.dp.cpp
index d818211c28b..aaf0ab63354 100644
--- a/dpcpp/solver/multigrid_kernels.dp.cpp
+++ b/dpcpp/solver/multigrid_kernels.dp.cpp
@@ -4,13 +4,11 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 
 
diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp
index 3729492eb18..7ac4950fe82 100644
--- a/dpcpp/solver/upper_trs_kernels.dp.cpp
+++ b/dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/upper_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/stop/batch_criteria.hpp b/dpcpp/stop/batch_criteria.hpp
index 3818831df11..a0b12326302 100644
--- a/dpcpp/stop/batch_criteria.hpp
+++ b/dpcpp/stop/batch_criteria.hpp
@@ -10,7 +10,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/stop/batch_stop_enum.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/stop/criterion_kernels.dp.cpp b/dpcpp/stop/criterion_kernels.dp.cpp
index ea8ab78aace..2970263f6ae 100644
--- a/dpcpp/stop/criterion_kernels.dp.cpp
+++ b/dpcpp/stop/criterion_kernels.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 09aae963b16..ddb617a1a84 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -4,15 +4,12 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/base/array_access.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/synthesizer/implementation_selection.hpp b/dpcpp/synthesizer/implementation_selection.hpp
index 05cea2521b0..9bec1a42cd5 100644
--- a/dpcpp/synthesizer/implementation_selection.hpp
+++ b/dpcpp/synthesizer/implementation_selection.hpp
@@ -8,11 +8,9 @@
 
 #include <utility>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
-
 #include "dpcpp/base/config.hpp"
 
 
diff --git a/dpcpp/test/base/dim3.dp.cpp b/dpcpp/test/base/dim3.dp.cpp
index 6688e4e4163..cf0e5d1da30 100644
--- a/dpcpp/test/base/dim3.dp.cpp
+++ b/dpcpp/test/base/dim3.dp.cpp
@@ -4,10 +4,8 @@
 
 #include "dpcpp/base/dim3.dp.hpp"
 
-
 #include <CL/sycl.hpp>
 
-
 #include <gtest/gtest.h>
 
 
diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp
index 771330e08bf..83a29a3b6db 100644
--- a/dpcpp/test/base/executor.dp.cpp
+++ b/dpcpp/test/base/executor.dp.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <exception>
 #include <memory>
 #include <type_traits>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
 
 
 namespace {
diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp
index e95fac1082a..a6687583340 100644
--- a/dpcpp/test/base/kernel_launch.dp.cpp
+++ b/dpcpp/test/base/kernel_launch.dp.cpp
@@ -4,20 +4,16 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/base/array_access.hpp"
diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp
index ab94fc0364b..27e14b62d2d 100644
--- a/dpcpp/test/components/cooperative_groups.dp.cpp
+++ b/dpcpp/test/components/cooperative_groups.dp.cpp
@@ -4,22 +4,17 @@
 
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 
-
 #include <iostream>
 #include <memory>
 
-
 #include <CL/sycl.hpp>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/dpcpp/test/matrix/fbcsr_kernels.dp.cpp b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp
index 6d6f9fb6e93..98849e4fe00 100644
--- a/dpcpp/test/matrix/fbcsr_kernels.dp.cpp
+++ b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp
@@ -4,10 +4,8 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
index 3b62c328366..6dcfe460c71 100644
--- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
-
-
 #include <initializer_list>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
index 04dde86a07a..541798b8c00 100644
--- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
+++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp
index 84b9c37592b..3eb221b3a48 100644
--- a/examples/cb-gmres/cb-gmres.cpp
+++ b/examples/cb-gmres/cb-gmres.cpp
@@ -3,8 +3,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 // This is the main ginkgo header file.
-#include <ginkgo/ginkgo.hpp>
-
 #include <chrono>
 #include <cmath>
 #include <fstream>
@@ -12,6 +10,8 @@
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // Helper function which measures the time of `solver->apply(b, x)` in seconds
 // To get an accurate result, the solve is repeated multiple times (while
diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp
index a5e3cc94997..d2ec94215b3 100644
--- a/examples/custom-matrix-format/custom-matrix-format.cpp
+++ b/examples/custom-matrix-format/custom-matrix-format.cpp
@@ -6,8 +6,8 @@
 #include <map>
 #include <string>
 
-
 #include <omp.h>
+
 #include <ginkgo/ginkgo.hpp>
 
 
diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
index 39baed56f67..030e11323af 100644
--- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
+++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 #include <thread>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 /**
  * The ByInteraction class is a criterion which asks for user input to stop
diff --git a/examples/external-lib-interfacing/external-lib-interfacing.cpp b/examples/external-lib-interfacing/external-lib-interfacing.cpp
index 04824cb9578..a3b37b00b1a 100644
--- a/examples/external-lib-interfacing/external-lib-interfacing.cpp
+++ b/examples/external-lib-interfacing/external-lib-interfacing.cpp
@@ -69,7 +69,6 @@
 // not unlike the <code>Function</code> class, but with the difference that
 // the return value is tensor-valued rather than scalar of vector-valued.
 #include <deal.II/base/tensor_function.h>
-
 #include <deal.II/numerics/error_estimator.h>
 
 // Ginkgo's header file
diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp
index 5330dda1e7d..c36cf60c39c 100644
--- a/examples/ginkgo-overhead/ginkgo-overhead.cpp
+++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp
@@ -2,13 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <chrono>
 #include <cmath>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 [[noreturn]] void print_usage_and_exit(const char* name)
 {
diff --git a/examples/ginkgo-ranges/ginkgo-ranges.cpp b/examples/ginkgo-ranges/ginkgo-ranges.cpp
index 38486a25b2e..503ee8b62e3 100644
--- a/examples/ginkgo-ranges/ginkgo-ranges.cpp
+++ b/examples/ginkgo-ranges/ginkgo-ranges.cpp
@@ -2,10 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
 #include <iomanip>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // LU factorization implementation using Ginkgo ranges
 // For simplicity, we only consider square matrices, and no pivoting.
diff --git a/examples/heat-equation/heat-equation.cpp b/examples/heat-equation/heat-equation.cpp
index c026c343997..286559e1cc3 100644
--- a/examples/heat-equation/heat-equation.cpp
+++ b/examples/heat-equation/heat-equation.cpp
@@ -36,17 +36,15 @@ vector initialization, solver setup and the use of Ginkgo in a more complex
 setting.
 *****************************<DESCRIPTION>**********************************/
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <chrono>
 #include <fstream>
 #include <iostream>
 
-
 #include <opencv2/core.hpp>
 #include <opencv2/videoio.hpp>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // This function implements a simple Ginkgo-themed clamped color mapping for
 // values in the range [0,5].
diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
index ad7e1c07158..54a45f0f2e1 100644
--- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
+++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp
index 03c9f1fe5e8..a348cfe635c 100644
--- a/examples/inverse-iteration/inverse-iteration.cpp
+++ b/examples/inverse-iteration/inverse-iteration.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cmath>
 #include <complex>
 #include <fstream>
@@ -13,6 +10,8 @@
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
index 34fc684bcf6..10126427441 100644
--- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
+++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp
index aa38e54ede2..4684b425b0f 100644
--- a/examples/iterative-refinement/iterative-refinement.cpp
+++ b/examples/iterative-refinement/iterative-refinement.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/kokkos-assembly/kokkos-assembly.cpp b/examples/kokkos-assembly/kokkos-assembly.cpp
index d1c19d1b3e7..3eed9271d6c 100644
--- a/examples/kokkos-assembly/kokkos-assembly.cpp
+++ b/examples/kokkos-assembly/kokkos-assembly.cpp
@@ -5,12 +5,11 @@
 #include <iostream>
 #include <string>
 
-
 #include <Kokkos_Core.hpp>
 
+#include <ginkgo/ginkgo.hpp>
 
 #include <ginkgo/extensions/kokkos.hpp>
-#include <ginkgo/ginkgo.hpp>
 
 
 namespace gko::ext::kokkos::detail {
diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
index 0d4ba7d67d4..848742cf544 100644
--- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
+++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
@@ -2,9 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
 #include <iostream>
 
+#include <ginkgo/ginkgo.hpp>
+
 int main()
 {
     // Instantiate a CUDA executor
diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
index 6690c8e13d3..d598bb48a46 100644
--- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
+++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
index 08575d6306c..383c721a3e1 100644
--- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
+++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp
index ed6fda2c689..4e8b37f6732 100644
--- a/examples/mixed-precision-ir/mixed-precision-ir.cpp
+++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
index 9b114b611af..962e96c69a2 100644
--- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
+++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
index 51f17b7821c..64d39e806f3 100644
--- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
+++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
index 59c756e2a69..155d4a59370 100644
--- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
+++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
@@ -43,12 +43,13 @@ hand side vector changes when increasing the dimension.
 
 #include <array>
 #include <chrono>
-#include <ginkgo/ginkgo.hpp>
 #include <iostream>
 #include <map>
 #include <string>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
+
 // Stencil values. Ordering can be seen in the main function
 // Can also be changed by passing additional parameter when executing
 constexpr double default_alpha = 10.0 / 3.0;
diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp
index 6be633aff03..159d5cf647d 100644
--- a/examples/papi-logging/papi-logging.cpp
+++ b/examples/papi-logging/papi-logging.cpp
@@ -2,16 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
-#include <papi.h>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 #include <thread>
 
+#include <papi.h>
+
+#include <ginkgo/ginkgo.hpp>
+
 
 namespace {
 
diff --git a/examples/par-ilu-convergence/par-ilu-convergence.cpp b/examples/par-ilu-convergence/par-ilu-convergence.cpp
index bf0e4e7a990..72e72cf7480 100644
--- a/examples/par-ilu-convergence/par-ilu-convergence.cpp
+++ b/examples/par-ilu-convergence/par-ilu-convergence.cpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <string>
 
-
 #include <ginkgo/ginkgo.hpp>
 
 
diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp
index 00dafc45378..9f956106fd5 100644
--- a/examples/performance-debugging/performance-debugging.cpp
+++ b/examples/performance-debugging/performance-debugging.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <algorithm>
 #include <array>
 #include <chrono>
@@ -20,6 +17,8 @@
 #include <utility>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 template <typename ValueType>
 using vec = gko::matrix::Dense<ValueType>;
diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp
index d70dd1aa506..f508869c63d 100644
--- a/examples/poisson-solver/poisson-solver.cpp
+++ b/examples/poisson-solver/poisson-solver.cpp
@@ -2,12 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
 #include <iostream>
 #include <map>
 #include <string>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp
index 2291c3cb2ed..0284fdf26cc 100644
--- a/examples/preconditioned-solver/preconditioned-solver.cpp
+++ b/examples/preconditioned-solver/preconditioned-solver.cpp
@@ -2,14 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 int main(int argc, char* argv[])
 {
diff --git a/examples/preconditioner-export/preconditioner-export.cpp b/examples/preconditioner-export/preconditioner-export.cpp
index e6a405cde4a..c37951bcaff 100644
--- a/examples/preconditioner-export/preconditioner-export.cpp
+++ b/examples/preconditioner-export/preconditioner-export.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <functional>
 #include <iostream>
@@ -12,6 +9,8 @@
 #include <memory>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 const std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
     executors{{"reference", [] { return gko::ReferenceExecutor::create(); }},
diff --git a/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp b/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp
index 490e36ad387..7a227fd0ee2 100644
--- a/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp
+++ b/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp
@@ -7,7 +7,6 @@
 #include <map>
 #include <string>
 
-
 #include <ginkgo/ginkgo.hpp>
 
 
diff --git a/examples/schroedinger-splitting/schroedinger-splitting.cpp b/examples/schroedinger-splitting/schroedinger-splitting.cpp
index 4390287c30f..cadb186c23b 100644
--- a/examples/schroedinger-splitting/schroedinger-splitting.cpp
+++ b/examples/schroedinger-splitting/schroedinger-splitting.cpp
@@ -47,8 +47,6 @@ to the non-linear part, which turns it into the Gross–Pitaevskii equation.
 
 *****************************<DESCRIPTION>**********************************/
 
-#include <ginkgo/ginkgo.hpp>
-
 #include <algorithm>
 #include <chrono>
 #include <fstream>
@@ -58,6 +56,8 @@ to the non-linear part, which turns it into the Gross–Pitaevskii equation.
 #include <opencv2/core.hpp>
 #include <opencv2/videoio.hpp>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // This function implements a simple Ginkgo-themed clamped color mapping for
 // values in the range [0,5].
diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp
index 3bcbd834bc3..158f94cff25 100644
--- a/examples/simple-solver-logging/simple-solver-logging.cpp
+++ b/examples/simple-solver-logging/simple-solver-logging.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <string>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 namespace {
 
diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
index a28ab925c88..0f77d69cf3d 100644
--- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
+++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
@@ -41,12 +41,13 @@ use Ginkgo, and the only part where Ginkgo is introduced is inside the
 `solve_system` function.
 *****************************<DESCRIPTION>**********************************/
 
-#include <ginkgo/ginkgo.hpp>
 #include <iostream>
 #include <map>
 #include <string>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 // Creates a stencil matrix in CSR format for the given number of discretization
 // points.
diff --git a/extensions/test/config/json_config.cpp b/extensions/test/config/json_config.cpp
index 13191a2ff9a..a46cdd93628 100644
--- a/extensions/test/config/json_config.cpp
+++ b/extensions/test/config/json_config.cpp
@@ -4,15 +4,12 @@
 
 #include <stdexcept>
 
-
 #include <gtest/gtest.h>
 #include <nlohmann/json.hpp>
 
-
 #include <ginkgo/core/config/property_tree.hpp>
 #include <ginkgo/extensions/config/json_config.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "extensions/test/config/file_location.hpp"
 
diff --git a/extensions/test/kokkos/kokkos_main.cpp b/extensions/test/kokkos/kokkos_main.cpp
index e541d362244..7a85c379cdd 100644
--- a/extensions/test/kokkos/kokkos_main.cpp
+++ b/extensions/test/kokkos/kokkos_main.cpp
@@ -4,10 +4,8 @@
 
 #include <Kokkos_Core.hpp>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/gtest/environments.hpp"
 
 
diff --git a/extensions/test/kokkos/spaces.cpp b/extensions/test/kokkos/spaces.cpp
index 47e24aac93e..e15c3579564 100644
--- a/extensions/test/kokkos/spaces.cpp
+++ b/extensions/test/kokkos/spaces.cpp
@@ -4,13 +4,10 @@
 
 #include <Kokkos_Core.hpp>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/extensions/kokkos/spaces.hpp>
 
-
 #include "core/test/gtest/environments.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/extensions/test/kokkos/types.cpp b/extensions/test/kokkos/types.cpp
index 4bff41499e9..bb3252b149c 100644
--- a/extensions/test/kokkos/types.cpp
+++ b/extensions/test/kokkos/types.cpp
@@ -5,18 +5,14 @@
 #include <cstring>
 #include <sstream>
 
-
 #include <Kokkos_Core.hpp>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/extensions/kokkos/spaces.hpp>
 #include <ginkgo/extensions/kokkos/types.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
index 74e6c34dc5d..86b16c8975d 100644
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/base/batch_multi_vector_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
index 4f09ec66bb8..3e4cba6a747 100644
--- a/hip/base/batch_struct.hip.hpp
+++ b/hip/base/batch_struct.hip.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index 89dc67255fc..e74153cc34e 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/math.hip.hpp"
 
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index d1d4325c6f1..f9e5dadce52 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/base/device.hpp"
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp
index 5a0b762ea57..d63a8e27ed5 100644
--- a/hip/base/device_matrix_data_kernels.hip.cpp
+++ b/hip/base/device_matrix_data_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -13,7 +12,6 @@
 #include <thrust/sort.h>
 #include <thrust/tuple.h>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
 
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index 05b030ad375..c83778951d0 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/exception.hpp"
 
-
 #include <string>
 
 
@@ -21,7 +20,6 @@
 
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index e371e48f489..9e09912c5c9 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <iostream>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index d5dc94d6138..21c44e664b8 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -16,7 +16,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 9fd7ade8231..a76274c45a7 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -15,7 +15,6 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 0337f0a03c6..af01f9dc94a 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -16,7 +16,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
index 6fb70c4571c..d68ceb48ddd 100644
--- a/hip/base/hipsparse_block_bindings.hip.hpp
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -15,7 +15,6 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
diff --git a/hip/base/index_set_kernels.hip.cpp b/hip/base/index_set_kernels.hip.cpp
index a246b5bf57e..9f9f967fe35 100644
--- a/hip/base/index_set_kernels.hip.cpp
+++ b/hip/base/index_set_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
index 890b9922a4c..ff9f398c0bc 100644
--- a/hip/base/kernel_launch.hip.hpp
+++ b/hip/base/kernel_launch.hip.hpp
@@ -10,7 +10,6 @@
 
 #include <thrust/tuple.h>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp
index f9427089126..9f577812f3e 100644
--- a/hip/base/math.hip.hpp
+++ b/hip/base/math.hip.hpp
@@ -6,11 +6,10 @@
 #define GKO_HIP_BASE_MATH_HIP_HPP_
 
 
-#include <ginkgo/core/base/math.hpp>
-
-
 #include <thrust/complex.h>
 
+#include <ginkgo/core/base/math.hpp>
+
 
 namespace gko {
 
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 27d510d784b..6ac3070192a 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/base/memory.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index 5cd4b3ec58f..d14c8468c0b 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -22,7 +22,6 @@
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
index 46dad3be816..9f4f44ec815 100644
--- a/hip/base/roctx.hip.cpp
+++ b/hip/base/roctx.hip.cpp
@@ -4,7 +4,6 @@
 
 #include <ginkgo/config.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp
index 1fd7211b106..e16c2b5701a 100644
--- a/hip/base/scoped_device_id.hip.cpp
+++ b/hip/base/scoped_device_id.hip.cpp
@@ -2,15 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "hip/base/scoped_device_id.hip.hpp"
+
 #include <exception>
 #include <utility>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
-#include "hip/base/scoped_device_id.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index d5acb978e22..d57f63c4e7c 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "ginkgo/core/base/stream.hpp"
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp
index 2c0412fb67d..2aecdd79328 100644
--- a/hip/base/thrust.hip.hpp
+++ b/hip/base/thrust.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <thrust/execution_policy.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #if GINKGO_HIP_PLATFORM_HCC
diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp
index 67a9a8153b6..800f4a739c1 100644
--- a/hip/base/timer.hip.cpp
+++ b/hip/base/timer.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "ginkgo/core/base/timer.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index 9ae2224c064..bb0d4a2d0c9 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -6,15 +6,13 @@
 #define GKO_HIP_BASE_TYPES_HIP_HPP_
 
 
-#include <ginkgo/core/base/types.hpp>
-
-
 #include <type_traits>
 
-
 #include <hip/hip_complex.h>
 #include <hip/hip_fp16.h>
 
+#include <ginkgo/core/base/types.hpp>
+
 
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
@@ -23,10 +21,8 @@
 #endif
 #include <thrust/complex.h>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
index 0dc8d7a3b46..64d39a90d78 100644
--- a/hip/components/atomic.hip.hpp
+++ b/hip/components/atomic.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
 
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index e81441a092b..d3dbc44a5c8 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
index 290511e7583..7a3893fa031 100644
--- a/hip/components/diagonal_block_manipulation.hip.hpp
+++ b/hip/components/diagonal_block_manipulation.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index 07daf486d84..d2cbc3062a5 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp
index 4bb6fa19ec0..d8238c11795 100644
--- a/hip/components/memory.hip.hpp
+++ b/hip/components/memory.hip.hpp
@@ -9,10 +9,8 @@
 #include <cstring>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 
 
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
index 5acde03cbec..deb78288e6c 100644
--- a/hip/components/prefix_sum.hip.hpp
+++ b/hip/components/prefix_sum.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/reduction.hip.hpp"
diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp
index ad55c0954d1..283e8c161a1 100644
--- a/hip/components/prefix_sum_kernels.hip.cpp
+++ b/hip/components/prefix_sum_kernels.hip.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <limits>
 
-
 #include <thrust/scan.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 
-
 #include "hip/base/thrust.hip.hpp"
 
 
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
index fb0539952ff..bc2594dd96d 100644
--- a/hip/components/reduction.hip.hpp
+++ b/hip/components/reduction.hip.hpp
@@ -8,11 +8,9 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
index 7627a0a2781..c174224c9c4 100644
--- a/hip/components/syncfree.hip.hpp
+++ b/hip/components/syncfree.hip.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/memory.hpp"
diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp
index 8ac59719aa7..9164a1914b3 100644
--- a/hip/components/warp_blas.hip.hpp
+++ b/hip/components/warp_blas.hip.hpp
@@ -9,10 +9,8 @@
 #include <cassert>
 #include <type_traits>
 
-
 #include <ginkgo/config.hpp>
 
-
 #include "hip/base/math.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 
diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp
index d45674a66a3..536b09a1bb1 100644
--- a/hip/distributed/index_map_kernels.hip.cpp
+++ b/hip/distributed/index_map_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -18,10 +17,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/unique.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "hip/base/thrust.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/searching.hip.hpp"
diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp
index 54cde64c429..535fdaacb44 100644
--- a/hip/distributed/matrix_kernels.hip.cpp
+++ b/hip/distributed/matrix_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -17,10 +16,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/unique.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "hip/base/thrust.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 
diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp
index 744d8de887b..a2083a55303 100644
--- a/hip/distributed/partition_helpers_kernels.hip.cpp
+++ b/hip/distributed/partition_helpers_kernels.hip.cpp
@@ -4,13 +4,11 @@
 
 #include "core/distributed/partition_helpers_kernels.hpp"
 
-
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include "hip/base/thrust.hip.hpp"
 
 
diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp
index 00dc74b910f..c2c4a8f28ea 100644
--- a/hip/distributed/partition_kernels.hip.cpp
+++ b/hip/distributed/partition_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
@@ -12,7 +11,6 @@
 #include <thrust/scan.h>
 #include <thrust/sort.h>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "hip/base/thrust.hip.hpp"
diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp
index fc6718dec0d..eff7936076d 100644
--- a/hip/distributed/vector_kernels.hip.cpp
+++ b/hip/distributed/vector_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -12,10 +11,8 @@
 #include <thrust/scatter.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "hip/base/thrust.hip.hpp"
 
 
diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp
index 419db21b811..1c1ce1d3170 100644
--- a/hip/factorization/cholesky_kernels.hip.cpp
+++ b/hip/factorization/cholesky_kernels.hip.cpp
@@ -4,11 +4,9 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/sequence.h>
@@ -16,10 +14,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
index 4080768bc07..d6768e5e9c6 100644
--- a/hip/factorization/factorization_kernels.hip.cpp
+++ b/hip/factorization/factorization_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp
index edda974fd36..cfbb12bd5b3 100644
--- a/hip/factorization/ic_kernels.hip.cpp
+++ b/hip/factorization/ic_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp
index f50df5ca75b..45d468d0500 100644
--- a/hip/factorization/ilu_kernels.hip.cpp
+++ b/hip/factorization/ilu_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/ilu_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
index ec3e771134e..8e37d1a2445 100644
--- a/hip/factorization/lu_kernels.hip.cpp
+++ b/hip/factorization/lu_kernels.hip.cpp
@@ -4,19 +4,15 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <thrust/copy.h>
 #include <thrust/iterator/transform_output_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
index e4cd0b2470b..f0e0cb0b632 100644
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ b/hip/factorization/par_ic_kernels.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
index 7f5dba82eba..99b2f09274b 100644
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -4,14 +4,12 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index 49608d6801f..b4897a23cf9 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/coo.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/memory.hpp"
diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
index b5612ea29c6..b4fdd7e6e6d 100644
--- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
index e6d0a6348cc..8f91e6f7087 100644
--- a/hip/factorization/par_ilut_filter_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp
@@ -2,21 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index 5486b3f5ba5..098ce5c9887 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include "common/cuda_hip/base/runtime.hpp"
@@ -11,7 +10,6 @@
 
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
index b259133b95d..55180bc3d05 100644
--- a/hip/factorization/par_ilut_select_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernels.hip.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
index df77b1ba7a2..200a16ea849 100644
--- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
index 0f1e6455812..b3994706567 100644
--- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
index de73576ffed..4b0e6799834 100644
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
index 5d3b9d8cef9..328f268251f 100644
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index d415f114c3b..01294ac3d63 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <thrust/functional.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index 16a267d95b6..bb9f7912cd6 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -6,15 +6,12 @@
 #define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
 
 
-#include "core/matrix/batch_struct.hpp"
-
-
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
index 8f7a050ef87..fe78b938e3c 100644
--- a/hip/matrix/coo_kernels.hip.cpp
+++ b/hip/matrix/coo_kernels.hip.cpp
@@ -4,14 +4,12 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 8b3579f049c..acd0b0144bb 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -16,7 +14,6 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -26,7 +23,6 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 8fed3c97c1b..82599050719 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -16,7 +15,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp
index 01033004c6b..b9585db9b41 100644
--- a/hip/matrix/diagonal_kernels.hip.cpp
+++ b/hip/matrix/diagonal_kernels.hip.cpp
@@ -4,11 +4,9 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 4f1ff6a3539..cb8cca32d89 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -4,17 +4,14 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <array>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "common/cuda_hip/base/config.hpp"
diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
index 0286aff0bba..c5d49215042 100644
--- a/hip/matrix/fbcsr_kernels.template.hip.cpp
+++ b/hip/matrix/fbcsr_kernels.template.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -16,14 +14,12 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index 31e180b4414..6b14aaf067d 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <array>
 
 
@@ -19,7 +18,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/matrix/fft_kernels_stub.hip.cpp b/hip/matrix/fft_kernels_stub.hip.cpp
index f50bec4ff0b..210349e58e4 100644
--- a/hip/matrix/fft_kernels_stub.hip.cpp
+++ b/hip/matrix/fft_kernels_stub.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index f1e15c946e0..4caf83fdaa1 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -4,14 +4,12 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index 487b134d28a..7a7a4ba49d5 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <thrust/sort.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "common/cuda_hip/base/config.hpp"
diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp
index 18c1f0957c4..da5890315bc 100644
--- a/hip/multigrid/pgm_kernels.hip.cpp
+++ b/hip/multigrid/pgm_kernels.hip.cpp
@@ -4,21 +4,17 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
 #include <thrust/tuple.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
 
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index f366636a48f..db6e5a27b58 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
index 4eaf65cc438..d3c2bd0fb1d 100644
--- a/hip/preconditioner/isai_kernels.hip.cpp
+++ b/hip/preconditioner/isai_kernels.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
index 0a78eac4145..0eccbb2d6eb 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
@@ -2,13 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index 358c6f3b337..7e6311bcd52 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -2,18 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 6365f6c132e..9f2d3238a83 100644
--- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -2,19 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index 4634f8a0c57..3685df4aa0e 100644
--- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -2,18 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index a3b2b7e5412..122e53f636d 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -4,10 +4,8 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index 37b78f17469..d922d178f88 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -2,18 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index 421a32c3efc..baa847c58a5 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -2,17 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp
index 9a5739064eb..9ac6e44e173 100644
--- a/hip/reorder/rcm_kernels.hip.cpp
+++ b/hip/reorder/rcm_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -16,7 +15,6 @@
 #include <thrust/sort.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -24,7 +22,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "hip/base/thrust.hip.hpp"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index fdeb0580931..44e2f0f3c48 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 47c2bc498eb..450d02a302c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
index 2f2df4ddf84..fd046d000b4 100644
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ b/hip/solver/cb_gmres_kernels.hip.cpp
@@ -4,16 +4,13 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index 9fac4be8547..ce5cd4192a9 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -20,7 +20,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index b1ef414c091..c516597bd2b 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/idr_kernels.hpp"
 
-
 #include <ctime>
 #include <random>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/randlib_bindings.hpp"
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index d355940a487..322c87d37b3 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/lower_trs_kernels.hpp"
 
-
 #include <memory>
 
 
@@ -19,7 +18,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
index f68105ba6d8..6e19606a78e 100644
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ b/hip/solver/multigrid_kernels.hip.cpp
@@ -4,13 +4,11 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index 2a31e450d27..6be850959cb 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/upper_trs_kernels.hpp"
 
-
 #include <memory>
 
 
@@ -19,7 +18,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
index 3d24daa5bd5..8f856f0ed8d 100644
--- a/hip/stop/criterion_kernels.hip.cpp
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index 7f2b0646ea2..eb6c89a2e2e 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -4,12 +4,10 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp
index 5f2dd3cd881..85a28fc1c41 100644
--- a/hip/test/base/exception_helpers.hip.cpp
+++ b/hip/test/base/exception_helpers.hip.cpp
@@ -2,10 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
 #include <hip/hip_runtime.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hiprand/hiprand.h>
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index 266532823e7..55d8ffe5863 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -2,25 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
 // clang-format on
 
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
 
 #include "common/cuda_hip/base/executor.hpp.inc"
 #include "hip/base/scoped_device_id.hip.hpp"
diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp
index 10ebac1bbc6..50111fd5712 100644
--- a/hip/test/base/hip_executor_topology.hip.cpp
+++ b/hip/test/base/hip_executor_topology.hip.cpp
@@ -2,20 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
 // clang-format on
 
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 #include <thread>
 #include <type_traits>
 
+#include <ginkgo/core/base/executor.hpp>
+
 
 #if defined(__unix__) || defined(__APPLE__)
 #include <numa.h>
@@ -25,11 +23,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/base/index_set.cpp b/hip/test/base/index_set.cpp
index fdca7ebb905..c34ff5693c2 100644
--- a/hip/test/base/index_set.cpp
+++ b/hip/test/base/index_set.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/index_set.hpp>
-
-
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/range.hpp>
 
-
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp
index 4eea4805d87..4ab5bf12602 100644
--- a/hip/test/base/kernel_launch.hip.cpp
+++ b/hip/test/base/kernel_launch.hip.cpp
@@ -4,20 +4,16 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/base/array_access.hpp"
diff --git a/hip/test/base/lin_op.cpp b/hip/test/base/lin_op.cpp
index dbc0235f67e..939ad3046d0 100644
--- a/hip/test/base/lin_op.cpp
+++ b/hip/test/base/lin_op.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/lin_op.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/lin_op.hpp>
+
 
 namespace {
 
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index f018c634a6a..1a882989854 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -2,30 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
 // clang-format on
 
 
-#include <ginkgo/core/base/math.hpp>
-
+#include "hip/base/math.hip.hpp"
 
 #include <cmath>
 #include <complex>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/base/memory.cpp b/hip/test/base/memory.cpp
index ece86d640ad..651630fce08 100644
--- a/hip/test/base/memory.cpp
+++ b/hip/test/base/memory.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/memory.hpp>
-
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
-
+#include <ginkgo/core/base/memory.hpp>
 
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp
index 78d51fc989d..07c40214297 100644
--- a/hip/test/base/scoped_device_id.hip.cpp
+++ b/hip/test/base/scoped_device_id.hip.cpp
@@ -2,20 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // prevent compilation failure related to disappearing assert(...) statements
 #include <hip/hip_runtime.h>
 // clang-format on
 
 
-#include <gtest/gtest.h>
+#include "hip/base/scoped_device_id.hip.hpp"
 
+#include <gtest/gtest.h>
 
 #include <ginkgo/core/base/executor.hpp>
 
-
-#include "hip/base/scoped_device_id.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index f99b4eb8a87..06a104a8879 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -2,26 +2,23 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
 // clang-format on
 
 
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
 #include <cstring>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index be18447a901..7fc3b9a173a 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
@@ -11,20 +10,16 @@
 
 #include "hip/components/merging.hip.hpp"
 
-
 #include <algorithm>
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 252e8841893..85c54075231 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 // clang-format off
 // TODO remove when the HIP includes are fixed
 #include <hip/hip_runtime.h>
@@ -11,19 +10,15 @@
 
 #include "hip/components/searching.hip.hpp"
 
-
 #include <memory>
 #include <numeric>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp
index 5cab0048a4b..79de1dc2269 100644
--- a/hip/test/components/sorting.hip.cpp
+++ b/hip/test/components/sorting.hip.cpp
@@ -4,18 +4,14 @@
 
 #include "hip/components/sorting.hip.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp
index 27ff7309ea4..0b4b16086ca 100644
--- a/hip/test/matrix/fbcsr_kernels.cpp
+++ b/hip/test/matrix/fbcsr_kernels.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
-
+#include "core/matrix/fbcsr_kernels.hpp"
 
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 
-
-#include "core/matrix/fbcsr_kernels.hpp"
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/fb_matrix_generator.hpp"
diff --git a/hip/test/matrix/fft_kernels.hip.cpp b/hip/test/matrix/fft_kernels.hip.cpp
index 366e04c3290..d3ec4d1c58a 100644
--- a/hip/test/matrix/fft_kernels.hip.cpp
+++ b/hip/test/matrix/fft_kernels.hip.cpp
@@ -2,10 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fft.hpp>
-
-
 #include <hip/hip_runtime.h>
+
+#include <ginkgo/core/matrix/fft.hpp>
 #if HIP_VERSION >= 50200000
 #include <hipfft/hipfft.h>
 #else
@@ -15,7 +14,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/hip/test/solver/lower_trs_kernels.cpp b/hip/test/solver/lower_trs_kernels.cpp
index c2ad9cda357..d249ae3cca3 100644
--- a/hip/test/solver/lower_trs_kernels.cpp
+++ b/hip/test/solver/lower_trs_kernels.cpp
@@ -2,21 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/lower_trs_kernels.hpp"
+
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
-#include "core/solver/lower_trs_kernels.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/solver/upper_trs_kernels.cpp b/hip/test/solver/upper_trs_kernels.cpp
index c161bfcd32f..fbe8259bad6 100644
--- a/hip/test/solver/upper_trs_kernels.cpp
+++ b/hip/test/solver/upper_trs_kernels.cpp
@@ -2,21 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/upper_trs_kernels.hpp"
+
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
-#include "core/solver/upper_trs_kernels.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp
index 112ae6c24e1..3f5e1f1f858 100644
--- a/hip/test/utils.hip.hpp
+++ b/hip/test/utils.hip.hpp
@@ -6,14 +6,11 @@
 #define GKO_HIP_TEST_UTILS_HIP_HPP_
 
 
-#include "core/test/utils.hpp"
-
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "core/test/gtest/resources.hpp"
+#include "core/test/utils.hpp"
 #include "hip/base/device.hpp"
 
 
diff --git a/hip/test/utils/assertions_test.cpp b/hip/test/utils/assertions_test.cpp
index 17363313ab5..582967469ec 100644
--- a/hip/test/utils/assertions_test.cpp
+++ b/hip/test/utils/assertions_test.cpp
@@ -4,14 +4,11 @@
 
 #include "core/test/utils/assertions.hpp"
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp
index 8ba10a648d1..5c799ab58f1 100644
--- a/include/ginkgo/core/base/abstract_factory.hpp
+++ b/include/ginkgo/core/base/abstract_factory.hpp
@@ -8,7 +8,6 @@
 
 #include <unordered_map>
 
-
 #include <ginkgo/core/base/polymorphic_object.hpp>
 
 
diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp
index 5d88206cf2b..e0cf8c22ab3 100644
--- a/include/ginkgo/core/base/array.hpp
+++ b/include/ginkgo/core/base/array.hpp
@@ -12,7 +12,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp
index de0699d6876..e34c9a4c2c4 100644
--- a/include/ginkgo/core/base/batch_dim.hpp
+++ b/include/ginkgo/core/base/batch_dim.hpp
@@ -8,7 +8,6 @@
 
 #include <iostream>
 
-
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp
index 9de2db6b724..701d25bbcb3 100644
--- a/include/ginkgo/core/base/batch_lin_op.hpp
+++ b/include/ginkgo/core/base/batch_lin_op.hpp
@@ -10,7 +10,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index 66ea677fca9..d04e9562fce 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -9,7 +9,6 @@
 #include <initializer_list>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_dim.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp
index 8992394ea0d..f3cdea82dcb 100644
--- a/include/ginkgo/core/base/combination.hpp
+++ b/include/ginkgo/core/base/combination.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 
 
diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp
index 62b854264a2..e151e121b56 100644
--- a/include/ginkgo/core/base/composition.hpp
+++ b/include/ginkgo/core/base/composition.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 
diff --git a/include/ginkgo/core/base/dense_cache.hpp b/include/ginkgo/core/base/dense_cache.hpp
index a9dd8b57ba2..dd2918ab6a7 100644
--- a/include/ginkgo/core/base/dense_cache.hpp
+++ b/include/ginkgo/core/base/dense_cache.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index 90240b6791a..5b82d79f0b8 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -11,7 +11,6 @@
 #include <mutex>
 #include <type_traits>
 
-
 #include <ginkgo/config.hpp>
 
 
diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp
index 10a4a90fa2d..ffa38aa6a76 100644
--- a/include/ginkgo/core/base/dim.hpp
+++ b/include/ginkgo/core/base/dim.hpp
@@ -8,7 +8,6 @@
 
 #include <iostream>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp
index 678c714dada..febc5e17034 100644
--- a/include/ginkgo/core/base/exception.hpp
+++ b/include/ginkgo/core/base/exception.hpp
@@ -9,7 +9,6 @@
 #include <exception>
 #include <string>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp
index 0482a50a334..f0104ba1a7c 100644
--- a/include/ginkgo/core/base/exception_helpers.hpp
+++ b/include/ginkgo/core/base/exception_helpers.hpp
@@ -8,7 +8,6 @@
 
 #include <typeinfo>
 
-
 #include <ginkgo/core/base/batch_dim.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/exception.hpp>
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 761405c0b3d..0d592485c1c 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -17,7 +17,6 @@
 #include <type_traits>
 #include <vector>
 
-
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/fwd_decls.hpp>
 #include <ginkgo/core/base/machine_topology.hpp>
diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp
index 8a0a60972a7..260896d6b2f 100644
--- a/include/ginkgo/core/base/index_set.hpp
+++ b/include/ginkgo/core/base/index_set.hpp
@@ -11,7 +11,6 @@
 #include <mutex>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp
index 941a32458c7..37e7f361781 100644
--- a/include/ginkgo/core/base/intrinsics.hpp
+++ b/include/ginkgo/core/base/intrinsics.hpp
@@ -8,7 +8,6 @@
 
 #include <bitset>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp
index f9f60f9c3c4..26e1c1b9baa 100644
--- a/include/ginkgo/core/base/lin_op.hpp
+++ b/include/ginkgo/core/base/lin_op.hpp
@@ -10,7 +10,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/base/machine_topology.hpp b/include/ginkgo/core/base/machine_topology.hpp
index 453281395ef..0a1fff15268 100644
--- a/include/ginkgo/core/base/machine_topology.hpp
+++ b/include/ginkgo/core/base/machine_topology.hpp
@@ -16,7 +16,6 @@
 #include <type_traits>
 #include <vector>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 30b0da475d0..42eff5a5d40 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -13,7 +13,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp
index 4eeed1fd702..6993f2004f2 100644
--- a/include/ginkgo/core/base/matrix_assembly_data.hpp
+++ b/include/ginkgo/core/base/matrix_assembly_data.hpp
@@ -12,7 +12,6 @@
 #include <tuple>
 #include <unordered_map>
 
-
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp
index 954bf678b18..0edb39a9c6d 100644
--- a/include/ginkgo/core/base/matrix_data.hpp
+++ b/include/ginkgo/core/base/matrix_data.hpp
@@ -11,7 +11,6 @@
 #include <tuple>
 #include <vector>
 
-
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range.hpp>
diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp
index 0909dce5cea..64c04e1805a 100644
--- a/include/ginkgo/core/base/mpi.hpp
+++ b/include/ginkgo/core/base/mpi.hpp
@@ -10,7 +10,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/base/mtx_io.hpp b/include/ginkgo/core/base/mtx_io.hpp
index 14c04244df3..102cb446cc4 100644
--- a/include/ginkgo/core/base/mtx_io.hpp
+++ b/include/ginkgo/core/base/mtx_io.hpp
@@ -8,7 +8,6 @@
 
 #include <istream>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
 
 
diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp
index 6db017ac5b4..b6f2f95c008 100644
--- a/include/ginkgo/core/base/perturbation.hpp
+++ b/include/ginkgo/core/base/perturbation.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp
index e7e3d4b154b..3a17cfd27ef 100644
--- a/include/ginkgo/core/base/polymorphic_object.hpp
+++ b/include/ginkgo/core/base/polymorphic_object.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/log/logger.hpp>
diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp
index 322ac246385..680bc47bcb6 100644
--- a/include/ginkgo/core/base/range.hpp
+++ b/include/ginkgo/core/base/range.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp
index 9046d33cf85..56335b8dd97 100644
--- a/include/ginkgo/core/base/range_accessors.hpp
+++ b/include/ginkgo/core/base/range_accessors.hpp
@@ -8,7 +8,6 @@
 
 #include <array>
 
-
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/include/ginkgo/core/base/segmented_array.hpp b/include/ginkgo/core/base/segmented_array.hpp
index a31273c0f06..49a7e6f9d38 100644
--- a/include/ginkgo/core/base/segmented_array.hpp
+++ b/include/ginkgo/core/base/segmented_array.hpp
@@ -5,7 +5,6 @@
 #pragma once
 #include <numeric>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 
diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp
index baa348a34c9..2e4cc40dcf7 100644
--- a/include/ginkgo/core/base/temporary_clone.hpp
+++ b/include/ginkgo/core/base/temporary_clone.hpp
@@ -10,7 +10,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
diff --git a/include/ginkgo/core/base/temporary_conversion.hpp b/include/ginkgo/core/base/temporary_conversion.hpp
index 6b8528f11ec..d0e14806719 100644
--- a/include/ginkgo/core/base/temporary_conversion.hpp
+++ b/include/ginkgo/core/base/temporary_conversion.hpp
@@ -10,7 +10,6 @@
 #include <tuple>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp
index 8008cecfb94..6f647330126 100644
--- a/include/ginkgo/core/base/timer.hpp
+++ b/include/ginkgo/core/base/timer.hpp
@@ -8,7 +8,6 @@
 
 #include <chrono>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index 62ffa6be554..faa74974703 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -10,7 +10,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp
index de2f6abe485..9fad9430527 100644
--- a/include/ginkgo/core/base/version.hpp
+++ b/include/ginkgo/core/base/version.hpp
@@ -8,7 +8,6 @@
 
 #include <ostream>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/include/ginkgo/core/config/config.hpp b/include/ginkgo/core/config/config.hpp
index 4bbf58d8584..27c08caa3a3 100644
--- a/include/ginkgo/core/config/config.hpp
+++ b/include/ginkgo/core/config/config.hpp
@@ -10,7 +10,6 @@
 #include <string>
 #include <unordered_map>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
diff --git a/include/ginkgo/core/config/registry.hpp b/include/ginkgo/core/config/registry.hpp
index 9a6dc23b6ae..1e5073f8c42 100644
--- a/include/ginkgo/core/config/registry.hpp
+++ b/include/ginkgo/core/config/registry.hpp
@@ -13,7 +13,6 @@
 #include <unordered_map>
 #include <utility>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/distributed/lin_op.hpp b/include/ginkgo/core/distributed/lin_op.hpp
index a84425be465..144c9654eb5 100644
--- a/include/ginkgo/core/distributed/lin_op.hpp
+++ b/include/ginkgo/core/distributed/lin_op.hpp
@@ -10,7 +10,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/config.hpp>
 
 
diff --git a/include/ginkgo/core/distributed/polymorphic_object.hpp b/include/ginkgo/core/distributed/polymorphic_object.hpp
index 5cfe55049e6..553dc4d2d19 100644
--- a/include/ginkgo/core/distributed/polymorphic_object.hpp
+++ b/include/ginkgo/core/distributed/polymorphic_object.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/config.hpp>
 
 
diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp
index c5b0c6c2e58..0b3a7fb0caf 100644
--- a/include/ginkgo/core/factorization/cholesky.hpp
+++ b/include/ginkgo/core/factorization/cholesky.hpp
@@ -4,7 +4,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp
index 2b2f213341a..616360ce039 100644
--- a/include/ginkgo/core/factorization/ic.hpp
+++ b/include/ginkgo/core/factorization/ic.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index 96efcd1f4ba..80f11ab7b6f 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index c77fd48f6bb..d00f5a111b3 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -4,7 +4,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp
index 35cf01f3c79..b5f14a997b4 100644
--- a/include/ginkgo/core/factorization/par_ic.hpp
+++ b/include/ginkgo/core/factorization/par_ic.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp
index 904878e2a0a..bc2e38eadf4 100644
--- a/include/ginkgo/core/factorization/par_ict.hpp
+++ b/include/ginkgo/core/factorization/par_ict.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp
index d147d912749..88d183a939c 100644
--- a/include/ginkgo/core/factorization/par_ilu.hpp
+++ b/include/ginkgo/core/factorization/par_ilu.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp
index 166dc04a973..c73e3a1b905 100644
--- a/include/ginkgo/core/factorization/par_ilut.hpp
+++ b/include/ginkgo/core/factorization/par_ilut.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/log/batch_logger.hpp b/include/ginkgo/core/log/batch_logger.hpp
index 5043c1a0841..16b3c26aa20 100644
--- a/include/ginkgo/core/log/batch_logger.hpp
+++ b/include/ginkgo/core/log/batch_logger.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/log/logger.hpp>
diff --git a/include/ginkgo/core/log/convergence.hpp b/include/ginkgo/core/log/convergence.hpp
index 7327f7ff815..767146623a3 100644
--- a/include/ginkgo/core/log/convergence.hpp
+++ b/include/ginkgo/core/log/convergence.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index 7f7351addf5..907bc418906 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -12,7 +12,6 @@
 #include <type_traits>
 #include <vector>
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
 
diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp
index 2595a574a05..a395a7b7108 100644
--- a/include/ginkgo/core/log/papi.hpp
+++ b/include/ginkgo/core/log/papi.hpp
@@ -12,12 +12,12 @@
 #if GKO_HAVE_PAPI_SDE
 
 
-#include <sde_lib.h>
 #include <cstddef>
 #include <iostream>
 #include <map>
 #include <mutex>
 
+#include <sde_lib.h>
 
 #include <ginkgo/core/base/polymorphic_object.hpp>
 #include <ginkgo/core/log/logger.hpp>
diff --git a/include/ginkgo/core/log/performance_hint.hpp b/include/ginkgo/core/log/performance_hint.hpp
index 035dc690f7a..1a693ae184b 100644
--- a/include/ginkgo/core/log/performance_hint.hpp
+++ b/include/ginkgo/core/log/performance_hint.hpp
@@ -10,7 +10,6 @@
 #include <iostream>
 #include <unordered_map>
 
-
 #include <ginkgo/core/log/logger.hpp>
 
 
diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp
index 1821d3a8f64..ce5e8831f1c 100644
--- a/include/ginkgo/core/log/profiler_hook.hpp
+++ b/include/ginkgo/core/log/profiler_hook.hpp
@@ -9,7 +9,6 @@
 #include <iostream>
 #include <unordered_map>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/log/logger.hpp>
diff --git a/include/ginkgo/core/log/record.hpp b/include/ginkgo/core/log/record.hpp
index afeb1f3b973..41bfe245dc4 100644
--- a/include/ginkgo/core/log/record.hpp
+++ b/include/ginkgo/core/log/record.hpp
@@ -9,7 +9,6 @@
 #include <deque>
 #include <memory>
 
-
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
diff --git a/include/ginkgo/core/log/stream.hpp b/include/ginkgo/core/log/stream.hpp
index 120cbd84a59..83ef8b2e607 100644
--- a/include/ginkgo/core/log/stream.hpp
+++ b/include/ginkgo/core/log/stream.hpp
@@ -9,7 +9,6 @@
 #include <fstream>
 #include <iostream>
 
-
 #include <ginkgo/core/log/logger.hpp>
 
 
diff --git a/include/ginkgo/core/matrix/batch_csr.hpp b/include/ginkgo/core/matrix/batch_csr.hpp
index 9e1ea9283e5..e431454063d 100644
--- a/include/ginkgo/core/matrix/batch_csr.hpp
+++ b/include/ginkgo/core/matrix/batch_csr.hpp
@@ -9,7 +9,6 @@
 #include <initializer_list>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index fba9dbe1514..5ea7c3ee128 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -9,7 +9,6 @@
 #include <initializer_list>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index a729f54191b..b760cee795a 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -9,7 +9,6 @@
 #include <initializer_list>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index 46728246329..d7e9b1a10e0 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -9,7 +9,6 @@
 #include <initializer_list>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp
index 3cd36658f4e..5e995cb0ba0 100644
--- a/include/ginkgo/core/matrix/hybrid.hpp
+++ b/include/ginkgo/core/matrix/hybrid.hpp
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp
index b15d14e4b83..5549b75f694 100644
--- a/include/ginkgo/core/matrix/permutation.hpp
+++ b/include/ginkgo/core/matrix/permutation.hpp
@@ -11,7 +11,6 @@
 #include <numeric>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/matrix/row_gatherer.hpp b/include/ginkgo/core/matrix/row_gatherer.hpp
index ebc4f175a17..bf55f03bdb0 100644
--- a/include/ginkgo/core/matrix/row_gatherer.hpp
+++ b/include/ginkgo/core/matrix/row_gatherer.hpp
@@ -11,7 +11,6 @@
 #include <numeric>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp
index 88dff395ab6..8f48bb38f88 100644
--- a/include/ginkgo/core/matrix/scaled_permutation.hpp
+++ b/include/ginkgo/core/matrix/scaled_permutation.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp
index 8dfe8b06713..0e6aa98f5ae 100644
--- a/include/ginkgo/core/matrix/sparsity_csr.hpp
+++ b/include/ginkgo/core/matrix/sparsity_csr.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp
index 2ab3211b609..86c21acba39 100644
--- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp
+++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/multigrid/multigrid_level.hpp b/include/ginkgo/core/multigrid/multigrid_level.hpp
index e52122b6bed..7c5b7e09684 100644
--- a/include/ginkgo/core/multigrid/multigrid_level.hpp
+++ b/include/ginkgo/core/multigrid/multigrid_level.hpp
@@ -9,7 +9,6 @@
 #include <functional>
 #include <memory>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp
index 99b0856e819..d07001be2f1 100644
--- a/include/ginkgo/core/multigrid/pgm.hpp
+++ b/include/ginkgo/core/multigrid/pgm.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp
index 5cb1ad201e4..f78e00eea09 100644
--- a/include/ginkgo/core/preconditioner/ic.hpp
+++ b/include/ginkgo/core/preconditioner/ic.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception.hpp>
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index 816f6e600cb..869681fc547 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <type_traits>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception.hpp>
diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp
index 3080815f1f8..e17bff28bc7 100644
--- a/include/ginkgo/core/preconditioner/isai.hpp
+++ b/include/ginkgo/core/preconditioner/isai.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/include/ginkgo/core/preconditioner/utils.hpp b/include/ginkgo/core/preconditioner/utils.hpp
index 0ef114fcea3..1e3f35c8ada 100644
--- a/include/ginkgo/core/preconditioner/utils.hpp
+++ b/include/ginkgo/core/preconditioner/utils.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/preconditioner/isai.hpp>
 
 
diff --git a/include/ginkgo/core/reorder/amd.hpp b/include/ginkgo/core/reorder/amd.hpp
index 764a5426922..9dbffaa1c8c 100644
--- a/include/ginkgo/core/reorder/amd.hpp
+++ b/include/ginkgo/core/reorder/amd.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
diff --git a/include/ginkgo/core/reorder/mc64.hpp b/include/ginkgo/core/reorder/mc64.hpp
index afef9639a6e..b2c1fd1a644 100644
--- a/include/ginkgo/core/reorder/mc64.hpp
+++ b/include/ginkgo/core/reorder/mc64.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/reorder/nested_dissection.hpp b/include/ginkgo/core/reorder/nested_dissection.hpp
index 89563380cb3..735b56cd354 100644
--- a/include/ginkgo/core/reorder/nested_dissection.hpp
+++ b/include/ginkgo/core/reorder/nested_dissection.hpp
@@ -15,7 +15,6 @@
 #include <memory>
 #include <unordered_map>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp
index 09f11d90189..589d38e29d1 100644
--- a/include/ginkgo/core/reorder/rcm.hpp
+++ b/include/ginkgo/core/reorder/rcm.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/reorder/reordering_base.hpp b/include/ginkgo/core/reorder/reordering_base.hpp
index 83a2dd1886b..8dde7a6734f 100644
--- a/include/ginkgo/core/reorder/reordering_base.hpp
+++ b/include/ginkgo/core/reorder/reordering_base.hpp
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/include/ginkgo/core/solver/batch_bicgstab.hpp b/include/ginkgo/core/solver/batch_bicgstab.hpp
index 50015b49c45..bb287b17a53 100644
--- a/include/ginkgo/core/solver/batch_bicgstab.hpp
+++ b/include/ginkgo/core/solver/batch_bicgstab.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/solver/batch_cg.hpp b/include/ginkgo/core/solver/batch_cg.hpp
index a6cceebdb09..677936aa397 100644
--- a/include/ginkgo/core/solver/batch_cg.hpp
+++ b/include/ginkgo/core/solver/batch_cg.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp
index cf428e8c74f..9f1ef54cc34 100644
--- a/include/ginkgo/core/solver/bicg.hpp
+++ b/include/ginkgo/core/solver/bicg.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp
index 38382670597..a57a6c27aa4 100644
--- a/include/ginkgo/core/solver/bicgstab.hpp
+++ b/include/ginkgo/core/solver/bicgstab.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp
index 60d5cd32b4d..976712cd673 100644
--- a/include/ginkgo/core/solver/cb_gmres.hpp
+++ b/include/ginkgo/core/solver/cb_gmres.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp
index 38acccf9597..9d850ecbe6d 100644
--- a/include/ginkgo/core/solver/cg.hpp
+++ b/include/ginkgo/core/solver/cg.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp
index d930de00736..bde23d76910 100644
--- a/include/ginkgo/core/solver/cgs.hpp
+++ b/include/ginkgo/core/solver/cgs.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp
index ff030191225..4577dd1b1d4 100644
--- a/include/ginkgo/core/solver/fcg.hpp
+++ b/include/ginkgo/core/solver/fcg.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/gcr.hpp b/include/ginkgo/core/solver/gcr.hpp
index 7b8e1c85fa8..62ce9c9c93c 100644
--- a/include/ginkgo/core/solver/gcr.hpp
+++ b/include/ginkgo/core/solver/gcr.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp
index fa2af094b6a..57bbca0b529 100644
--- a/include/ginkgo/core/solver/gmres.hpp
+++ b/include/ginkgo/core/solver/gmres.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp
index 78f95ba38c5..9f167d9b2eb 100644
--- a/include/ginkgo/core/solver/idr.hpp
+++ b/include/ginkgo/core/solver/idr.hpp
@@ -10,7 +10,6 @@
 #include <typeinfo>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp
index 9ac7acfaa91..91949261a79 100644
--- a/include/ginkgo/core/solver/ir.hpp
+++ b/include/ginkgo/core/solver/ir.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp
index 7a4e8b83be4..2d0278b538e 100644
--- a/include/ginkgo/core/solver/multigrid.hpp
+++ b/include/ginkgo/core/solver/multigrid.hpp
@@ -11,7 +11,6 @@
 #include <memory>
 #include <utility>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp
index 43a941f6374..159ad2c15a7 100644
--- a/include/ginkgo/core/solver/solver_base.hpp
+++ b/include/ginkgo/core/solver/solver_base.hpp
@@ -10,7 +10,6 @@
 #include <type_traits>
 #include <utility>
 
-
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/log/logger.hpp>
diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp
index 36e6c483c69..2d42e3bb97a 100644
--- a/include/ginkgo/core/solver/triangular.hpp
+++ b/include/ginkgo/core/solver/triangular.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <utility>
 
-
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
diff --git a/include/ginkgo/core/solver/workspace.hpp b/include/ginkgo/core/solver/workspace.hpp
index e40eccbb039..e169a7caf1c 100644
--- a/include/ginkgo/core/solver/workspace.hpp
+++ b/include/ginkgo/core/solver/workspace.hpp
@@ -8,7 +8,6 @@
 
 #include <typeinfo>
 
-
 #include <ginkgo/core/matrix/dense.hpp>
 
 
diff --git a/include/ginkgo/core/stop/combined.hpp b/include/ginkgo/core/stop/combined.hpp
index 5f01e499511..62451538431 100644
--- a/include/ginkgo/core/stop/combined.hpp
+++ b/include/ginkgo/core/stop/combined.hpp
@@ -8,7 +8,6 @@
 
 #include <vector>
 
-
 #include <ginkgo/core/stop/criterion.hpp>
 
 
diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp
index 273b28a5a35..6ee3c843e6a 100644
--- a/include/ginkgo/core/stop/residual_norm.hpp
+++ b/include/ginkgo/core/stop/residual_norm.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp
index ec734324985..a41b9c49499 100644
--- a/include/ginkgo/core/stop/time.hpp
+++ b/include/ginkgo/core/stop/time.hpp
@@ -8,7 +8,6 @@
 
 #include <chrono>
 
-
 #include <ginkgo/core/stop/criterion.hpp>
 
 
diff --git a/include/ginkgo/extensions/config/json_config.hpp b/include/ginkgo/extensions/config/json_config.hpp
index d21a3623f46..f8c3cfd5860 100644
--- a/include/ginkgo/extensions/config/json_config.hpp
+++ b/include/ginkgo/extensions/config/json_config.hpp
@@ -10,10 +10,8 @@
 #include <stdexcept>
 #include <string>
 
-
 #include <nlohmann/json.hpp>
 
-
 #include <ginkgo/core/config/property_tree.hpp>
 
 
diff --git a/include/ginkgo/extensions/kokkos/spaces.hpp b/include/ginkgo/extensions/kokkos/spaces.hpp
index 6875f931152..1eb4fada3d1 100644
--- a/include/ginkgo/extensions/kokkos/spaces.hpp
+++ b/include/ginkgo/extensions/kokkos/spaces.hpp
@@ -7,7 +7,6 @@
 
 #include <Kokkos_Core.hpp>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/include/ginkgo/extensions/kokkos/types.hpp b/include/ginkgo/extensions/kokkos/types.hpp
index 88362f317b1..d595461e409 100644
--- a/include/ginkgo/extensions/kokkos/types.hpp
+++ b/include/ginkgo/extensions/kokkos/types.hpp
@@ -6,11 +6,8 @@
 #define GINKGO_EXTENSIONS_KOKKOS_TYPES_HPP
 
 #include <Kokkos_Complex.hpp>
-
-
 #include <Kokkos_Core.hpp>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 9dc8d266924..395bf96cc7a 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/base/batch_multi_vector_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
 
diff --git a/omp/base/device_matrix_data_kernels.cpp b/omp/base/device_matrix_data_kernels.cpp
index e8330ce589b..bce89e2f409 100644
--- a/omp/base/device_matrix_data_kernels.cpp
+++ b/omp/base/device_matrix_data_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <omp.h>
 
-
 #include "core/base/allocator.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp
index 98ef2d528ae..5e846946e5e 100644
--- a/omp/base/executor.cpp
+++ b/omp/base/executor.cpp
@@ -4,7 +4,6 @@
 
 #include "ginkgo/core/base/executor.hpp"
 
-
 #include <omp.h>
 
 
diff --git a/omp/base/index_set_kernels.cpp b/omp/base/index_set_kernels.cpp
index fbfa04a93b4..6dca856b96f 100644
--- a/omp/base/index_set_kernels.cpp
+++ b/omp/base/index_set_kernels.cpp
@@ -4,18 +4,15 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <algorithm>
 #include <iostream>
 #include <mutex>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp
index a5fcc32bffc..ac7131fea86 100644
--- a/omp/base/kernel_launch.hpp
+++ b/omp/base/kernel_launch.hpp
@@ -10,7 +10,6 @@
 
 #include <tuple>
 
-
 #include "core/synthesizer/implementation_selection.hpp"
 
 
diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index c7ef7a38220..bc489cb78dc 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -10,7 +10,6 @@
 
 #include <numeric>
 
-
 #include <omp.h>
 
 
diff --git a/omp/base/scoped_device_id.cpp b/omp/base/scoped_device_id.cpp
index 9f74a43eb50..6e61d1b63bc 100644
--- a/omp/base/scoped_device_id.cpp
+++ b/omp/base/scoped_device_id.cpp
@@ -5,7 +5,6 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 
-
 #include "core/base/noop_scoped_device_id_guard.hpp"
 
 
diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp
index 01094a8a8dc..c3580cd36bb 100644
--- a/omp/components/atomic.hpp
+++ b/omp/components/atomic.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/omp/components/csr_spgeam.hpp b/omp/components/csr_spgeam.hpp
index 8ff417df442..e4b3b9b6e51 100644
--- a/omp/components/csr_spgeam.hpp
+++ b/omp/components/csr_spgeam.hpp
@@ -8,10 +8,8 @@
 
 #include <limits>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/utils.hpp"
 
 
diff --git a/omp/components/matrix_operations.hpp b/omp/components/matrix_operations.hpp
index 749bb754676..522915bf05b 100644
--- a/omp/components/matrix_operations.hpp
+++ b/omp/components/matrix_operations.hpp
@@ -8,7 +8,6 @@
 
 #include <omp.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 
 
diff --git a/omp/components/prefix_sum_kernels.cpp b/omp/components/prefix_sum_kernels.cpp
index 231acd52685..08d184b7616 100644
--- a/omp/components/prefix_sum_kernels.cpp
+++ b/omp/components/prefix_sum_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <algorithm>
 #include <limits>
 
-
 #include <omp.h>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/omp/components/sort_small.hpp b/omp/components/sort_small.hpp
index b862dab8457..12128576a53 100644
--- a/omp/components/sort_small.hpp
+++ b/omp/components/sort_small.hpp
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/omp/distributed/index_map_kernels.cpp b/omp/distributed/index_map_kernels.cpp
index 02ae63261a0..b01dab9cb33 100644
--- a/omp/distributed/index_map_kernels.cpp
+++ b/omp/distributed/index_map_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 9f7b5594fa7..2f36ec4a778 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp
index 70d01d18368..ceae3e17679 100644
--- a/omp/distributed/partition_helpers_kernels.cpp
+++ b/omp/distributed/partition_helpers_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_helpers_kernels.hpp"
 
-
 #include "core/base/iterator_factory.hpp"
 
 
diff --git a/omp/distributed/partition_kernels.cpp b/omp/distributed/partition_kernels.cpp
index c1549989384..25b7b0bfce8 100644
--- a/omp/distributed/partition_kernels.cpp
+++ b/omp/distributed/partition_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/omp/distributed/vector_kernels.cpp b/omp/distributed/vector_kernels.cpp
index e4daf7d5602..1ae60ed108e 100644
--- a/omp/distributed/vector_kernels.cpp
+++ b/omp/distributed/vector_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp
index 19d31647b88..8ce5392ebde 100644
--- a/omp/factorization/cholesky_kernels.cpp
+++ b/omp/factorization/cholesky_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/iterator_factory.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
index 6d042114e69..f4b41cbdac5 100644
--- a/omp/factorization/factorization_kernels.cpp
+++ b/omp/factorization/factorization_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index c942991b13a..53847ff2b6c 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 
diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp
index 48f4047875e..93093783acc 100644
--- a/omp/factorization/par_ic_kernels.cpp
+++ b/omp/factorization/par_ic_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/utils.hpp"
 
 
diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp
index d997531c304..b5546e1a644 100644
--- a/omp/factorization/par_ict_kernels.cpp
+++ b/omp/factorization/par_ict_kernels.cpp
@@ -4,19 +4,16 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <algorithm>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index 44ab0cf9cc0..da42a631b81 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
index 48d97920a88..a24709e4f1a 100644
--- a/omp/factorization/par_ilut_kernels.cpp
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -4,22 +4,18 @@
 
 #include "core/factorization/par_ilut_kernels.hpp"
 
-
 #include <algorithm>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp
index e40b06350bb..eacb26c12cb 100644
--- a/omp/matrix/batch_csr_kernels.cpp
+++ b/omp/matrix/batch_csr_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index a8cf119f02d..836908260a7 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 74cb4e06aa1..4fb5aeea6fa 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index e0f606b448f..021795d8e9c 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -4,19 +4,15 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <array>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/matrix/dense_kernels.hpp"
 #include "omp/components/atomic.hpp"
 
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 70df9f07944..09d1465896b 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -4,16 +4,13 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <limits>
 #include <numeric>
 #include <utility>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/index_set.hpp>
@@ -22,7 +19,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/index_set_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 20e09f2a747..d1c0f2f8949 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
@@ -23,7 +20,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/omp/matrix/diagonal_kernels.cpp b/omp/matrix/diagonal_kernels.cpp
index 622c195755b..71363c7bc6e 100644
--- a/omp/matrix/diagonal_kernels.cpp
+++ b/omp/matrix/diagonal_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index aa7c7f35bd1..c35a3654b86 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -4,19 +4,15 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <array>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index 0008c3c19a6..db60d85db79 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -4,21 +4,17 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <utility>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "core/base/allocator.hpp"
 #include "core/base/block_sizes.hpp"
diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp
index 1a7ae601fb6..0301b9093ff 100644
--- a/omp/matrix/fft_kernels.cpp
+++ b/omp/matrix/fft_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp
index a657d5d54a7..7f8b16264ce 100644
--- a/omp/matrix/sellp_kernels.cpp
+++ b/omp/matrix/sellp_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <array>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 5782e764845..35bb42c70a6 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <utility>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp
index 09279c6db21..9d2aa047cc4 100644
--- a/omp/multigrid/pgm_kernels.cpp
+++ b/omp/multigrid/pgm_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/iterator_factory.hpp"
 
 
diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp
index 15c7f0ab471..9dfe06be32b 100644
--- a/omp/preconditioner/batch_jacobi_kernels.cpp
+++ b/omp/preconditioner/batch_jacobi_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp
index b3af8d3a926..6f2fe4838d9 100644
--- a/omp/preconditioner/isai_kernels.cpp
+++ b/omp/preconditioner/isai_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index 686cb4d9777..76224f97a2f 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -4,23 +4,19 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <algorithm>
 #include <cmath>
 #include <iterator>
 #include <numeric>
 #include <vector>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp
index 44f5b95f034..dd4eb020695 100644
--- a/omp/reorder/rcm_kernels.cpp
+++ b/omp/reorder/rcm_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-
 #include <algorithm>
 #include <iterator>
 #include <memory>
@@ -12,10 +11,8 @@
 #include <utility>
 #include <vector>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -25,7 +22,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "omp/components/omp_mutex.hpp"
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index 294365c2410..81df9c45e51 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/solver/batch_dispatch.hpp"
 
 
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index bdfcd50e050..51c794ab597 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/solver/batch_dispatch.hpp"
 
 
diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp
index 76cff297bff..a53294b9fbe 100644
--- a/omp/solver/cb_gmres_kernels.cpp
+++ b/omp/solver/cb_gmres_kernels.cpp
@@ -4,16 +4,13 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/solver/cb_gmres.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
 
diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp
index 1d8ce7fec1b..a93002e4833 100644
--- a/omp/solver/idr_kernels.cpp
+++ b/omp/solver/idr_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/solver/idr_kernels.hpp"
 
-
 #include <algorithm>
 #include <ctime>
 #include <random>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 
 
diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp
index ee5b8b4b5a2..6dac6b46078 100644
--- a/omp/solver/lower_trs_kernels.cpp
+++ b/omp/solver/lower_trs_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/lower_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp
index 09ed8e4cba8..12e5bad8577 100644
--- a/omp/solver/multigrid_kernels.cpp
+++ b/omp/solver/multigrid_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp
index 7e6793a45f4..ea05cabeb63 100644
--- a/omp/solver/upper_trs_kernels.cpp
+++ b/omp/solver/upper_trs_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/upper_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/omp/stop/criterion_kernels.cpp b/omp/stop/criterion_kernels.cpp
index c345ad4ef39..65d880515d9 100644
--- a/omp/stop/criterion_kernels.cpp
+++ b/omp/stop/criterion_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp
index c72124640df..0ec4395a16b 100644
--- a/omp/stop/residual_norm_kernels.cpp
+++ b/omp/stop/residual_norm_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-
 #include <omp.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
diff --git a/omp/test/base/index_set.cpp b/omp/test/base/index_set.cpp
index 52f54774116..98a11bb8720 100644
--- a/omp/test/base/index_set.cpp
+++ b/omp/test/base/index_set.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/index_set.hpp>
-
-
 #include <random>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/index_set_kernels.hpp"
 #include "core/test/utils.hpp"
 
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index b01ffa4214d..5644d67caee 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -4,20 +4,16 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 
-
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/test/utils.hpp"
diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp
index 51c35171ad5..97f1008d53d 100644
--- a/omp/test/matrix/fbcsr_kernels.cpp
+++ b/omp/test/matrix/fbcsr_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
-
+#include "core/matrix/fbcsr_kernels.hpp"
 
 #include <algorithm>
 #include <numeric>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
-#include "core/matrix/fbcsr_kernels.hpp"
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/fb_matrix_generator.hpp"
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index 0692f66b465..b0d20a6b826 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/base/batch_multi_vector_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
 
diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp
index 20837cbecc5..e64cac3ba88 100644
--- a/reference/base/batch_struct.hpp
+++ b/reference/base/batch_struct.hpp
@@ -9,7 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 
 
diff --git a/reference/base/device_matrix_data_kernels.cpp b/reference/base/device_matrix_data_kernels.cpp
index 86912ab2d6e..f9a23b35e69 100644
--- a/reference/base/device_matrix_data_kernels.cpp
+++ b/reference/base/device_matrix_data_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/base/device_matrix_data_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 
 
diff --git a/reference/base/index_set_kernels.cpp b/reference/base/index_set_kernels.cpp
index edd0671e7d7..6f769472a6f 100644
--- a/reference/base/index_set_kernels.cpp
+++ b/reference/base/index_set_kernels.cpp
@@ -4,20 +4,17 @@
 
 #include "core/base/index_set_kernels.hpp"
 
-
 #include <algorithm>
 #include <iostream>
 #include <memory>
 #include <mutex>
 #include <vector>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/reference/base/scoped_device_id.cpp b/reference/base/scoped_device_id.cpp
index f0aa33412be..4be131dccea 100644
--- a/reference/base/scoped_device_id.cpp
+++ b/reference/base/scoped_device_id.cpp
@@ -5,7 +5,6 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 
-
 #include "core/base/noop_scoped_device_id_guard.hpp"
 
 
diff --git a/reference/components/convert_ptrs.hpp b/reference/components/convert_ptrs.hpp
index c1fa5542e71..4d8d21db4d8 100644
--- a/reference/components/convert_ptrs.hpp
+++ b/reference/components/convert_ptrs.hpp
@@ -5,7 +5,6 @@
 #include <algorithm>
 #include <numeric>
 
-
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/reference/components/csr_spgeam.hpp b/reference/components/csr_spgeam.hpp
index 4fb52aaa8dd..e8f5f314c5e 100644
--- a/reference/components/csr_spgeam.hpp
+++ b/reference/components/csr_spgeam.hpp
@@ -8,10 +8,8 @@
 
 #include <limits>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/utils.hpp"
 
 
diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp
index 3dc865d9b97..1649aa87982 100644
--- a/reference/components/fill_array_kernels.cpp
+++ b/reference/components/fill_array_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/fill_array_kernels.hpp"
 
-
 #include <numeric>
 
 
diff --git a/reference/components/format_conversion_kernels.cpp b/reference/components/format_conversion_kernels.cpp
index 2fa201e544a..faac67c8e27 100644
--- a/reference/components/format_conversion_kernels.cpp
+++ b/reference/components/format_conversion_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/components/format_conversion_kernels.hpp"
 
-
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
diff --git a/reference/components/precision_conversion_kernels.cpp b/reference/components/precision_conversion_kernels.cpp
index 3ce42fbe5cb..db12d9316ee 100644
--- a/reference/components/precision_conversion_kernels.cpp
+++ b/reference/components/precision_conversion_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/precision_conversion_kernels.hpp"
 
-
 #include <algorithm>
 
 
diff --git a/reference/components/reduce_array_kernels.cpp b/reference/components/reduce_array_kernels.cpp
index e2c497f219c..a70ef95a878 100644
--- a/reference/components/reduce_array_kernels.cpp
+++ b/reference/components/reduce_array_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/components/reduce_array_kernels.hpp"
 
-
 #include <numeric>
 
 
diff --git a/reference/distributed/index_map_kernels.cpp b/reference/distributed/index_map_kernels.cpp
index 5f13581eee0..322a95c6cdb 100644
--- a/reference/distributed/index_map_kernels.cpp
+++ b/reference/distributed/index_map_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include <ginkgo/core/distributed/index_map.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/segmented_array.hpp"
 #include "reference/distributed/partition_helpers.hpp"
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 9b4ff9231df..95176b34656 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
diff --git a/reference/distributed/partition_helpers.hpp b/reference/distributed/partition_helpers.hpp
index fda114b43c6..06bd1e11f32 100644
--- a/reference/distributed/partition_helpers.hpp
+++ b/reference/distributed/partition_helpers.hpp
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp
index 4cb3d145038..b57daab2eaa 100644
--- a/reference/distributed/partition_helpers_kernels.cpp
+++ b/reference/distributed/partition_helpers_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/partition_helpers_kernels.hpp"
 
-
 #include "core/base/iterator_factory.hpp"
 
 
diff --git a/reference/distributed/vector_kernels.cpp b/reference/distributed/vector_kernels.cpp
index de6e462cc2e..76a8be06a0f 100644
--- a/reference/distributed/vector_kernels.cpp
+++ b/reference/distributed/vector_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp
index d24bf0d74fd..2aeee99d45d 100644
--- a/reference/factorization/cholesky_kernels.cpp
+++ b/reference/factorization/cholesky_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 #include <numeric>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/iterator_factory.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp
index 482bf75cb1d..085e2f62ecc 100644
--- a/reference/factorization/factorization_kernels.cpp
+++ b/reference/factorization/factorization_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 
diff --git a/reference/factorization/ic_kernels.cpp b/reference/factorization/ic_kernels.cpp
index 28e5a00be6f..6f88467262a 100644
--- a/reference/factorization/ic_kernels.cpp
+++ b/reference/factorization/ic_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
index 263a1b0de38..fdbe8a9e86f 100644
--- a/reference/factorization/ilu_kernels.cpp
+++ b/reference/factorization/ilu_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/factorization/ilu_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index 50398c2f980..d8516cffb49 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 
diff --git a/reference/factorization/par_ic_kernels.cpp b/reference/factorization/par_ic_kernels.cpp
index 1753bca0814..4da317cf201 100644
--- a/reference/factorization/par_ic_kernels.cpp
+++ b/reference/factorization/par_ic_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/utils.hpp"
 
 
diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp
index 52e5099c60f..684158d380c 100644
--- a/reference/factorization/par_ict_kernels.cpp
+++ b/reference/factorization/par_ict_kernels.cpp
@@ -4,18 +4,15 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <algorithm>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index 2e2694b0f1f..44c2e5f66bc 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp
index 293a17e2b83..abef6e9b5f2 100644
--- a/reference/factorization/par_ilut_kernels.cpp
+++ b/reference/factorization/par_ilut_kernels.cpp
@@ -4,19 +4,16 @@
 
 #include "core/factorization/par_ilut_kernels.hpp"
 
-
 #include <algorithm>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index 6b5c8829cbd..7c6d9a6c000 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 5fba7fce9b2..2116a691fb9 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index f1a0d6c4016..0d47f9ea601 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
index 47d31061be3..90e1e445c9b 100644
--- a/reference/matrix/batch_struct.hpp
+++ b/reference/matrix/batch_struct.hpp
@@ -6,16 +6,13 @@
 #define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_
 
 
-#include "core/matrix/batch_struct.hpp"
-
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp
index 9b8789f6d8c..f9bf9f5f33d 100644
--- a/reference/matrix/coo_kernels.cpp
+++ b/reference/matrix/coo_kernels.cpp
@@ -4,13 +4,11 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 711efdc9175..f7e2fab4411 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -4,13 +4,11 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <iterator>
 #include <numeric>
 #include <utility>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/index_set.hpp>
@@ -21,7 +19,6 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/index_set_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index fa88c30bd19..53773a131fe 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
@@ -20,7 +18,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/base/mixed_precision_types.hpp"
diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp
index 6c41fa41170..028b7685c2b 100644
--- a/reference/matrix/diagonal_kernels.cpp
+++ b/reference/matrix/diagonal_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 13b7912669e..1fa37c4e250 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -4,13 +4,11 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
 
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 3bae91dad68..9e60e380d9c 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -4,19 +4,16 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <algorithm>
 #include <iterator>
 #include <numeric>
 #include <utility>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "accessor/block_col_major.hpp"
 #include "core/base/allocator.hpp"
 #include "core/base/block_sizes.hpp"
diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp
index c262a0a0b7b..00af068803c 100644
--- a/reference/matrix/fft_kernels.cpp
+++ b/reference/matrix/fft_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/matrix/fft_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp
index 70ccf8d5583..f2a06c321f2 100644
--- a/reference/matrix/hybrid_kernels.cpp
+++ b/reference/matrix/hybrid_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/hybrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
-
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp
index 05c3d28ef49..b00e06f72f2 100644
--- a/reference/matrix/scaled_permutation_kernels.cpp
+++ b/reference/matrix/scaled_permutation_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/matrix/scaled_permutation_kernels.hpp"
 
-
 #include <ginkgo/core/matrix/dense.hpp>
 
 
diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp
index 1f2aa604e2a..120194d6952 100644
--- a/reference/matrix/sellp_kernels.cpp
+++ b/reference/matrix/sellp_kernels.cpp
@@ -4,13 +4,11 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 
 
diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp
index e4a3c6d13f6..c511a16a292 100644
--- a/reference/matrix/sparsity_csr_kernels.cpp
+++ b/reference/matrix/sparsity_csr_kernels.cpp
@@ -4,17 +4,14 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <utility>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp
index ea2d91b84c5..2a6e3252a9f 100644
--- a/reference/multigrid/pgm_kernels.cpp
+++ b/reference/multigrid/pgm_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 #include <tuple>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -17,7 +15,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/iterator_factory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
diff --git a/reference/preconditioner/batch_block_jacobi.hpp b/reference/preconditioner/batch_block_jacobi.hpp
index 009108f1985..0ca4807cd3a 100644
--- a/reference/preconditioner/batch_block_jacobi.hpp
+++ b/reference/preconditioner/batch_block_jacobi.hpp
@@ -8,7 +8,6 @@
 
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp
index d90a1621a65..3c03a21fae7 100644
--- a/reference/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/preconditioner/batch_jacobi_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp
index 2112c3b4e5c..55f56b5705e 100644
--- a/reference/preconditioner/isai_kernels.cpp
+++ b/reference/preconditioner/isai_kernels.cpp
@@ -4,17 +4,14 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/matrix/csr_builder.hpp"
 
 
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index 6c37aa4c3aa..4eaf0988a00 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -4,19 +4,16 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-
 #include <algorithm>
 #include <cmath>
 #include <iterator>
 #include <numeric>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/allocator.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp
index 3c6c9567d36..ff4bcd70214 100644
--- a/reference/reorder/rcm_kernels.cpp
+++ b/reference/reorder/rcm_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-
 #include <algorithm>
 #include <iterator>
 #include <memory>
@@ -12,7 +11,6 @@
 #include <utility>
 #include <vector>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -22,7 +20,6 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/base/allocator.hpp"
 
 
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 00290eb9c81..97de157fb90 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include "core/solver/batch_dispatch.hpp"
 
 
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 408828fce95..290fbc3718b 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include "core/solver/batch_dispatch.hpp"
 
 
diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp
index 8fc03dc42d4..dee2d30b8dc 100644
--- a/reference/solver/bicg_kernels.cpp
+++ b/reference/solver/bicg_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/bicg_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp
index 85facf0a6fb..31955a59c53 100644
--- a/reference/solver/bicgstab_kernels.cpp
+++ b/reference/solver/bicgstab_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/bicgstab_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp
index 372253c94fb..5d41a0d0e00 100644
--- a/reference/solver/cb_gmres_kernels.cpp
+++ b/reference/solver/cb_gmres_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <type_traits>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/solver/cb_gmres_accessor.hpp"
 
 
diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp
index b5ef5b6f050..5af15692414 100644
--- a/reference/solver/cg_kernels.cpp
+++ b/reference/solver/cg_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/cg_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp
index 2e56702a2ab..a5a5f8c5862 100644
--- a/reference/solver/cgs_kernels.cpp
+++ b/reference/solver/cgs_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/cgs_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp
index c1cb3ce6cd4..643c164b828 100644
--- a/reference/solver/common_gmres_kernels.cpp
+++ b/reference/solver/common_gmres_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/common_gmres_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -12,7 +11,6 @@
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
-
 #include "core/solver/cb_gmres_kernels.hpp"
 
 
diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp
index dae9a45cbf9..65b6bf27698 100644
--- a/reference/solver/fcg_kernels.cpp
+++ b/reference/solver/fcg_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/fcg_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/reference/solver/gcr_kernels.cpp b/reference/solver/gcr_kernels.cpp
index 822ca3874ac..531814c641e 100644
--- a/reference/solver/gcr_kernels.cpp
+++ b/reference/solver/gcr_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/gcr_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index a0bc15499cf..a0b22862998 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/gmres_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp
index c1a6eeebcb9..606def8a18b 100644
--- a/reference/solver/idr_kernels.cpp
+++ b/reference/solver/idr_kernels.cpp
@@ -4,12 +4,10 @@
 
 #include "core/solver/idr_kernels.hpp"
 
-
 #include <algorithm>
 #include <ctime>
 #include <random>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp
index 3a655656d29..ba02c9c838c 100644
--- a/reference/solver/lower_trs_kernels.cpp
+++ b/reference/solver/lower_trs_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/lower_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/reference/solver/multigrid_kernels.cpp b/reference/solver/multigrid_kernels.cpp
index ff3f7d97a20..b08c9857d3a 100644
--- a/reference/solver/multigrid_kernels.cpp
+++ b/reference/solver/multigrid_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp
index c85ef2b172f..f0c23a9c4cc 100644
--- a/reference/solver/upper_trs_kernels.cpp
+++ b/reference/solver/upper_trs_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/solver/upper_trs_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/reference/stop/criterion_kernels.cpp b/reference/stop/criterion_kernels.cpp
index 1e77df0b63d..4a91429d784 100644
--- a/reference/stop/criterion_kernels.cpp
+++ b/reference/stop/criterion_kernels.cpp
@@ -4,7 +4,6 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp
index d7e6783eace..ba2672edc28 100644
--- a/reference/stop/residual_norm_kernels.cpp
+++ b/reference/stop/residual_norm_kernels.cpp
@@ -4,10 +4,8 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp
index 4d2c3ea909b..be0396383e1 100644
--- a/reference/test/base/array.cpp
+++ b/reference/test/base/array.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/array.hpp>
-
-
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
index 85c66b8ae44..e673046a490 100644
--- a/reference/test/base/batch_multi_vector_kernels.cpp
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-
+#include "core/base/batch_multi_vector_kernels.hpp"
 
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
-#include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp
index 8f4d2362724..aea578f4e7e 100644
--- a/reference/test/base/combination.cpp
+++ b/reference/test/base/combination.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/combination.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/combination.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp
index ed46c58aa73..f736edb53f9 100644
--- a/reference/test/base/composition.cpp
+++ b/reference/test/base/composition.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/composition.hpp>
-
-
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/base/index_set.cpp b/reference/test/base/index_set.cpp
index 71fdaeb4f13..0d35ecac495 100644
--- a/reference/test/base/index_set.cpp
+++ b/reference/test/base/index_set.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/index_set.hpp>
-
-
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/range.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp
index ad9774257ad..b6be9ab1563 100644
--- a/reference/test/base/perturbation.cpp
+++ b/reference/test/base/perturbation.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/perturbation.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/perturbation.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/base/utils.cpp b/reference/test/base/utils.cpp
index b3e4f9f9612..27a3a31c1e0 100644
--- a/reference/test/base/utils.cpp
+++ b/reference/test/base/utils.cpp
@@ -4,13 +4,10 @@
 
 #include "core/base/utils.hpp"
 
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 
diff --git a/reference/test/components/absolute_array_kernels.cpp b/reference/test/components/absolute_array_kernels.cpp
index b16dfdd9989..c192d540032 100644
--- a/reference/test/components/absolute_array_kernels.cpp
+++ b/reference/test/components/absolute_array_kernels.cpp
@@ -4,19 +4,15 @@
 
 #include "core/components/absolute_array_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp
index 2f534be94b8..d087c833c96 100644
--- a/reference/test/components/fill_array_kernels.cpp
+++ b/reference/test/components/fill_array_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/fill_array_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/components/format_conversion_kernels.cpp b/reference/test/components/format_conversion_kernels.cpp
index f9f9ef828b5..e5d2c2d9692 100644
--- a/reference/test/components/format_conversion_kernels.cpp
+++ b/reference/test/components/format_conversion_kernels.cpp
@@ -4,17 +4,14 @@
 
 #include "core/components/format_conversion_kernels.hpp"
 
-
 #include <cstring>
 #include <limits>
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp
index e251101e1e7..129758e0b95 100644
--- a/reference/test/components/precision_conversion_kernels.cpp
+++ b/reference/test/components/precision_conversion_kernels.cpp
@@ -8,13 +8,10 @@
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/components/prefix_sum_kernels.cpp b/reference/test/components/prefix_sum_kernels.cpp
index c8820d5031c..00265442cce 100644
--- a/reference/test/components/prefix_sum_kernels.cpp
+++ b/reference/test/components/prefix_sum_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <algorithm>
 #include <limits>
 #include <memory>
 #include <type_traits>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp
index 2599f83178f..b88ec181261 100644
--- a/reference/test/components/reduce_array_kernels.cpp
+++ b/reference/test/components/reduce_array_kernels.cpp
@@ -4,16 +4,12 @@
 
 #include "core/components/reduce_array_kernels.hpp"
 
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/distributed/index_map_kernels.cpp b/reference/test/distributed/index_map_kernels.cpp
index 972db10654f..72b0a0e523b 100644
--- a/reference/test/distributed/index_map_kernels.cpp
+++ b/reference/test/distributed/index_map_kernels.cpp
@@ -2,22 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/index_map.hpp>
-
+#include "core/distributed/index_map_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/distributed/index_map.hpp>
 
-
-#include "core/distributed/index_map_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 5d96f4f9c64..a34844cbde9 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -2,22 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/matrix_kernels.hpp"
+
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
-#include "core/distributed/matrix_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp
index 08c17615dd6..18b95cb9dad 100644
--- a/reference/test/distributed/partition_helpers_kernels.cpp
+++ b/reference/test/distributed/partition_helpers_kernels.cpp
@@ -2,20 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/partition_helpers_kernels.hpp"
+
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
-
-#include "core/distributed/partition_helpers_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/distributed/partition_kernels.cpp b/reference/test/distributed/partition_kernels.cpp
index 426195eef25..e06f3cc4029 100644
--- a/reference/test/distributed/partition_kernels.cpp
+++ b/reference/test/distributed/partition_kernels.cpp
@@ -2,22 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/distributed/partition.hpp>
-
+#include "core/distributed/partition_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/distributed/partition.hpp>
 
-
-#include "core/distributed/partition_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/distributed/vector_kernels.cpp b/reference/test/distributed/vector_kernels.cpp
index 0afe9787a48..7de3104b7fb 100644
--- a/reference/test/distributed/vector_kernels.cpp
+++ b/reference/test/distributed/vector_kernels.cpp
@@ -2,20 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/vector_kernels.hpp"
+
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 
-
-#include "core/distributed/vector_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index 87433681d89..d63e491e26a 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/cholesky.hpp>
-
+#include "core/factorization/cholesky_kernels.hpp"
 
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/factorization/cholesky.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/cholesky_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/symbolic.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index 7a9eab5c8a7..2ded81d4867 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -2,25 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/factorization.hpp>
-
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/factorization/factorization.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 22deb9fa2b2..cdcb6b12bc8 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/ic.hpp>
-
-
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index 23098724e75..c750ca93fc8 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -2,24 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/ilu.hpp>
-
-
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/ilu.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 1b5baf54e15..f4a8b240b38 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -2,28 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/lu.hpp>
-
+#include "core/factorization/lu_kernels.hpp"
 
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/factorization/factorization.hpp>
+#include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/cholesky_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
-#include "core/factorization/lu_kernels.hpp"
 #include "core/factorization/symbolic.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
diff --git a/reference/test/factorization/par_ic_kernels.cpp b/reference/test/factorization/par_ic_kernels.cpp
index f044d03194c..b9caf8c9e5e 100644
--- a/reference/test/factorization/par_ic_kernels.cpp
+++ b/reference/test/factorization/par_ic_kernels.cpp
@@ -2,25 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ic.hpp>
-
+#include "core/factorization/par_ic_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
-#include "core/factorization/par_ic_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp
index 6506ed59b0e..55ac5771732 100644
--- a/reference/test/factorization/par_ict_kernels.cpp
+++ b/reference/test/factorization/par_ict_kernels.cpp
@@ -2,25 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ict.hpp>
-
+#include "core/factorization/par_ict_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ict.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
-#include "core/factorization/par_ict_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp
index 25b172a25d6..bf4e422f640 100644
--- a/reference/test/factorization/par_ilu_kernels.cpp
+++ b/reference/test/factorization/par_ilu_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilu.hpp>
-
+#include "core/factorization/par_ilu_kernels.hpp"
 
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
-#include "core/factorization/par_ilu_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp
index 36b68fe2815..59805f246f8 100644
--- a/reference/test/factorization/par_ilut_kernels.cpp
+++ b/reference/test/factorization/par_ilut_kernels.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/factorization/par_ilut.hpp>
-
+#include "core/factorization/par_ilut_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/par_ilut.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
-#include "core/factorization/par_ilut_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp
index 986adb52e89..50db0db49c4 100644
--- a/reference/test/log/convergence.cpp
+++ b/reference/test/log/convergence.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/convergence.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/log/convergence.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp
index 54e35f1218e..4f1d9e469f1 100644
--- a/reference/test/log/papi.cpp
+++ b/reference/test/log/papi.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/papi.hpp>
-
-
 #include <gtest/gtest.h>
 #include <papi.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/papi.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/batch_csr_kernels.cpp b/reference/test/matrix/batch_csr_kernels.cpp
index 76ff47be730..920bb67696b 100644
--- a/reference/test/matrix/batch_csr_kernels.cpp
+++ b/reference/test/matrix/batch_csr_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
+#include "core/matrix/batch_csr_kernels.hpp"
 
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
-#include "core/matrix/batch_csr_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp
index 1eabb3cb749..50c1909959f 100644
--- a/reference/test/matrix/batch_dense_kernels.cpp
+++ b/reference/test/matrix/batch_dense_kernels.cpp
@@ -2,25 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
+#include "core/matrix/batch_dense_kernels.hpp"
 
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
-#include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
index 44eda90cca9..a2c9ef4e83c 100644
--- a/reference/test/matrix/batch_ell_kernels.cpp
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
+#include "core/matrix/batch_ell_kernels.hpp"
 
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
-
-#include "core/matrix/batch_ell_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index 19752d27a1b..42b68d1cb4c 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/coo.hpp>
-
+#include "core/matrix/coo_kernels.hpp"
 
 #include <algorithm>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
-#include "core/matrix/coo_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index a206c8c40c2..2d4c61786ad 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -2,20 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/csr.hpp>
-
+#include "core/matrix/csr_kernels.hpp"
 
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
@@ -26,8 +24,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
-#include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index c0bd7fd363b..41294c89d49 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -2,24 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/dense.hpp>
-
+#include "core/matrix/dense_kernels.hpp"
 
 #include <complex>
 #include <memory>
 #include <numeric>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
@@ -28,8 +26,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
-#include "core/matrix/dense_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index 6baa11a8e60..208c9d98639 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -2,25 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/diagonal.hpp>
-
+#include "core/matrix/diagonal_kernels.hpp"
 
 #include <algorithm>
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 
-
-#include "core/matrix/diagonal_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 30128021c09..c96dcae773a 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/ell.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
-
+#include <ginkgo/core/matrix/ell.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index 576193ba50e..cd82bade8b7 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -2,28 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fbcsr.hpp>
-
+#include "core/matrix/fbcsr_kernels.hpp"
 
 #include <algorithm>
 #include <iostream>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/matrix/csr_kernels.hpp"
-#include "core/matrix/fbcsr_kernels.hpp"
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/value_generator.hpp"
diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp
index b157b992e49..12c2521b71c 100644
--- a/reference/test/matrix/fft_kernels.cpp
+++ b/reference/test/matrix/fft_kernels.cpp
@@ -2,21 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/fft.hpp>
-
-
 #include <complex>
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/fft.hpp>
 
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index 32ee8853163..014b5bb1024 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/hybrid.hpp>
-
+#include "core/matrix/hybrid_kernels.hpp"
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
 
-
-#include "core/matrix/hybrid_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp
index 345efb99261..11953de338a 100644
--- a/reference/test/matrix/identity.cpp
+++ b/reference/test/matrix/identity.cpp
@@ -2,14 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/identity.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/identity.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
index bb80dcc4736..5418f97353b 100644
--- a/reference/test/matrix/permutation.cpp
+++ b/reference/test/matrix/permutation.cpp
@@ -2,20 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/permutation.hpp>
-
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/permutation.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp
index 87924687148..ba65705bf29 100644
--- a/reference/test/matrix/scaled_permutation.cpp
+++ b/reference/test/matrix/scaled_permutation.cpp
@@ -2,19 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/scaled_permutation.hpp>
-
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
-
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 87ef9d19b90..18cf793c7f3 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -2,21 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sellp.hpp>
-
+#include "core/matrix/sellp_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
 
-
-#include "core/matrix/sellp_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp
index 1dead5e362a..d8ed6147e30 100644
--- a/reference/test/matrix/sparsity_csr.cpp
+++ b/reference/test/matrix/sparsity_csr.cpp
@@ -2,18 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index 3616ae0a91e..f08d6c352ca 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -2,22 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
+#include "core/matrix/sparsity_csr_kernels.hpp"
 
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
-#include "core/matrix/sparsity_csr_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp
index c6b577523da..b79b1b578dd 100644
--- a/reference/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp
@@ -2,15 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/fixed_coarsening.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -20,12 +15,12 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/row_gatherer.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
+#include <ginkgo/core/multigrid/fixed_coarsening.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/multigrid/pgm_kernels.cpp b/reference/test/multigrid/pgm_kernels.cpp
index 6f80f27e040..2fc754f23b3 100644
--- a/reference/test/multigrid/pgm_kernels.cpp
+++ b/reference/test/multigrid/pgm_kernels.cpp
@@ -2,15 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/multigrid/pgm.hpp>
-
+#include "core/multigrid/pgm_kernels.hpp"
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -19,13 +16,12 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/row_gatherer.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
+#include <ginkgo/core/multigrid/pgm.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/multigrid/pgm_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/preconditioner/batch_jacobi_kernels.cpp b/reference/test/preconditioner/batch_jacobi_kernels.cpp
index 520e6c11f31..afc59c0f783 100644
--- a/reference/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/test/preconditioner/batch_jacobi_kernels.cpp
@@ -2,23 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
-
+#include "core/preconditioner/batch_jacobi_kernels.hpp"
 
 #include <limits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
-
-#include "core/preconditioner/batch_jacobi_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
 
diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp
index 1e1bc18bda1..16ffc8d7b3c 100644
--- a/reference/test/preconditioner/ic.cpp
+++ b/reference/test/preconditioner/ic.cpp
@@ -2,24 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/ic.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/preconditioner/ic.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index 7b8bd657955..180b92be9ec 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -2,27 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/ilu.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
index 007f0e428c9..e989125c61d 100644
--- a/reference/test/preconditioner/isai_kernels.cpp
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -2,29 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/isai.hpp>
-
+#include "core/preconditioner/isai_kernels.hpp"
 
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/ilu.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/solver/gmres.hpp>
 
-
 #include "core/base/utils.hpp"
-#include "core/preconditioner/isai_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
 
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 959a2a6e3b6..801250a9826 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
-
-
 #include <algorithm>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index 8e88310dfb9..97d9951be7a 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -2,19 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/preconditioner/jacobi.hpp>
-
-
 #include <algorithm>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
-
+#include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include "core/base/extended_float.hpp"
 #include "core/test/utils.hpp"
diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp
index 64ddb667c14..2c64538e9b2 100644
--- a/reference/test/reorder/mc64.cpp
+++ b/reference/test/reorder/mc64.cpp
@@ -2,17 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/mc64.hpp>
-
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -20,7 +15,7 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
-
+#include <ginkgo/core/reorder/mc64.hpp>
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
index 85352854934..15f90839e1b 100644
--- a/reference/test/reorder/mc64_kernels.cpp
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -2,8 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/mc64.hpp>
-
+#include "core/reorder/mc64.hpp"
 
 #include <algorithm>
 #include <cmath>
@@ -11,17 +10,14 @@
 #include <limits>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
-
+#include <ginkgo/core/reorder/mc64.hpp>
 
 #include "core/components/addressable_pq.hpp"
-#include "core/reorder/mc64.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "matrices/config.hpp"
diff --git a/reference/test/reorder/nested_dissection.cpp b/reference/test/reorder/nested_dissection.cpp
index c5054a19f6f..64a900f7742 100644
--- a/reference/test/reorder/nested_dissection.cpp
+++ b/reference/test/reorder/nested_dissection.cpp
@@ -2,13 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/nested_dissection.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
+
+#include <ginkgo/core/reorder/nested_dissection.hpp>
 #include GKO_METIS_HEADER
 
 
@@ -18,7 +16,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 
diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp
index e1a330c046f..f8a18e5b6ec 100644
--- a/reference/test/reorder/rcm.cpp
+++ b/reference/test/reorder/rcm.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/rcm.hpp>
-
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
-
+#include <ginkgo/core/reorder/rcm.hpp>
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/reference/test/reorder/rcm_kernels.cpp b/reference/test/reorder/rcm_kernels.cpp
index 0f36839ba33..f9d44f2dfd6 100644
--- a/reference/test/reorder/rcm_kernels.cpp
+++ b/reference/test/reorder/rcm_kernels.cpp
@@ -2,22 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/rcm.hpp>
-
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
-
+#include <ginkgo/core/reorder/rcm.hpp>
 
 #include "core/test/utils/assertions.hpp"
 
diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp
index d511ed4a3f4..75ab3728a30 100644
--- a/reference/test/reorder/scaled_reordered.cpp
+++ b/reference/test/reorder/scaled_reordered.cpp
@@ -2,27 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/reorder/scaled_reordered.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/reorder/rcm.hpp>
+#include <ginkgo/core/reorder/scaled_reordered.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index 2051b1764b4..ddb6d09e12a 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_bicgstab.hpp>
-
+#include "core/solver/batch_bicgstab_kernels.hpp"
 
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
-
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
 
 #include "core/base/batch_utilities.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
 
diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
index a3c5dde92bc..4ccabfb8849 100644
--- a/reference/test/solver/batch_cg_kernels.cpp
+++ b/reference/test/solver/batch_cg_kernels.cpp
@@ -2,26 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/batch_cg.hpp>
-
+#include "core/solver/batch_cg_kernels.hpp"
 
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
-
+#include <ginkgo/core/solver/batch_cg.hpp>
 
 #include "core/base/batch_utilities.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
-#include "core/solver/batch_cg_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
 
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index c615015ce63..837920ec520 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicg.hpp>
-
+#include "core/solver/bicg_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/bicg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/bicg_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index 32966138310..f09e78137b3 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/bicgstab.hpp>
-
+#include "core/solver/bicgstab_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/bicgstab_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index 95c31bbd500..eeeca82494c 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -2,25 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cb_gmres.hpp>
-
-
 #include <tuple>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/solver/cb_gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index 6892d322906..7cbc629717c 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cg.hpp>
-
+#include "core/solver/cg_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/cg_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index ee6bad8ab8f..9024623ade8 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/cgs.hpp>
-
+#include "core/solver/cgs_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/cgs.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/cgs_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp
index 23b7a069b90..1fb147a7a2b 100644
--- a/reference/test/solver/direct.cpp
+++ b/reference/test/solver/direct.cpp
@@ -2,26 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/direct.hpp>
-
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/direct.hpp>
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "matrices/config.hpp"
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index de20c8e47d9..2b7b97ffc3b 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/fcg.hpp>
-
+#include "core/solver/fcg_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/fcg.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/fcg_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index 498447e6763..a81c3ce4285 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -2,28 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gcr.hpp>
-
+#include "core/solver/gcr_kernels.hpp"
 
 #include <algorithm>
 #include <limits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/solver/gcr.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/gcr_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 7968bd2ac4f..00f7766179f 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -2,29 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/gmres.hpp>
-
+#include "core/solver/gmres_kernels.hpp"
 
 #include <algorithm>
 #include <limits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/solver/common_gmres_kernels.hpp"
-#include "core/solver/gmres_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp
index 056ddf0dfe5..c3ca4fc1bd9 100644
--- a/reference/test/solver/idr_kernels.cpp
+++ b/reference/test/solver/idr_kernels.cpp
@@ -2,21 +2,17 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/idr.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/solver/idr.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index f41c1803afa..b0c1029f693 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/ir.hpp>
-
+#include "core/solver/ir_kernels.hpp"
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/gmres.hpp>
+#include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
-#include "core/solver/ir_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp
index b716235e587..d52ee028b53 100644
--- a/reference/test/solver/lower_trs.cpp
+++ b/reference/test/solver/lower_trs.cpp
@@ -4,16 +4,13 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index 351550f2d44..da2e55700f5 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -2,12 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <memory>
+#include "core/solver/lower_trs_kernels.hpp"
 
+#include <memory>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -18,8 +18,6 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/lower_trs_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index b7fa09058fb..57ba8fba84d 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -2,12 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/solver/multigrid.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -16,11 +12,11 @@
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/solver/ir.hpp>
+#include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp
index f9d30a5a595..9980c51f9d1 100644
--- a/reference/test/solver/upper_trs.cpp
+++ b/reference/test/solver/upper_trs.cpp
@@ -4,16 +4,13 @@
 
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index 1c259b65e14..dc964e6b83d 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -2,12 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <memory>
+#include "core/solver/upper_trs_kernels.hpp"
 
+#include <memory>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -18,8 +18,6 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
-
-#include "core/solver/upper_trs_kernels.hpp"
 #include "core/test/utils.hpp"
 
 
diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp
index 29183c29924..900e8131aba 100644
--- a/reference/test/stop/combined.cpp
+++ b/reference/test/stop/combined.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/combined.hpp>
-
-
 #include <chrono>
 #include <thread>
+
+#include <ginkgo/core/stop/combined.hpp>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
 #endif  // defined(_WIN32) || defined(__CYGWIN__)
@@ -14,7 +13,6 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
diff --git a/reference/test/stop/criterion_kernels.cpp b/reference/test/stop/criterion_kernels.cpp
index 784544afe92..39ea9c72098 100644
--- a/reference/test/stop/criterion_kernels.cpp
+++ b/reference/test/stop/criterion_kernels.cpp
@@ -2,12 +2,9 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/criterion.hpp>
-
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
 
diff --git a/reference/test/stop/iteration.cpp b/reference/test/stop/iteration.cpp
index ec869f82812..fbe53888c61 100644
--- a/reference/test/stop/iteration.cpp
+++ b/reference/test/stop/iteration.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/iteration.hpp>
-
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/stop/iteration.hpp>
+
 
 namespace {
 
diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp
index 2996fe153a4..43b865796b7 100644
--- a/reference/test/stop/residual_norm_kernels.cpp
+++ b/reference/test/stop/residual_norm_kernels.cpp
@@ -2,18 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/residual_norm.hpp>
-
-
 #include <type_traits>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/math.hpp>
-
+#include <ginkgo/core/stop/residual_norm.hpp>
 
 #include "core/test/utils.hpp"
 
diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp
index 42eeb6e6a63..a5ea6107fbf 100644
--- a/reference/test/stop/time.cpp
+++ b/reference/test/stop/time.cpp
@@ -2,11 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/stop/time.hpp>
-
-
 #include <chrono>
 #include <thread>
+
+#include <ginkgo/core/stop/time.hpp>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
 #endif  // defined(_WIN32) || defined(__CYGWIN__)
diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp
index 95286f7571a..98f1ec68e0d 100644
--- a/reference/test/utils/assertions_test.cpp
+++ b/reference/test/utils/assertions_test.cpp
@@ -4,14 +4,11 @@
 
 #include "core/test/utils/assertions.hpp"
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 
 
diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp
index 07749d9bed2..d15e6d2165f 100644
--- a/test/base/batch_multi_vector_kernels.cpp
+++ b/test/base/batch_multi_vector_kernels.cpp
@@ -2,20 +2,16 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-
+#include "core/base/batch_multi_vector_kernels.hpp"
 
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-
-#include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index 26dbcb73cf4..59c9ec209c3 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -2,22 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/device_matrix_data.hpp>
-
+#include "core/base/device_matrix_data_kernels.hpp"
 
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 
-
-#include "core/base/device_matrix_data_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/base/executor.cpp b/test/base/executor.cpp
index 541360d01d4..3b93d7e748a 100644
--- a/test/base/executor.cpp
+++ b/test/base/executor.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/executor.hpp>
-
+#include "test/utils/executor.hpp"
 
 #include <map>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/executor.hpp>
 
 #include "core/test/utils/assertions.hpp"
-#include "test/utils/executor.hpp"
 
 
 namespace reference {
diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp
index b16b5fb9046..8bb5519c457 100644
--- a/test/base/index_range.cpp
+++ b/test/base/index_range.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <memory>
+#include "core/base/index_range.hpp"
 
+#include <memory>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
-#include "core/base/index_range.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp
index c746a5b3461..8107e6a3eef 100644
--- a/test/base/kernel_launch_generic.cpp
+++ b/test/base/kernel_launch_generic.cpp
@@ -2,23 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "common/unified/base/kernel_launch.hpp"
-
-
 #include <algorithm>
 #include <memory>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
+#include "common/unified/base/kernel_launch.hpp"
 #include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/base/array_access.hpp"
diff --git a/test/base/timer.cpp b/test/base/timer.cpp
index a817ddeef96..f2f0da113bf 100644
--- a/test/base/timer.cpp
+++ b/test/base/timer.cpp
@@ -2,15 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/base/timer.hpp>
-
-
 #include <map>
 #include <thread>
 
-
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/timer.hpp>
 
 #include "core/test/utils/assertions.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp
index 08dd52f35e3..a18ab1534c9 100644
--- a/test/components/absolute_array_kernels.cpp
+++ b/test/components/absolute_array_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/absolute_array_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 3997c5830ea..122edb4dc27 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/fill_array_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp
index 053171ffbe2..3e783206af5 100644
--- a/test/components/format_conversion_kernels.cpp
+++ b/test/components/format_conversion_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/components/format_conversion_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/components/precision_conversion_kernels.cpp b/test/components/precision_conversion_kernels.cpp
index f75aa948286..dcd6a0dba83 100644
--- a/test/components/precision_conversion_kernels.cpp
+++ b/test/components/precision_conversion_kernels.cpp
@@ -8,13 +8,10 @@
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp
index 73cb0c7874e..1ec97b6eadc 100644
--- a/test/components/prefix_sum_kernels.cpp
+++ b/test/components/prefix_sum_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-
 #include <limits>
 #include <memory>
 #include <random>
 #include <type_traits>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index dfc2e046c84..35c358099ad 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/components/reduce_array_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp
index cafd7b4da35..718fe84ce92 100644
--- a/test/distributed/index_map_kernels.cpp
+++ b/test/distributed/index_map_kernels.cpp
@@ -4,22 +4,18 @@
 
 #include "core/distributed/index_map_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/distributed/index_map.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/distributed/partition_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 8445aee6a0e..3dcede95bfb 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -4,18 +4,14 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-
 #include <algorithm>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp
index 9e985ffec9e..5b014625e7d 100644
--- a/test/distributed/partition_helper_kernels.cpp
+++ b/test/distributed/partition_helper_kernels.cpp
@@ -5,10 +5,8 @@
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
-
 #include "core/base/iterator_factory.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
 #include "core/test/utils.hpp"
diff --git a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp
index e857e734154..b00d266170c 100644
--- a/test/distributed/partition_kernels.cpp
+++ b/test/distributed/partition_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/distributed/partition_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp
index 86faca6b2b2..294b72d861e 100644
--- a/test/distributed/vector_kernels.cpp
+++ b/test/distributed/vector_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/distributed/vector_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 #include <vector>
 
-
 #include <gtest/gtest-typed-test.h>
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index c1d0a6c7336..b7c290eec17 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/factorization/cholesky_kernels.hpp"
 
-
 #include <algorithm>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/cholesky.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index c7cdbe5d435..ddb38575e03 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -7,15 +7,12 @@
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index 8a5ced59041..bc7edeac57f 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -7,15 +7,12 @@
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/ilu.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 0ea06bed506..035e938c7c8 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/factorization/lu_kernels.hpp"
 
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -20,7 +17,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/cholesky_kernels.hpp"
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index 40a40b5acf5..64541612343 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -4,24 +4,20 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 81d1dd83ffb..b157971ff90 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -4,24 +4,20 @@
 
 #include "core/factorization/par_ict_kernels.hpp"
 
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 0d853af0745..a2f3f774ba7 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -4,24 +4,20 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/base/iterator_factory.hpp"
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/test/utils.hpp"
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index 7d46f7979ac..6426e725fdf 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -4,24 +4,20 @@
 
 #include "core/factorization/par_ilut_kernels.hpp"
 
-
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
diff --git a/test/log/profiler_hook.cpp b/test/log/profiler_hook.cpp
index 656134ce981..6e0ed2933db 100644
--- a/test/log/profiler_hook.cpp
+++ b/test/log/profiler_hook.cpp
@@ -4,13 +4,10 @@
 
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/log/profiler_hook.hpp>
 
-
 #include "test/utils/executor.hpp"
 
 
diff --git a/test/matrix/batch_csr_kernels.cpp b/test/matrix/batch_csr_kernels.cpp
index 28f3ba65b98..d2a1b2d9aa4 100644
--- a/test/matrix/batch_csr_kernels.cpp
+++ b/test/matrix/batch_csr_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/batch_csr_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/array_generator.hpp"
diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp
index 4a6665b80c1..222ccf6e4b9 100644
--- a/test/matrix/batch_dense_kernels.cpp
+++ b/test/matrix/batch_dense_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/batch_dense_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/array_generator.hpp"
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
index 72406cac549..7edef2c4fb0 100644
--- a/test/matrix/batch_ell_kernels.cpp
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/batch_ell_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/test/matrix/coo_kernels.cpp b/test/matrix/coo_kernels.cpp
index 26bcdb8791b..3da488cf843 100644
--- a/test/matrix/coo_kernels.cpp
+++ b/test/matrix/coo_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp
index d3a7bb8f8e5..1a1f100e1fd 100644
--- a/test/matrix/csr_kernels.cpp
+++ b/test/matrix/csr_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 4ff8e749766..9272e99546e 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -2,19 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/matrix/csr.hpp>
-
-
 #include <random>
 #include <stdexcept>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
@@ -25,7 +21,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index 56ca536187e..b8fd4d7900c 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -4,16 +4,13 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -27,7 +24,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/matrix/diagonal_kernels.cpp b/test/matrix/diagonal_kernels.cpp
index ffe1f4267e1..ca0a9eff205 100644
--- a/test/matrix/diagonal_kernels.cpp
+++ b/test/matrix/diagonal_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-
 #include <algorithm>
 #include <numeric>
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp
index b61d97a0a7a..78af81ccafc 100644
--- a/test/matrix/ell_kernels.cpp
+++ b/test/matrix/ell_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/ell_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp
index 737bb926bfe..a3b85143bf0 100644
--- a/test/matrix/fbcsr_kernels.cpp
+++ b/test/matrix/fbcsr_kernels.cpp
@@ -4,17 +4,13 @@
 
 #include "core/matrix/fbcsr_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/fbcsr.hpp>
 
-
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/fb_matrix_generator.hpp"
diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp
index ed186b1df60..056087fb9f3 100644
--- a/test/matrix/fft_kernels.cpp
+++ b/test/matrix/fft_kernels.cpp
@@ -4,17 +4,14 @@
 
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/fft.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/hybrid_kernels.cpp b/test/matrix/hybrid_kernels.cpp
index 8fc3346d667..64179259deb 100644
--- a/test/matrix/hybrid_kernels.cpp
+++ b/test/matrix/hybrid_kernels.cpp
@@ -4,20 +4,16 @@
 
 #include "core/matrix/hybrid_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index 3a18b6700ab..7398b3edb06 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -7,10 +7,8 @@
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
@@ -23,7 +21,6 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp
index 7c3aac97f55..e6324c15f1d 100644
--- a/test/matrix/permutation_kernels.cpp
+++ b/test/matrix/permutation_kernels.cpp
@@ -5,14 +5,11 @@
 #include <algorithm>
 #include <numeric>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp
index d81a40b6f63..7239862a8d9 100644
--- a/test/matrix/scaled_permutation_kernels.cpp
+++ b/test/matrix/scaled_permutation_kernels.cpp
@@ -5,13 +5,10 @@
 #include <algorithm>
 #include <numeric>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/scaled_permutation.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/sellp_kernels.cpp b/test/matrix/sellp_kernels.cpp
index ae6b9053e45..053369f7fa6 100644
--- a/test/matrix/sellp_kernels.cpp
+++ b/test/matrix/sellp_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -18,7 +15,6 @@
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp
index 010bd7faa86..8d3728f240d 100644
--- a/test/matrix/sparsity_csr_kernels.cpp
+++ b/test/matrix/sparsity_csr_kernels.cpp
@@ -4,21 +4,17 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-
 #include <algorithm>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index d836eb008d9..8a201c78733 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -6,13 +6,10 @@
 #include <memory>
 #include <random>
 
-
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
@@ -22,7 +19,6 @@
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/mpi/executor.hpp"
 
diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp
index 8e72588128b..ccd7dd46b44 100644
--- a/test/mpi/multigrid/pgm.cpp
+++ b/test/mpi/multigrid/pgm.cpp
@@ -5,13 +5,10 @@
 #include <array>
 #include <memory>
 
-
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
@@ -21,7 +18,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/multigrid/pgm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/mpi/executor.hpp"
 
diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp
index c19e3277510..b89295acc13 100644
--- a/test/mpi/partition_helpers.cpp
+++ b/test/mpi/partition_helpers.cpp
@@ -5,7 +5,6 @@
 #include <ginkgo/core/distributed/partition.hpp>
 #include <ginkgo/core/distributed/partition_helpers.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/mpi/executor.hpp"
 
diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp
index 5c17254a970..cf29ea77016 100644
--- a/test/mpi/preconditioner/schwarz.cpp
+++ b/test/mpi/preconditioner/schwarz.cpp
@@ -6,13 +6,10 @@
 #include <memory>
 #include <random>
 
-
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
@@ -29,7 +26,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/utils/matrix_utils.hpp"
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index e4e7077e4c7..4548dc5d6b7 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -8,10 +8,8 @@
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
@@ -30,7 +28,6 @@
 #include <ginkgo/core/solver/multigrid.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/utils/matrix_utils.hpp"
diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp
index 11fe41aded2..3af6886dd84 100644
--- a/test/mpi/vector.cpp
+++ b/test/mpi/vector.cpp
@@ -5,13 +5,10 @@
 #include <memory>
 #include <random>
 
-
 #include <mpi.h>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/log/logger.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/mpi/executor.hpp"
 
diff --git a/test/multigrid/fixed_coarsening_kernels.cpp b/test/multigrid/fixed_coarsening_kernels.cpp
index 91699b8631e..0f3c7e56b2a 100644
--- a/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/test/multigrid/fixed_coarsening_kernels.cpp
@@ -8,10 +8,8 @@
 #include <string>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -23,7 +21,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp
index 10e5cf01a7a..b0e3b338cbd 100644
--- a/test/multigrid/pgm_kernels.cpp
+++ b/test/multigrid/pgm_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/multigrid/pgm_kernels.hpp"
 
-
 #include <fstream>
 #include <random>
 #include <string>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -24,7 +21,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index f8a1bd015ef..9bdbb015949 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 
-
 #include <limits>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -21,7 +18,6 @@
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
-
 #include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp
index 6e737d31790..077379ab226 100644
--- a/test/preconditioner/isai_kernels.cpp
+++ b/test/preconditioner/isai_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp
index d7586a9890e..5ae7c56e715 100644
--- a/test/preconditioner/jacobi_kernels.cpp
+++ b/test/preconditioner/jacobi_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp
index 27639d11aad..8137ed8ad7e 100644
--- a/test/reorder/amd.cpp
+++ b/test/reorder/amd.cpp
@@ -7,14 +7,11 @@
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/reorder/amd.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
diff --git a/test/reorder/mc64.cpp b/test/reorder/mc64.cpp
index d4e4b176da7..0cc3ea33a3d 100644
--- a/test/reorder/mc64.cpp
+++ b/test/reorder/mc64.cpp
@@ -4,11 +4,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/reorder/mc64.hpp>
 
-
 #include "core/test/utils/assertions.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/reorder/nested_dissection.cpp b/test/reorder/nested_dissection.cpp
index 93517b73f6d..2d11bdccb12 100644
--- a/test/reorder/nested_dissection.cpp
+++ b/test/reorder/nested_dissection.cpp
@@ -4,14 +4,11 @@
 
 #include <fstream>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/reorder/nested_dissection.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/reorder/rcm.cpp b/test/reorder/rcm.cpp
index 923a5c1f10f..848d0deea5d 100644
--- a/test/reorder/rcm.cpp
+++ b/test/reorder/rcm.cpp
@@ -8,16 +8,13 @@
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/reorder/rcm.hpp>
 
-
 #include "core/components/disjoint_sets.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index 14bca65e41f..8f4bfca00cc 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -4,14 +4,11 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 7c013020686..7b5a85a1e5b 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -4,21 +4,17 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_cg.hpp>
 
-
 #include "core/base/batch_utilities.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp
index ab63b01f9cc..5f9dd818711 100644
--- a/test/solver/bicg_kernels.cpp
+++ b/test/solver/bicg_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/bicg_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "matrices/config.hpp"
diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index 4f68edd6a8e..9548c99daf9 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/bicgstab_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -20,7 +17,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp
index 3b5f5956c2e..45a752a2292 100644
--- a/test/solver/cb_gmres_kernels.cpp
+++ b/test/solver/cb_gmres_kernels.cpp
@@ -4,15 +4,12 @@
 
 #include "core/solver/cb_gmres_kernels.hpp"
 
-
 #include <algorithm>
 #include <cmath>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -21,7 +18,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/solver/cb_gmres_accessor.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp
index be9dc052314..b4408851da6 100644
--- a/test/solver/cg_kernels.cpp
+++ b/test/solver/cg_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/cg_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp
index 6c2bab293e3..392167d2106 100644
--- a/test/solver/cgs_kernels.cpp
+++ b/test/solver/cgs_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/cgs_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index be68593628f..c2e6c757f76 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -5,17 +5,14 @@
 #include <algorithm>
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 #include <ginkgo/core/solver/direct.hpp>
 
-
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/cholesky_kernels.hpp"
diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp
index f1f09f759bc..9ad2be9eb05 100644
--- a/test/solver/fcg_kernels.cpp
+++ b/test/solver/fcg_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/fcg_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index 7a00b3fed30..d26b5ef265c 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/gcr_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -20,7 +17,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index 08259c91ce0..52ee885e29d 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/gmres_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -20,7 +17,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/solver/common_gmres_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index b165824dbe0..7afac1c2f33 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -4,11 +4,9 @@
 
 #include "core/solver/idr_kernels.hpp"
 
-
 #include <fstream>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
 
@@ -28,7 +26,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp
index 7a8e84324bd..114dee3c06b 100644
--- a/test/solver/ir_kernels.cpp
+++ b/test/solver/ir_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/ir_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -19,7 +16,6 @@
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp
index 1f99499a129..4bccf283faf 100644
--- a/test/solver/lower_trs_kernels.cpp
+++ b/test/solver/lower_trs_kernels.cpp
@@ -5,17 +5,14 @@
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp
index 4b4b0157df5..894f4280346 100644
--- a/test/solver/multigrid_kernels.cpp
+++ b/test/solver/multigrid_kernels.cpp
@@ -4,13 +4,10 @@
 
 #include "core/solver/multigrid_kernels.hpp"
 
-
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -18,7 +15,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index fab351227f9..5b24234ce14 100644
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -8,10 +8,8 @@
 #include <random>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -31,7 +29,6 @@
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp
index 33d2196e097..c7041865dd1 100644
--- a/test/solver/upper_trs_kernels.cpp
+++ b/test/solver/upper_trs_kernels.cpp
@@ -5,17 +5,14 @@
 #include <memory>
 #include <random>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/stop/combined_kernels.cpp b/test/stop/combined_kernels.cpp
index 8d9b0986c91..7e18a0c32aa 100644
--- a/test/stop/combined_kernels.cpp
+++ b/test/stop/combined_kernels.cpp
@@ -4,11 +4,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "test/utils/executor.hpp"
 
 
diff --git a/test/stop/criterion_kernels.cpp b/test/stop/criterion_kernels.cpp
index 91795d59bed..6b6094125ba 100644
--- a/test/stop/criterion_kernels.cpp
+++ b/test/stop/criterion_kernels.cpp
@@ -4,11 +4,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "test/utils/executor.hpp"
 
 
diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp
index ed3b775a61c..7c3ddf6624e 100644
--- a/test/stop/residual_norm_kernels.cpp
+++ b/test/stop/residual_norm_kernels.cpp
@@ -4,11 +4,9 @@
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 
diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp
index 48252ef9bbe..2f4cdeda6e4 100644
--- a/test/test_install/test_install.cpp
+++ b/test/test_install/test_install.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/ginkgo.hpp>
-
-
 #include <chrono>
 #include <cmath>
 #include <cstdlib>
@@ -17,6 +14,8 @@
 #include <utility>
 #include <vector>
 
+#include <ginkgo/ginkgo.hpp>
+
 
 void assert_similar_matrices(gko::ptr_param<const gko::matrix::Dense<>> m1,
                              gko::ptr_param<const gko::matrix::Dense<>> m2,
diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp
index c1d4996267c..f6b35229a19 100644
--- a/test/tools/resource_file_generator.cpp
+++ b/test/tools/resource_file_generator.cpp
@@ -5,7 +5,6 @@
 #include <iomanip>
 #include <thread>
 
-
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 21c40a70c0a..b31d1242f35 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -6,19 +6,14 @@
 #define GKO_TEST_UTILS_EXECUTOR_HPP_
 
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 #include <stdexcept>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "core/test/gtest/resources.hpp"
 
 
diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp
index 180c31f37cf..199de02c054 100644
--- a/test/utils/mpi/executor.hpp
+++ b/test/utils/mpi/executor.hpp
@@ -6,18 +6,13 @@
 #define GKO_TEST_UTILS_MPI_EXECUTOR_HPP_
 
 
-#include <ginkgo/core/base/executor.hpp>
-
-
 #include <memory>
 
-
 #include <gtest/gtest.h>
 
-
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 
-
 #include "test/utils/executor.hpp"
 
 

From 099481fb2234dc29118bc9e542446863d3948466 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 16 Nov 2023 18:20:54 +0000
Subject: [PATCH 024/448] update contributing documentation

---
 CONTRIBUTING.md | 62 ++++++++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 04cf35aec8c..d460087b3c8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -176,12 +176,12 @@ improvements from code reviews.
 
 ### Automatic code formatting
 
-Ginkgo uses [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html)
-(executable is usually named `clang-format`) and a custom `.clang-format`
-configuration file (mostly based on ClangFormat's _Google_ style) to
-automatically format your code. __Make sure you have ClangFormat set up and
-running properly__ ( you should be able to run `make format` from Ginkgo's build
-directory) before committing anything that will end up in a pull request against
+Ginkgo uses [pre-commit](https://pre-commit.com/) to automatically apply
+code formatting when committing changes to git. What formatting is applied
+is managed through [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html)
+with a custom `.clang-format` configuration file (mostly based on ClangFormat's
+_Google_ style). __Make sure you have pre-commit set up and running properly__
+before committing anything that will end up in a pull request against
 `ginkgo-project/ginkgo` repository. In addition, you should __never__ modify the
 `.clang-format` configuration file shipped with Ginkgo. E.g. if ClangFormat has
 trouble reading this file on your system, you should install a newer version of
@@ -339,64 +339,53 @@ Thus, contributors should be aware of the following rules for blank lines:
 
 ### Include statement grouping
 
+The concrete ordering will be done by `clang-format`.
+Here are the rules that `clang-format` will follow.
 In general, all include statements should be present on the top of the file,
-ordered in the following groups, with two blank lines between each group:
+ordered in the following groups, with *one* blank line between each group:
 
-1. Related header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`,
+1. Main header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`,
    or in the unit test`core/test/foo/bar.cpp`)
 2. Standard library headers (e.g. `vector`)
 3. Executor specific library headers (e.g. `omp.h`)
 4. System third-party library headers (e.g. `papi.h`)
-5. Local third-party library headers
-6. Public Ginkgo headers
-7. Private Ginkgo headers
+5. Public Ginkgo headers
+6. Local headers
 
 _Example_: A file `core/base/my_file.cpp` might have an include list like this:
 
 ```c++
-#include <ginkgo/core/base/my_file.hpp>
-
+#include "ginkgo/core/base/my_file.hpp"
 
 #include <algorithm>
 #include <vector>
 #include <tuple>
 
-
 #include <omp.h>
 
-
 #include <papi.h>
 
-
-#include "third_party/blas/cblas.hpp"
-#include "third_party/lapack/lapack.hpp"
-
+#include <third_party/blas/cblas.hpp>
+#include <third_party/lapack/lapack.hpp>
 
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-
 #include "core/base/my_file_kernels.hpp"
 ```
 
 #### Main header
 
-This section presents general rules used to define the main header attributed to
-the file. In the previous example, this would be ` #include
-<ginkgo/core/base/my_file.hpp>`.
+This section presents the handling of the main header attributed to a file.
+For a given file, the main header is the header that contains the declarations of the
+functions, classes, etc., which are implemented in this file.
+In the previous example, this would be ` #include "ginkgo/core/base/my_file.hpp"`.
+The `clang-format` tool figures out the main header. The only intervention from
+a contributor is to *always* include the main header using `"..."`.
 
-General rules:
-1. Some fixed main header.
-2. components:
-  - with `_kernel` suffix looks for the header in the same folder.
-  - without `_kernel` suffix looks for the header in `core`.
-3. `test/utils`: looks for the header in `core`
-4. `core`: looks for the header in `ginkgo`
-5. `test` or `base`: looks for the header in `ginkgo/core`
-6. others: looks for the header in `core`
+Please note that this only applies to implementation files, so files ending in `.cpp` or `.cu`.
 
-_Note_: Please see the detail in the `dev_tools/scripts/config`.
 
 #### Some general comments.
 
@@ -405,13 +394,6 @@ _Note_: Please see the detail in the `dev_tools/scripts/config`.
 When compiling with `GINKGO_CHECK_CIRCULAR_DEPS` enabled, this property is explicitly checked.
 3. The recommendations of the `iwyu` (Include what you use) tool can be used to make sure that the headers are self-sufficient and that the compiled files ( `.cu`, `.cpp`, `.hip.cpp` ) include only what they use. A [CI pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/jobs/584358356) is available that runs with the `iwyu` tool. Please be aware that this tool can be incorrect in some cases.
 
-#### Automatic header arrangement
-
-1. `dev_tools/script/format_header.sh` will take care of the group/sorting of
-   headers according to this guideline.
-2. `make format_header` arranges the header of the modified files in the branch.
-3. `make format_header_all` arranges the header of all files.
-
 
 ### Other Code Formatting not handled by ClangFormat
 

From 95cff7495dd28fb967b1bd449924a16057077105 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 14 Jun 2024 14:48:37 +0200
Subject: [PATCH 025/448] [test] fix whitespace bug in profiler_hook test

---
 core/test/log/profiler_hook.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp
index 40bd6394475..ce74e526879 100644
--- a/core/test/log/profiler_hook.cpp
+++ b/core/test/log/profiler_hook.cpp
@@ -387,8 +387,9 @@ TEST(ProfilerHookTableSummaryWriter, SummaryWorks)
     entries.push_back({"medium", 1ms, 500us, 4});  // check division by count
     entries.push_back({"long", 120s, 60s, 1});
     entries.push_back({"eternal", 24h, 24h, 1});
+    // clang-format off
     const auto expected = R"(Test header
-Overhead estimate 1.0 s 
+Overhead estimate 1.0 ns
 |   name   | total  | total (self) | count |   avg    | avg (self) |
 |----------|-------:|-------------:|------:|---------:|-----------:|
 | eternal  | 1.0 d  |       1.0 d  |     1 |   1.0 d  |     1.0 d  |
@@ -398,8 +399,9 @@ Overhead estimate 1.0 s
 | short    | 1.0 ns |       0.0 ns |     1 |   1.0 ns |     0.0 ns |
 | empty    | 0.0 ns |       0.0 ns |     0 |   0.0 ns |     0.0 ns |
 )";
+    // clang-format on
 
-    writer.write(entries, 1s);
+    writer.write(entries, 1ns);
 
     ASSERT_EQ(ss.str(), expected);
 }
@@ -422,6 +424,7 @@ TEST(ProfilerHookTableSummaryWriter, NestedSummaryWorks)
              2,
              {ProfilerHook::nested_summary_entry{"child", 100ns, 2, {}}}},
          ProfilerHook::nested_summary_entry{"baz", 1ns, 2, {}}}};
+    // clang-format off
     const auto expected = R"(Test header
 Overhead estimate 1.0 ns
 |    name    |  total   | fraction | count |   avg    |
@@ -434,6 +437,7 @@ Overhead estimate 1.0 ns
 |   foo      | 100.0 ns |    5.0 % |     5 |  20.0 ns |
 |   baz      |   1.0 ns |    0.1 % |     2 |   0.0 ns |
 )";
+    // clang-format on
 
     writer.write_nested(entry, 1ns);
 

From 648b10e995588fe485cbc1cf47656893a2d09fcb Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 09:54:00 +0200
Subject: [PATCH 026/448] Revert "adds script to change main include to use ""
 instead of <>"

This reverts commit a9880c2ae61571edc4846887cc299ee7a2da3850.
---
 dev_tools/scripts/change-main-include.py | 60 ------------------------
 1 file changed, 60 deletions(-)
 delete mode 100755 dev_tools/scripts/change-main-include.py

diff --git a/dev_tools/scripts/change-main-include.py b/dev_tools/scripts/change-main-include.py
deleted file mode 100755
index 7ee5e8cd922..00000000000
--- a/dev_tools/scripts/change-main-include.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#! /usr/bin/env python3
-import collections
-import sys
-import re
-
-files = sys.argv[1:]
-
-test_subdirectories = [
-    "base", "config", "distributed", "factorization",
-    "log", "matrix", "multigrid", "preconditioner",
-    "reorder", "solver", "stop", "synthesizer"
-]
-
-false_positives = [
-    "test/utils/executor.hpp",
-    "test/utils/mpi/executor.hpp"
-]
-
-
-for filename in files:
-    suffix = re.compile(r"(\.cpp|\.cu|\.inc)$")
-    main_include_re = re.compile(r"#include\s+<ginkgo/core/([^>]+)>")
-
-    Match = collections.namedtuple("Match", ["idx", "line"])
-
-    if not suffix.search(filename):
-        continue
-
-    if any(f"test/{subdir}" in filename for subdir in test_subdirectories):
-        continue
-
-    if any(filename.endswith(fp) for fp in false_positives):
-        continue
-
-    with open(filename, 'r') as file:
-        content = file.readlines()
-
-    try:
-        first_include = next(Match(idx=i, line=l) for i, l in enumerate(content) if l.startswith("#include"))
-    except:
-        first_include = Match(idx=-1, line="")
-    if "<ginkgo/core" not in first_include.line:
-        continue
-
-    try:
-        next_idx, next_line = next(Match(idx=i, line=l) for i, l in enumerate(content[first_include.idx + 1:]) if l.strip())
-    except:
-        continue
-    if next_line.startswith("#if") and next_idx == 0:
-        continue
-    if "<ginkgo/core" in next_line and next_idx == 0:
-        continue
-    if not next_line.startswith("#include") or next_line.startswith('#include "'):
-        # Uncertain if the first include is the main include
-        print(filename, file=sys.stderr)
-        continue
-
-    content[first_include.idx] = first_include.line.replace('<', '"').replace('>', '"')
-    with open(filename, 'w') as file:
-            file.writelines(content)

From 7afbbf4e377ca639e25bba21d4c783668c0b8a2f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 28 Jun 2024 11:01:49 +0200
Subject: [PATCH 027/448] [cuda] rem deprecated shmem config guard

---
 cuda/base/kernel_config.hpp           | 59 ---------------------------
 cuda/solver/batch_bicgstab_kernels.cu |  4 --
 cuda/solver/batch_cg_kernels.cu       |  4 --
 3 files changed, 67 deletions(-)
 delete mode 100644 cuda/base/kernel_config.hpp

diff --git a/cuda/base/kernel_config.hpp b/cuda/base/kernel_config.hpp
deleted file mode 100644
index f0821a42976..00000000000
--- a/cuda/base/kernel_config.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
-#define GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
-
-
-#include <cuda_runtime.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace detail {
-
-
-template <typename ValueType>
-class shared_memory_config_guard {
-public:
-    using value_type = ValueType;
-    shared_memory_config_guard() : original_config_{}
-    {
-        GKO_ASSERT_NO_CUDA_ERRORS(
-            cudaDeviceGetSharedMemConfig(&original_config_));
-
-        if (sizeof(value_type) == 4) {
-            GKO_ASSERT_NO_CUDA_ERRORS(
-                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
-        } else if (sizeof(value_type) % 8 == 0) {
-            GKO_ASSERT_NO_CUDA_ERRORS(
-                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
-        } else {
-            GKO_ASSERT_NO_CUDA_ERRORS(
-                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeDefault));
-        }
-    }
-
-
-    ~shared_memory_config_guard()
-    {
-        // No need to exit or throw if we cant set the value back.
-        cudaDeviceSetSharedMemConfig(original_config_);
-    }
-
-private:
-    cudaSharedMemConfig original_config_;
-};
-
-
-}  // namespace detail
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index b6ae74a5064..28efaf07475 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -18,7 +18,6 @@
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
@@ -143,9 +142,6 @@ public:
         constexpr int align_multiple = 8;
         const int padded_num_rows =
             ceildiv(mat.num_rows, align_multiple) * align_multiple;
-        auto shem_guard =
-            gko::kernels::cuda::detail::shared_memory_config_guard<
-                value_type>();
         const int shmem_per_blk =
             get_max_dynamic_shared_memory<StopType, PrecType, LogType,
                                           BatchMatrixType, value_type>(exec_);
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 5425bd9cd9c..cff72652629 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -17,7 +17,6 @@
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
@@ -141,9 +140,6 @@ public:
         constexpr int align_multiple = 8;
         const int padded_num_rows =
             ceildiv(mat.num_rows, align_multiple) * align_multiple;
-        auto shem_guard =
-            gko::kernels::cuda::detail::shared_memory_config_guard<
-                value_type>();
         const int shmem_per_blk =
             get_max_dynamic_shared_memory<StopType, PrecType, LogType,
                                           BatchMatrixType, value_type>(exec_);

From 81ebe46b76f8d4f014651b71a480243e3f5d3eae Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 19 May 2024 01:43:14 +0200
Subject: [PATCH 028/448] add executor description

---
 core/device_hooks/cuda_hooks.cpp      |  3 ++
 core/device_hooks/dpcpp_hooks.cpp     |  3 ++
 core/device_hooks/hip_hooks.cpp       |  3 ++
 core/device_hooks/omp_hooks.cpp       |  3 ++
 core/test/gtest/environments.hpp      | 56 ++++++---------------------
 cuda/base/executor.cpp                |  9 +++++
 dpcpp/base/executor.dp.cpp            | 11 ++++++
 hip/base/executor.hip.cpp             |  9 +++++
 include/ginkgo/core/base/executor.hpp | 13 +++++++
 omp/base/executor.cpp                 |  7 ++++
 10 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index abda9e4e0f6..4124ac2bea5 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -148,6 +148,9 @@ scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const
     GKO_NOT_COMPILED(cuda);
 
 
+std::string CudaExecutor::get_description() const GKO_NOT_COMPILED(cuda);
+
+
 std::string CudaError::get_error(int64)
 {
     return "ginkgo CUDA module is not compiled";
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 6cd86581998..470fd9befc4 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -91,6 +91,9 @@ scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const
     GKO_NOT_COMPILED(dpcpp);
 
 
+std::string DpcppExecutor::get_description() const GKO_NOT_COMPILED(dpcpp);
+
+
 int DpcppExecutor::get_num_devices(std::string) { return 0; }
 
 
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 573fb37b8f0..7f3497e8020 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -147,6 +147,9 @@ scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const
     GKO_NOT_COMPILED(hip);
 
 
+std::string HipExecutor::get_description() const GKO_NOT_COMPILED(hip);
+
+
 std::string HipError::get_error(int64)
 {
     return "ginkgo HIP module is not compiled";
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp
index c371f8ff767..33025006a4d 100644
--- a/core/device_hooks/omp_hooks.cpp
+++ b/core/device_hooks/omp_hooks.cpp
@@ -24,6 +24,9 @@ scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec,
     GKO_NOT_COMPILED(omp);
 
 
+std::string OmpExecutor::get_description() const GKO_NOT_COMPILED(omp);
+
+
 int OmpExecutor::get_num_omp_threads() { return 1; }
 
 
diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp
index 01250c41929..1268b92c4c1 100644
--- a/core/test/gtest/environments.hpp
+++ b/core/test/gtest/environments.hpp
@@ -17,6 +17,7 @@
 #include <ginkgo/core/base/mpi.hpp>
 
 #include "core/test/gtest/resources.hpp"
+#include "test/utils/executor.hpp"
 
 
 #ifdef GKO_COMPILING_OMP
@@ -43,60 +44,27 @@ class DeviceEnvironment : public ::testing::Environment {
 public:
     explicit DeviceEnvironment(int rank) : rank_(rank) { print_environment(); }
 
-#ifdef GKO_COMPILING_OMP
     void print_environment() const
     {
+        auto ref = gko::ReferenceExecutor::create();
+#ifdef GKO_COMPILING_OMP
         if (ResourceEnvironment::omp_threads > 0) {
             omp_set_num_threads(ResourceEnvironment::omp_threads);
         }
-        std::stringstream ss;
-        ss << "Rank " << rank_ << ": OMP threads " << omp_get_max_threads()
-           << std::endl;
-        std::cerr << ss.str();
-    }
+        std::shared_ptr<gko::OmpExecutor> exec;
 #elif defined(GKO_COMPILING_CUDA)
-    void print_environment() const
-    {
-        auto device_id = ResourceEnvironment::cuda_device_id;
-        std::stringstream ss;
-        ss << "Rank " << rank_ << ": CUDA device "
-           << gko::kernels::cuda::get_device_name(device_id) << " ID "
-           << device_id << std::endl;
-        std::cerr << ss.str();
-    }
-
-    void TearDown() override
-    {
-        gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id);
-    }
+        std::shared_ptr<gko::CudaExecutor> exec;
 #elif defined(GKO_COMPILING_HIP)
-    void print_environment() const
-    {
-        auto device_id = ResourceEnvironment::hip_device_id;
-        std::stringstream ss;
-        ss << "Rank " << rank_ << ": HIP device "
-           << gko::kernels::hip::get_device_name(device_id) << " ID "
-           << device_id << std::endl;
-        std::cerr << ss.str();
-    }
-
-    void TearDown() override
-    {
-        gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id);
-    }
+        std::shared_ptr<gko::HipExecutor> exec;
 #elif defined(GKO_COMPILING_DPCPP)
-    void print_environment() const
-    {
-        auto device_id = ResourceEnvironment::sycl_device_id;
-        std::stringstream ss;
-        ss << "Rank " << rank_ << ": SYCL device "
-           << gko::kernels::dpcpp::get_device_name(device_id) << " ID "
-           << device_id << std::endl;
-        std::cerr << ss.str();
-    }
+        std::shared_ptr<gko::DpcppExecutor> exec;
 #else
-    void print_environment() const {}
+        std::shared_ptr<gko::ReferenceExecutor> exec;
 #endif
+        init_executor(ref, exec);
+        std::cerr << "Rank " << rank_ << ": " << exec->get_description()
+                  << std::endl;
+    }
 
 private:
     int rank_;
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 1b1410ca8bb..caf5269fa3d 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -19,6 +19,7 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/cusparse_handle.hpp"
+#include "cuda/base/device.hpp"
 #include "cuda/base/scoped_device_id.hpp"
 
 
@@ -178,6 +179,14 @@ scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const
 }
 
 
+std::string CudaExecutor::get_description() const
+{
+    return "CudaExecutor on device " + std::to_string(this->get_device_id()) +
+           " (" + gko::kernels::cuda::get_device_name(this->get_device_id()) +
+           ") with host " + this->get_master()->get_description();
+}
+
+
 int CudaExecutor::get_num_devices()
 {
     int deviceCount = 0;
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 29f0810d9d9..8a7460f6bcd 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -162,6 +162,17 @@ scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const
 }
 
 
+std::string DpcppExecutor::get_description() const
+{
+    return "DpcppExecutor on device " + std::to_string(this->get_device_id()) +
+           " (" +
+           this->get_queue()
+               ->get_device()
+               .get_info<sycl::info::device::name>() +
+           ") with host " + this->get_master()->get_description();
+}
+
+
 int DpcppExecutor::get_num_devices(std::string device_type)
 {
     return detail::get_devices(device_type).size();
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 9e09912c5c9..d4b1d614681 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -12,6 +12,7 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "hip/base/device.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
@@ -173,6 +174,14 @@ scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const
 }
 
 
+std::string HipExecutor::get_description() const
+{
+    return "HipExecutor on device " + std::to_string(this->get_device_id()) +
+           " (" + gko::kernels::hip::get_device_name(this->get_device_id()) +
+           ") with host " + this->get_master()->get_description();
+}
+
+
 int HipExecutor::get_num_devices()
 {
     int deviceCount = 0;
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 0d592485c1c..95373b3e847 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -865,6 +865,9 @@ class Executor : public log::EnableLogging<Executor> {
 
     virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
 
+    /** @return a textual representation of the executor and its device. */
+    virtual std::string get_description() const = 0;
+
 protected:
     /**
      * A struct that abstracts the executor info for different executors
@@ -1368,6 +1371,8 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
 
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
+    std::string get_description() const override;
+
 protected:
     OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
         : alloc_{std::move(alloc)}
@@ -1426,6 +1431,8 @@ class ReferenceExecutor : public OmpExecutor {
         return {this, 0};
     }
 
+    std::string get_description() const override { return "ReferenceExecutor"; }
+
     void run(const Operation& op) const override
     {
         this->template log<log::Logger::operation_launched>(this, &op);
@@ -1532,6 +1539,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
+    std::string get_description() const override;
+
     /**
      * Get the CUDA device id of the device associated to this executor.
      */
@@ -1752,6 +1761,8 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
+    std::string get_description() const override;
+
     /**
      * Get the HIP device id of the device associated to this executor.
      */
@@ -1953,6 +1964,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
+    std::string get_description() const override;
+
     /**
      * Get the DPCPP device id of the device associated to this executor.
      *
diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp
index 5e846946e5e..7505b78ede6 100644
--- a/omp/base/executor.cpp
+++ b/omp/base/executor.cpp
@@ -20,4 +20,11 @@ int OmpExecutor::get_num_omp_threads()
 }
 
 
+std::string OmpExecutor::get_description() const
+{
+    return "OmpExecutor (" + std::to_string(this->get_num_omp_threads()) +
+           " threads)";
+}
+
+
 }  // namespace gko

From d1c30d271a830831fc2c9b2de2f50f189df07e01 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 19 May 2024 01:43:48 +0200
Subject: [PATCH 029/448] use Executor::get_description() in benchmarks

---
 benchmark/blas/blas.cpp                                     | 2 +-
 benchmark/blas/distributed/multi_vector.cpp                 | 6 +++---
 benchmark/conversion/conversion.cpp                         | 2 +-
 benchmark/preconditioner/preconditioner.cpp                 | 2 +-
 benchmark/solver/distributed/solver.cpp                     | 2 +-
 benchmark/solver/solver.cpp                                 | 2 +-
 benchmark/sparse_blas/sparse_blas.cpp                       | 2 +-
 benchmark/spmv/distributed/spmv.cpp                         | 6 +++---
 benchmark/spmv/spmv.cpp                                     | 3 ++-
 benchmark/test/reference/blas.profile.stderr                | 2 +-
 benchmark/test/reference/blas.simple.stderr                 | 2 +-
 benchmark/test/reference/conversion.all.stderr              | 2 +-
 benchmark/test/reference/conversion.matrix.stderr           | 2 +-
 benchmark/test/reference/conversion.profile.stderr          | 2 +-
 benchmark/test/reference/conversion.simple.stderr           | 2 +-
 benchmark/test/reference/distributed_solver.matrix.stderr   | 2 +-
 benchmark/test/reference/distributed_solver.profile.stderr  | 2 +-
 benchmark/test/reference/distributed_solver.simple.stderr   | 2 +-
 .../test/reference/multi_vector_distributed.profile.stderr  | 2 +-
 .../test/reference/multi_vector_distributed.simple.stderr   | 2 +-
 benchmark/test/reference/preconditioner.matrix.stderr       | 2 +-
 benchmark/test/reference/preconditioner.precond.stderr      | 2 +-
 benchmark/test/reference/preconditioner.profile.stderr      | 2 +-
 benchmark/test/reference/preconditioner.reordered.stderr    | 2 +-
 benchmark/test/reference/preconditioner.simple.stderr       | 2 +-
 benchmark/test/reference/solver.matrix.stderr               | 2 +-
 benchmark/test/reference/solver.profile.stderr              | 2 +-
 benchmark/test/reference/solver.reordered.stderr            | 2 +-
 benchmark/test/reference/solver.simple.stderr               | 2 +-
 benchmark/test/reference/sparse_blas.matrix.stderr          | 2 +-
 benchmark/test/reference/sparse_blas.profile.stderr         | 2 +-
 benchmark/test/reference/sparse_blas.reordered.stderr       | 2 +-
 benchmark/test/reference/sparse_blas.simple.stderr          | 2 +-
 benchmark/test/reference/spmv.matrix.stderr                 | 2 +-
 benchmark/test/reference/spmv.profile.stderr                | 2 +-
 benchmark/test/reference/spmv.reordered.stderr              | 2 +-
 benchmark/test/reference/spmv.simple.stderr                 | 2 +-
 benchmark/test/reference/spmv_distributed.profile.stderr    | 2 +-
 benchmark/test/reference/spmv_distributed.simple.stderr     | 2 +-
 benchmark/utils/general.hpp                                 | 6 +++---
 40 files changed, 47 insertions(+), 46 deletions(-)

diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 57e0152d824..2a682f49917 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -104,8 +104,8 @@ Parameters for a benchmark case are:
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::string extra_information = "The operations are " + FLAGS_operations;
-    print_general_information(extra_information);
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
+    print_general_information(extra_information, exec);
 
     auto test_cases = json::parse(get_input_stream());
 
diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp
index a4be6c502c1..e0dfa36fb19 100644
--- a/benchmark/blas/distributed/multi_vector.cpp
+++ b/benchmark/blas/distributed/multi_vector.cpp
@@ -41,14 +41,14 @@ Parameters for a benchmark case are:
     std::string format = Generator::get_example_config();
     initialize_argument_parsing(&argc, &argv, header, format, do_print);
 
+    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
+
     if (do_print) {
         std::string extra_information =
             "The operations are " + FLAGS_operations;
-        print_general_information(extra_information);
+        print_general_information(extra_information, exec);
     }
 
-    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
-
     std::string json_input = broadcast_json_input(get_input_stream(), comm);
     auto test_cases = json::parse(json_input);
 
diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
index 17d2ac48e47..59c052b11e7 100644
--- a/benchmark/conversion/conversion.cpp
+++ b/benchmark/conversion/conversion.cpp
@@ -163,9 +163,9 @@ int main(int argc, char* argv[])
 
     std::string extra_information =
         std::string() + "The formats are " + FLAGS_formats;
-    print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
+    print_general_information(extra_information, exec);
     auto formats = split(FLAGS_formats, ',');
 
     auto test_cases = json::parse(get_input_stream());
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 3c737d67d7b..b9dfe1d8369 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -275,9 +275,9 @@ int main(int argc, char* argv[])
 
     std::string extra_information =
         "Running with preconditioners: " + FLAGS_preconditioners;
-    print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
+    print_general_information(extra_information, exec);
     auto& engine = get_engine();
 
     auto preconditioners = split(FLAGS_preconditioners, ',');
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
index 196bae5331b..9605d5bbf8f 100644
--- a/benchmark/solver/distributed/solver.cpp
+++ b/benchmark/solver/distributed/solver.cpp
@@ -91,7 +91,7 @@ int main(int argc, char* argv[])
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
         std::to_string(FLAGS_nrhs);
     if (do_print) {
-        print_general_information(extra_information);
+        print_general_information(extra_information, exec);
     }
 
     std::set<std::string> supported_solvers = {"cg", "fcg", "cgs", "bicgstab",
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index 94956cadd21..548c843f898 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -43,9 +43,9 @@ int main(int argc, char* argv[])
         std::to_string(FLAGS_max_iters) + " iterations and residual goal of " +
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
         std::to_string(FLAGS_nrhs);
-    print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
+    print_general_information(extra_information, exec);
 
     json test_cases;
     if (!FLAGS_overhead) {
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index 3897689ca11..cfd87b53439 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -166,7 +166,7 @@ int main(int argc, char* argv[])
     auto test_cases = json::parse(get_input_stream());
 
     std::string extra_information = "The operations are " + FLAGS_operations;
-    print_general_information(extra_information);
+    print_general_information(extra_information, exec);
 
     run_test_cases(SparseBlasBenchmark{}, exec,
                    get_timer(exec, FLAGS_gpu_timer), test_cases);
diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp
index 2c2e0f57b0e..135e7e4e4f0 100644
--- a/benchmark/spmv/distributed/spmv.cpp
+++ b/benchmark/spmv/distributed/spmv.cpp
@@ -49,16 +49,16 @@ int main(int argc, char* argv[])
     initialize_argument_parsing_matrix(&argc, &argv, header, format, "",
                                        do_print);
 
+    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
+
     if (do_print) {
         std::string extra_information =
             "The formats are [" + FLAGS_local_formats + "]x[" +
             FLAGS_non_local_formats + "]\n" +
             "The number of right hand sides is " + std::to_string(FLAGS_nrhs);
-        print_general_information(extra_information);
+        print_general_information(extra_information, exec);
     }
 
-    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
-
     auto local_formats = split(FLAGS_local_formats, ',');
     auto non_local_formats = split(FLAGS_non_local_formats, ',');
     std::vector<std::string> formats;
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 960921257e3..40a48232be0 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -26,10 +26,11 @@ int main(int argc, char* argv[])
     std::string extra_information = "The formats are " + FLAGS_formats +
                                     "\nThe number of right hand sides is " +
                                     std::to_string(FLAGS_nrhs);
-    print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
 
+    print_general_information(extra_information, exec);
+
     auto test_cases = json::parse(get_input_stream());
 
     run_test_cases(SpmvBenchmark<Generator>{Generator{}, split(FLAGS_formats)},
diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
index 8cb03d61ab0..b4a132b8ebd 100644
--- a/benchmark/test/reference/blas.profile.stderr
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr
index 3ed5cf3784e..ff505a3f1c9 100644
--- a/benchmark/test/reference/blas.simple.stderr
+++ b/benchmark/test/reference/blas.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr
index d1759e1021e..ed52cf42fb4 100644
--- a/benchmark/test/reference/conversion.all.stderr
+++ b/benchmark/test/reference/conversion.all.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr,ell,sellp,hybrid
diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr
index 2f9d717e268..2ad5c0a1545 100644
--- a/benchmark/test/reference/conversion.matrix.stderr
+++ b/benchmark/test/reference/conversion.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index 7b543f87314..561680cc885 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr
index 76c52df1d56..23a27c4372a 100644
--- a/benchmark/test/reference/conversion.simple.stderr
+++ b/benchmark/test/reference/conversion.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr
index 8fa38bfb7ed..dddd27e145a 100644
--- a/benchmark/test/reference/distributed_solver.matrix.stderr
+++ b/benchmark/test/reference/distributed_solver.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index 907ff8a9c98..cf5006ab785 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr
index 952be67e93c..9d4b1f7094e 100644
--- a/benchmark/test/reference/distributed_solver.simple.stderr
+++ b/benchmark/test/reference/distributed_solver.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr
index 39fc91b3fed..10de82cae01 100644
--- a/benchmark/test/reference/multi_vector_distributed.profile.stderr
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr
index 3ed5cf3784e..ff505a3f1c9 100644
--- a/benchmark/test/reference/multi_vector_distributed.simple.stderr
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr
index 12af18b503e..ad79bbffc34 100644
--- a/benchmark/test/reference/preconditioner.matrix.stderr
+++ b/benchmark/test/reference/preconditioner.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
diff --git a/benchmark/test/reference/preconditioner.precond.stderr b/benchmark/test/reference/preconditioner.precond.stderr
index 52c54ffdd65..49bb9820f76 100644
--- a/benchmark/test/reference/preconditioner.precond.stderr
+++ b/benchmark/test/reference/preconditioner.precond.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: jacobi
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index 56cfcc39c89..34cf27acbc6 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
diff --git a/benchmark/test/reference/preconditioner.reordered.stderr b/benchmark/test/reference/preconditioner.reordered.stderr
index e26d2a7b0dd..d36bc663e57 100644
--- a/benchmark/test/reference/preconditioner.reordered.stderr
+++ b/benchmark/test/reference/preconditioner.reordered.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr
index e26d2a7b0dd..d36bc663e57 100644
--- a/benchmark/test/reference/preconditioner.simple.stderr
+++ b/benchmark/test/reference/preconditioner.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr
index 8fa38bfb7ed..dddd27e145a 100644
--- a/benchmark/test/reference/solver.matrix.stderr
+++ b/benchmark/test/reference/solver.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 70bfe336298..f70cf743888 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/solver.reordered.stderr b/benchmark/test/reference/solver.reordered.stderr
index fa61d6c4050..6baa84ee792 100644
--- a/benchmark/test/reference/solver.reordered.stderr
+++ b/benchmark/test/reference/solver.reordered.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr
index fa61d6c4050..6baa84ee792 100644
--- a/benchmark/test/reference/solver.simple.stderr
+++ b/benchmark/test/reference/solver.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr
index 1702804ddb9..e8f92862042 100644
--- a/benchmark/test/reference/sparse_blas.matrix.stderr
+++ b/benchmark/test/reference/sparse_blas.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index 23af617c55b..cfb7998cfb6 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
diff --git a/benchmark/test/reference/sparse_blas.reordered.stderr b/benchmark/test/reference/sparse_blas.reordered.stderr
index c7259f7e4ea..874e6c4a7f1 100644
--- a/benchmark/test/reference/sparse_blas.reordered.stderr
+++ b/benchmark/test/reference/sparse_blas.reordered.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are symbolic_cholesky
diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr
index 0e8b2dfeb66..d4e29cd9cd7 100644
--- a/benchmark/test/reference/sparse_blas.simple.stderr
+++ b/benchmark/test/reference/sparse_blas.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr
index b3d2b47dffb..7896bb14728 100644
--- a/benchmark/test/reference/spmv.matrix.stderr
+++ b/benchmark/test/reference/spmv.matrix.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 16d23370d16..4861b217d44 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo
diff --git a/benchmark/test/reference/spmv.reordered.stderr b/benchmark/test/reference/spmv.reordered.stderr
index 555459ca70c..a1f6a62e866 100644
--- a/benchmark/test/reference/spmv.reordered.stderr
+++ b/benchmark/test/reference/spmv.reordered.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr
index 555459ca70c..a1f6a62e866 100644
--- a/benchmark/test/reference/spmv.simple.stderr
+++ b/benchmark/test/reference/spmv.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
index d3a645aa0f7..a671e0d660f 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stderr
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are [csr]x[csr]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr
index fc3409bded7..b3739ed8774 100644
--- a/benchmark/test/reference/spmv_distributed.simple.stderr
+++ b/benchmark/test/reference/spmv_distributed.simple.stderr
@@ -1,4 +1,4 @@
-Running on reference(0)
+Running on ReferenceExecutor
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are [csr]x[csr]
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 5ae34fa00ab..58b5410478d 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -168,11 +168,11 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header,
  *
  * @param extra  describes benchmark specific extra parameters to output
  */
-void print_general_information(const std::string& extra)
+void print_general_information(const std::string& extra,
+                               std::shared_ptr<const gko::Executor> exec)
 {
     std::clog << gko::version_info::get() << std::endl
-              << "Running on " << FLAGS_executor << "(" << FLAGS_device_id
-              << ")\n"
+              << "Running on " << exec->get_description() << std::endl
               << "Running with " << FLAGS_warmup << " warm iterations and ";
     if (FLAGS_repetitions == "auto") {
         std::clog << "adaptively determined repetititions with "

From 818a31258e583eb7ec815dc818241662e9b82eff Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 19 May 2024 01:44:42 +0200
Subject: [PATCH 030/448] use Executor::get_description() in tests

---
 test/base/batch_multi_vector_kernels.cpp      |  2 +-
 test/base/device_matrix_data_kernels.cpp      |  2 +-
 test/base/executor.cpp                        |  1 +
 test/base/index_range.cpp                     |  2 +-
 test/base/kernel_launch_generic.cpp           |  2 +-
 test/base/timer.cpp                           |  2 +-
 test/components/absolute_array_kernels.cpp    |  2 +-
 test/components/fill_array_kernels.cpp        |  2 +-
 test/components/format_conversion_kernels.cpp |  2 +-
 .../precision_conversion_kernels.cpp          |  2 +-
 test/components/prefix_sum_kernels.cpp        |  2 +-
 test/components/reduce_array_kernels.cpp      |  2 +-
 test/distributed/matrix_kernels.cpp           |  2 +-
 test/distributed/partition_helper_kernels.cpp |  2 +-
 test/distributed/partition_kernels.cpp        |  2 +-
 test/distributed/vector_kernels.cpp           |  2 +-
 test/factorization/cholesky_kernels.cpp       |  2 +-
 test/factorization/ic_kernels.cpp             |  2 +-
 test/factorization/ilu_kernels.cpp            |  2 +-
 test/factorization/lu_kernels.cpp             |  2 +-
 test/factorization/par_ic_kernels.cpp         |  2 +-
 test/factorization/par_ict_kernels.cpp        |  2 +-
 test/factorization/par_ilu_kernels.cpp        |  2 +-
 test/factorization/par_ilut_kernels.cpp       |  2 +-
 test/log/profiler_hook.cpp                    |  2 +-
 test/matrix/batch_csr_kernels.cpp             |  2 +-
 test/matrix/batch_dense_kernels.cpp           |  2 +-
 test/matrix/batch_ell_kernels.cpp             |  2 +-
 test/matrix/coo_kernels.cpp                   |  2 +-
 test/matrix/csr_kernels.cpp                   |  2 +-
 test/matrix/csr_kernels2.cpp                  |  2 +-
 test/matrix/dense_kernels.cpp                 |  2 +-
 test/matrix/diagonal_kernels.cpp              |  2 +-
 test/matrix/ell_kernels.cpp                   |  2 +-
 test/matrix/fbcsr_kernels.cpp                 |  2 +-
 test/matrix/fft_kernels.cpp                   |  2 +-
 test/matrix/hybrid_kernels.cpp                |  2 +-
 test/matrix/matrix.cpp                        |  2 +-
 test/matrix/permutation_kernels.cpp           |  2 +-
 test/matrix/scaled_permutation_kernels.cpp    |  2 +-
 test/matrix/sellp_kernels.cpp                 |  2 +-
 test/matrix/sparsity_csr_kernels.cpp          |  2 +-
 test/mpi/matrix.cpp                           |  2 +-
 test/mpi/multigrid/pgm.cpp                    |  2 +-
 test/mpi/partition_helpers.cpp                |  2 +-
 test/mpi/preconditioner/schwarz.cpp           |  2 +-
 test/mpi/solver/solver.cpp                    |  2 +-
 test/mpi/vector.cpp                           |  2 +-
 test/multigrid/fixed_coarsening_kernels.cpp   |  2 +-
 test/multigrid/pgm_kernels.cpp                |  2 +-
 test/preconditioner/batch_jacobi_kernels.cpp  |  2 +-
 test/preconditioner/isai_kernels.cpp          |  2 +-
 test/preconditioner/jacobi_kernels.cpp        |  2 +-
 test/reorder/amd.cpp                          |  2 +-
 test/reorder/mc64.cpp                         |  2 +-
 test/reorder/nested_dissection.cpp            |  2 +-
 test/reorder/rcm.cpp                          |  2 +-
 test/solver/batch_bicgstab_kernels.cpp        |  2 +-
 test/solver/batch_cg_kernels.cpp              |  2 +-
 test/solver/bicg_kernels.cpp                  |  2 +-
 test/solver/bicgstab_kernels.cpp              |  2 +-
 test/solver/cb_gmres_kernels.cpp              |  2 +-
 test/solver/cg_kernels.cpp                    |  2 +-
 test/solver/cgs_kernels.cpp                   |  2 +-
 test/solver/direct.cpp                        |  2 +-
 test/solver/fcg_kernels.cpp                   |  2 +-
 test/solver/gcr_kernels.cpp                   |  2 +-
 test/solver/gmres_kernels.cpp                 |  2 +-
 test/solver/idr_kernels.cpp                   |  2 +-
 test/solver/ir_kernels.cpp                    |  2 +-
 test/solver/lower_trs_kernels.cpp             |  2 +-
 test/solver/multigrid_kernels.cpp             |  2 +-
 test/solver/solver.cpp                        |  2 +-
 test/solver/upper_trs_kernels.cpp             |  2 +-
 test/stop/combined_kernels.cpp                |  2 +-
 test/stop/criterion_kernels.cpp               |  2 +-
 test/stop/residual_norm_kernels.cpp           |  2 +-
 test/utils/common_fixture.hpp                 | 82 +++++++++++++++++++
 test/utils/executor.hpp                       | 58 -------------
 .../mpi/{executor.hpp => common_fixture.hpp}  |  6 +-
 80 files changed, 162 insertions(+), 137 deletions(-)
 create mode 100644 test/utils/common_fixture.hpp
 rename test/utils/mpi/{executor.hpp => common_fixture.hpp} (90%)

diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp
index d15e6d2165f..6ce391c92cb 100644
--- a/test/base/batch_multi_vector_kernels.cpp
+++ b/test/base/batch_multi_vector_kernels.cpp
@@ -16,7 +16,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class MultiVector : public CommonTestFixture {
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index 59c9ec209c3..ffadbcfb245 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -16,7 +16,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/base/executor.cpp b/test/base/executor.cpp
index 3b93d7e748a..8a344eb224d 100644
--- a/test/base/executor.cpp
+++ b/test/base/executor.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 #include "core/test/utils/assertions.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace reference {
diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp
index 8bb5519c457..0a344a63d9a 100644
--- a/test/base/index_range.cpp
+++ b/test/base/index_range.cpp
@@ -12,7 +12,7 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class IndexRange : public CommonTestFixture {
diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp
index 8107e6a3eef..0d187b07bdf 100644
--- a/test/base/kernel_launch_generic.cpp
+++ b/test/base/kernel_launch_generic.cpp
@@ -18,7 +18,7 @@
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/base/array_access.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using gko::dim;
diff --git a/test/base/timer.cpp b/test/base/timer.cpp
index f2f0da113bf..2463f508450 100644
--- a/test/base/timer.cpp
+++ b/test/base/timer.cpp
@@ -10,7 +10,7 @@
 #include <ginkgo/core/base/timer.hpp>
 
 #include "core/test/utils/assertions.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Timer : public CommonTestFixture {
diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp
index a18ab1534c9..3a4a2d787aa 100644
--- a/test/components/absolute_array_kernels.cpp
+++ b/test/components/absolute_array_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class AbsoluteArray : public CommonTestFixture {
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 122edb4dc27..4756180f896 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename T>
diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp
index 3e783206af5..217ecd22600 100644
--- a/test/components/format_conversion_kernels.cpp
+++ b/test/components/format_conversion_kernels.cpp
@@ -11,7 +11,7 @@
 #include <gtest/gtest.h>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename IndexType>
diff --git a/test/components/precision_conversion_kernels.cpp b/test/components/precision_conversion_kernels.cpp
index dcd6a0dba83..9eb26f0a9b8 100644
--- a/test/components/precision_conversion_kernels.cpp
+++ b/test/components/precision_conversion_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 #if !(GINKGO_COMMON_SINGLE_MODE)
diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp
index 1ec97b6eadc..4a1c950855a 100644
--- a/test/components/prefix_sum_kernels.cpp
+++ b/test/components/prefix_sum_kernels.cpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename T>
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index 35c358099ad..182928412f2 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename T>
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 3dcede95bfb..ad91d699496 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using comm_index_type = gko::experimental::distributed::comm_index_type;
diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp
index 5b014625e7d..2f1c8a2002d 100644
--- a/test/distributed/partition_helper_kernels.cpp
+++ b/test/distributed/partition_helper_kernels.cpp
@@ -10,7 +10,7 @@
 #include "core/base/iterator_factory.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using gko::experimental::distributed::comm_index_type;
diff --git a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp
index b00d266170c..6634744211d 100644
--- a/test/distributed/partition_kernels.cpp
+++ b/test/distributed/partition_kernels.cpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/distributed/partition.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using comm_index_type = gko::experimental::distributed::comm_index_type;
diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp
index 294b72d861e..1246da9a116 100644
--- a/test/distributed/vector_kernels.cpp
+++ b/test/distributed/vector_kernels.cpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/base/matrix_data.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using comm_index_type = gko::experimental::distributed::comm_index_type;
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index b7c290eec17..94d31fe33db 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -24,7 +24,7 @@
 #include "core/test/utils/assertions.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace {
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index ddb38575e03..9f0b60443f2 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -16,7 +16,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Ic : public CommonTestFixture {
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index bc7edeac57f..004b0d34a4f 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -16,7 +16,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Ilu : public CommonTestFixture {
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 035e938c7c8..830ba6ddd5f 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -27,7 +27,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index 64541612343..de2342a28db 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -23,7 +23,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index b157971ff90..3b33e52630c 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -23,7 +23,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index a2f3f774ba7..88f5ecff0d9 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -22,7 +22,7 @@
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index 6426e725fdf..dff3cc702c1 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -23,7 +23,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/log/profiler_hook.cpp b/test/log/profiler_hook.cpp
index 6e0ed2933db..414477b996a 100644
--- a/test/log/profiler_hook.cpp
+++ b/test/log/profiler_hook.cpp
@@ -8,7 +8,7 @@
 
 #include <ginkgo/core/log/profiler_hook.hpp>
 
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class ProfilerHook : public CommonTestFixture {
diff --git a/test/matrix/batch_csr_kernels.cpp b/test/matrix/batch_csr_kernels.cpp
index d2a1b2d9aa4..d466885d056 100644
--- a/test/matrix/batch_csr_kernels.cpp
+++ b/test/matrix/batch_csr_kernels.cpp
@@ -19,7 +19,7 @@
 #include "core/test/utils/array_generator.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Csr : public CommonTestFixture {
diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp
index 222ccf6e4b9..17f27b8afa8 100644
--- a/test/matrix/batch_dense_kernels.cpp
+++ b/test/matrix/batch_dense_kernels.cpp
@@ -19,7 +19,7 @@
 #include "core/test/utils/array_generator.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Dense : public CommonTestFixture {
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
index 7edef2c4fb0..77c4dae13b0 100644
--- a/test/matrix/batch_ell_kernels.cpp
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -18,7 +18,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Ell : public CommonTestFixture {
diff --git a/test/matrix/coo_kernels.cpp b/test/matrix/coo_kernels.cpp
index 3da488cf843..091f95544e6 100644
--- a/test/matrix/coo_kernels.cpp
+++ b/test/matrix/coo_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Coo : public CommonTestFixture {
diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp
index 1a1f100e1fd..ec726d856c8 100644
--- a/test/matrix/csr_kernels.cpp
+++ b/test/matrix/csr_kernels.cpp
@@ -18,7 +18,7 @@
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Csr : public CommonTestFixture {
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 9272e99546e..9b3f09a13fc 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -27,7 +27,7 @@
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Csr : public CommonTestFixture {
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index b8fd4d7900c..76e6487aa89 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -26,7 +26,7 @@
 
 #include "core/components/fill_array_kernels.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Dense : public CommonTestFixture {
diff --git a/test/matrix/diagonal_kernels.cpp b/test/matrix/diagonal_kernels.cpp
index ca0a9eff205..3d2f505a19f 100644
--- a/test/matrix/diagonal_kernels.cpp
+++ b/test/matrix/diagonal_kernels.cpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Diagonal : public CommonTestFixture {
diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp
index 78af81ccafc..9900caa10c8 100644
--- a/test/matrix/ell_kernels.cpp
+++ b/test/matrix/ell_kernels.cpp
@@ -17,7 +17,7 @@
 #include <ginkgo/core/matrix/ell.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Ell : public CommonTestFixture {
diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp
index a3b85143bf0..8cff04c28a0 100644
--- a/test/matrix/fbcsr_kernels.cpp
+++ b/test/matrix/fbcsr_kernels.cpp
@@ -14,7 +14,7 @@
 #include "core/test/matrix/fbcsr_sample.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/fb_matrix_generator.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename T>
diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp
index 056087fb9f3..5b2c33085e3 100644
--- a/test/matrix/fft_kernels.cpp
+++ b/test/matrix/fft_kernels.cpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/matrix/fft.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueType>
diff --git a/test/matrix/hybrid_kernels.cpp b/test/matrix/hybrid_kernels.cpp
index 64179259deb..7028a14bd96 100644
--- a/test/matrix/hybrid_kernels.cpp
+++ b/test/matrix/hybrid_kernels.cpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Hybrid : public CommonTestFixture {
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index 7398b3edb06..eea1a67ef5f 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -22,7 +22,7 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 #if GINKGO_COMMON_SINGLE_MODE
diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp
index e6324c15f1d..3e2a97c02bd 100644
--- a/test/matrix/permutation_kernels.cpp
+++ b/test/matrix/permutation_kernels.cpp
@@ -11,7 +11,7 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Permutation : public CommonTestFixture {
diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp
index 7239862a8d9..545c7fd064d 100644
--- a/test/matrix/scaled_permutation_kernels.cpp
+++ b/test/matrix/scaled_permutation_kernels.cpp
@@ -10,7 +10,7 @@
 #include <ginkgo/core/matrix/scaled_permutation.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class ScaledPermutation : public CommonTestFixture {
diff --git a/test/matrix/sellp_kernels.cpp b/test/matrix/sellp_kernels.cpp
index 053369f7fa6..549277b40a3 100644
--- a/test/matrix/sellp_kernels.cpp
+++ b/test/matrix/sellp_kernels.cpp
@@ -16,7 +16,7 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Sellp : public CommonTestFixture {
diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp
index 8d3728f240d..75af81874e3 100644
--- a/test/matrix/sparsity_csr_kernels.cpp
+++ b/test/matrix/sparsity_csr_kernels.cpp
@@ -19,7 +19,7 @@
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace {
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 8a201c78733..cc9ec219a88 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -20,7 +20,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 #ifndef GKO_COMPILING_DPCPP
diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp
index ccd7dd46b44..664ad0cd4ec 100644
--- a/test/mpi/multigrid/pgm.cpp
+++ b/test/mpi/multigrid/pgm.cpp
@@ -19,7 +19,7 @@
 #include <ginkgo/core/multigrid/pgm.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 #if GINKGO_DPCPP_SINGLE_MODE
diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp
index b89295acc13..43b4783d896 100644
--- a/test/mpi/partition_helpers.cpp
+++ b/test/mpi/partition_helpers.cpp
@@ -6,7 +6,7 @@
 #include <ginkgo/core/distributed/partition_helpers.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 using comm_index_type = gko::experimental::distributed::comm_index_type;
diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp
index cf29ea77016..6717cd9d888 100644
--- a/test/mpi/preconditioner/schwarz.cpp
+++ b/test/mpi/preconditioner/schwarz.cpp
@@ -29,7 +29,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 #if GINKGO_DPCPP_SINGLE_MODE
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index 4548dc5d6b7..589be91bcba 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -31,7 +31,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 #if GINKGO_DPCPP_SINGLE_MODE
diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp
index 3af6886dd84..cedd483b0a2 100644
--- a/test/mpi/vector.cpp
+++ b/test/mpi/vector.cpp
@@ -17,7 +17,7 @@
 #include <ginkgo/core/log/logger.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/mpi/executor.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
 
 
 bool needs_transfers(std::shared_ptr<const gko::Executor> exec)
diff --git a/test/multigrid/fixed_coarsening_kernels.cpp b/test/multigrid/fixed_coarsening_kernels.cpp
index 0f3c7e56b2a..91c1e021a76 100644
--- a/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/test/multigrid/fixed_coarsening_kernels.cpp
@@ -26,7 +26,7 @@
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class FixedCoarsening : public CommonTestFixture {
diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp
index b0e3b338cbd..cdbfb5295f2 100644
--- a/test/multigrid/pgm_kernels.cpp
+++ b/test/multigrid/pgm_kernels.cpp
@@ -25,7 +25,7 @@
 #include "core/test/utils/matrix_generator.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Pgm : public CommonTestFixture {
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index 9bdbb015949..62e309361c9 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -22,7 +22,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace detail {
diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp
index 077379ab226..8ac1ad1e8ba 100644
--- a/test/preconditioner/isai_kernels.cpp
+++ b/test/preconditioner/isai_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 enum struct matrix_type { lower, upper, general, spd };
diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp
index 5ae7c56e715..23347d8d896 100644
--- a/test/preconditioner/jacobi_kernels.cpp
+++ b/test/preconditioner/jacobi_kernels.cpp
@@ -13,7 +13,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Jacobi : public CommonTestFixture {
diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp
index 8137ed8ad7e..a1ca7c09359 100644
--- a/test/reorder/amd.cpp
+++ b/test/reorder/amd.cpp
@@ -15,7 +15,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename ValueIndexType>
diff --git a/test/reorder/mc64.cpp b/test/reorder/mc64.cpp
index 0cc3ea33a3d..f05b13d19c0 100644
--- a/test/reorder/mc64.cpp
+++ b/test/reorder/mc64.cpp
@@ -8,7 +8,7 @@
 #include <ginkgo/core/reorder/mc64.hpp>
 
 #include "core/test/utils/assertions.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace {
diff --git a/test/reorder/nested_dissection.cpp b/test/reorder/nested_dissection.cpp
index 2d11bdccb12..d35818f28e6 100644
--- a/test/reorder/nested_dissection.cpp
+++ b/test/reorder/nested_dissection.cpp
@@ -11,7 +11,7 @@
 
 #include "core/test/utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename IndexType>
diff --git a/test/reorder/rcm.cpp b/test/reorder/rcm.cpp
index 848d0deea5d..9ae656fbc1c 100644
--- a/test/reorder/rcm.cpp
+++ b/test/reorder/rcm.cpp
@@ -19,7 +19,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Rcm : public CommonTestFixture {
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index 8f4bfca00cc..1a852eacfe9 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -20,7 +20,7 @@
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class BatchBicgstab : public CommonTestFixture {
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 7b5a85a1e5b..4c6de9004c9 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -19,7 +19,7 @@
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class BatchCg : public CommonTestFixture {
diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp
index 5f9dd818711..3f3b6a01ae1 100644
--- a/test/solver/bicg_kernels.cpp
+++ b/test/solver/bicg_kernels.cpp
@@ -19,7 +19,7 @@
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Bicg : public CommonTestFixture {
diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index 9548c99daf9..a90451a3f3a 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -19,7 +19,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Bicgstab : public CommonTestFixture {
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp
index 45a752a2292..022899d21e6 100644
--- a/test/solver/cb_gmres_kernels.cpp
+++ b/test/solver/cb_gmres_kernels.cpp
@@ -20,7 +20,7 @@
 
 #include "core/solver/cb_gmres_accessor.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class CbGmres : public CommonTestFixture {
diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp
index b4408851da6..13e6905fa81 100644
--- a/test/solver/cg_kernels.cpp
+++ b/test/solver/cg_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Cg : public CommonTestFixture {
diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp
index 392167d2106..f952e68170e 100644
--- a/test/solver/cgs_kernels.cpp
+++ b/test/solver/cgs_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Cgs : public CommonTestFixture {
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index c2e6c757f76..a58d3d46f3f 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -23,7 +23,7 @@
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "matrices/config.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 namespace {
diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp
index 9ad2be9eb05..194151f203e 100644
--- a/test/solver/fcg_kernels.cpp
+++ b/test/solver/fcg_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Fcg : public CommonTestFixture {
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index d26b5ef265c..5a46bbbb940 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -19,7 +19,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Gcr : public CommonTestFixture {
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index 52ee885e29d..a6c74bd45c0 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -19,7 +19,7 @@
 
 #include "core/solver/common_gmres_kernels.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Gmres : public CommonTestFixture {
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index 7afac1c2f33..a9857952615 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -27,7 +27,7 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 // use another alias to avoid conflict name in the Idr
diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp
index 114dee3c06b..31973e849b1 100644
--- a/test/solver/ir_kernels.cpp
+++ b/test/solver/ir_kernels.cpp
@@ -17,7 +17,7 @@
 #include <ginkgo/core/stop/iteration.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Ir : public CommonTestFixture {
diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp
index 4bccf283faf..b838c1df14b 100644
--- a/test/solver/lower_trs_kernels.cpp
+++ b/test/solver/lower_trs_kernels.cpp
@@ -15,7 +15,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class LowerTrs : public CommonTestFixture {
diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp
index 894f4280346..2efb7cf8158 100644
--- a/test/solver/multigrid_kernels.cpp
+++ b/test/solver/multigrid_kernels.cpp
@@ -16,7 +16,7 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class Multigrid : public CommonTestFixture {
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index 5b24234ce14..47414f83041 100644
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -31,7 +31,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 #if GINKGO_COMMON_SINGLE_MODE
diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp
index c7041865dd1..6825d9f6c3b 100644
--- a/test/solver/upper_trs_kernels.cpp
+++ b/test/solver/upper_trs_kernels.cpp
@@ -15,7 +15,7 @@
 
 #include "core/test/utils.hpp"
 #include "core/utils/matrix_utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class UpperTrs : public CommonTestFixture {
diff --git a/test/stop/combined_kernels.cpp b/test/stop/combined_kernels.cpp
index 7e18a0c32aa..96cf8656c33 100644
--- a/test/stop/combined_kernels.cpp
+++ b/test/stop/combined_kernels.cpp
@@ -7,7 +7,7 @@
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 constexpr gko::size_type test_iterations = 10;
diff --git a/test/stop/criterion_kernels.cpp b/test/stop/criterion_kernels.cpp
index 6b6094125ba..30280e848d8 100644
--- a/test/stop/criterion_kernels.cpp
+++ b/test/stop/criterion_kernels.cpp
@@ -7,7 +7,7 @@
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 constexpr gko::size_type test_iterations = 10;
diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp
index 7c3ddf6624e..a0a144bcf3b 100644
--- a/test/stop/residual_norm_kernels.cpp
+++ b/test/stop/residual_norm_kernels.cpp
@@ -8,7 +8,7 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 template <typename Mtx>
diff --git a/test/utils/common_fixture.hpp b/test/utils/common_fixture.hpp
new file mode 100644
index 00000000000..7d4883470e7
--- /dev/null
+++ b/test/utils/common_fixture.hpp
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_TEST_UTILS_COMMON_FIXTURE_HPP_
+#define GKO_TEST_UTILS_COMMON_FIXTURE_HPP_
+
+
+#include <memory>
+#include <stdexcept>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include "core/test/gtest/resources.hpp"
+#include "test/utils/executor.hpp"
+
+
+#if GINKGO_COMMON_SINGLE_MODE
+#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode"
+#else  // expands to a no-op declaration that still accepts a trailing ';'
+#define SKIP_IF_SINGLE_MODE                                                  \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
+
+
+class CommonTestFixture : public ::testing::Test {
+public:
+#if GINKGO_COMMON_SINGLE_MODE
+    using value_type = float;
+#else
+    using value_type = double;
+#endif
+    using index_type = int;
+
+    CommonTestFixture()
+        :
+#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA)
+          stream(ResourceEnvironment::cuda_device_id),
+#endif
+#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP)
+          stream(ResourceEnvironment::hip_device_id),
+#endif
+          ref{gko::ReferenceExecutor::create()}
+    {
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
+        init_executor(ref, exec, stream.get());
+#else  // no stream member exists outside CUDA/HIP builds
+        init_executor(ref, exec);
+#endif
+        // set the device id test-wide, since some tests call device kernels
+        // directly; the guard is stored as a member rather than a local
+        guard = exec->get_scoped_device_id_guard();
+    }
+
+    void TearDown() final
+    {
+        if (exec != nullptr) {
+            ASSERT_NO_THROW(exec->synchronize());
+        }
+    }
+
+#ifdef GKO_COMPILING_CUDA
+    gko::cuda_stream stream;
+#endif
+#ifdef GKO_COMPILING_HIP
+    gko::hip_stream stream;
+#endif
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::EXEC_TYPE> exec;  // filled in by init_executor()
+    gko::scoped_device_id_guard guard;
+};
+
+
+#endif  // GKO_TEST_UTILS_COMMON_FIXTURE_HPP_
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index b31d1242f35..9c63d514cb4 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -17,16 +17,6 @@
 #include "core/test/gtest/resources.hpp"
 
 
-#if GINKGO_COMMON_SINGLE_MODE
-#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode"
-#else
-#define SKIP_IF_SINGLE_MODE                                                  \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-#endif
-
-
 inline void init_executor(std::shared_ptr<gko::ReferenceExecutor>,
                           std::shared_ptr<gko::ReferenceExecutor>& exec)
 {
@@ -83,52 +73,4 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
 }
 
 
-class CommonTestFixture : public ::testing::Test {
-public:
-#if GINKGO_COMMON_SINGLE_MODE
-    using value_type = float;
-#else
-    using value_type = double;
-#endif
-    using index_type = int;
-
-    CommonTestFixture()
-        :
-#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA)
-          stream(ResourceEnvironment::cuda_device_id),
-#endif
-#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP)
-          stream(ResourceEnvironment::hip_device_id),
-#endif
-          ref{gko::ReferenceExecutor::create()}
-    {
-#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
-        init_executor(ref, exec, stream.get());
-#else
-        init_executor(ref, exec);
-#endif
-        // set device-id test-wide since some test call device
-        // kernels directly
-        guard = exec->get_scoped_device_id_guard();
-    }
-
-    void TearDown() final
-    {
-        if (exec != nullptr) {
-            ASSERT_NO_THROW(exec->synchronize());
-        }
-    }
-
-#ifdef GKO_COMPILING_CUDA
-    gko::cuda_stream stream;
-#endif
-#ifdef GKO_COMPILING_HIP
-    gko::hip_stream stream;
-#endif
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<gko::EXEC_TYPE> exec;
-    gko::scoped_device_id_guard guard;
-};
-
-
 #endif  // GKO_TEST_UTILS_EXECUTOR_HPP_
diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/common_fixture.hpp
similarity index 90%
rename from test/utils/mpi/executor.hpp
rename to test/utils/mpi/common_fixture.hpp
index 199de02c054..67936bba2b6 100644
--- a/test/utils/mpi/executor.hpp
+++ b/test/utils/mpi/common_fixture.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_TEST_UTILS_MPI_EXECUTOR_HPP_
-#define GKO_TEST_UTILS_MPI_EXECUTOR_HPP_
+#ifndef GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_
+#define GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_
 
 
 #include <memory>
@@ -64,4 +64,4 @@ class CommonMpiTestFixture : public ::testing::Test {
 };
 
 
-#endif  // GKO_TEST_UTILS_MPI_EXECUTOR_HPP_
+#endif  // GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_

From 55ad08b3ad99371474ce0c50a7a9e1617cb93808 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 28 Jun 2024 13:01:31 +0200
Subject: [PATCH 031/448] fix format

---
 test/utils/common_fixture.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/utils/common_fixture.hpp b/test/utils/common_fixture.hpp
index 7d4883470e7..55107bc88e9 100644
--- a/test/utils/common_fixture.hpp
+++ b/test/utils/common_fixture.hpp
@@ -9,14 +9,11 @@
 #include <memory>
 #include <stdexcept>
 
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "core/test/gtest/resources.hpp"
 #include "test/utils/executor.hpp"
 

From 2a55ebc065bb9780117d29ae07c00ea662f28d43 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 28 Jun 2024 15:19:28 +0200
Subject: [PATCH 032/448] fix include

---
 test/distributed/index_map_kernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp
index 718fe84ce92..4fb6f111123 100644
--- a/test/distributed/index_map_kernels.cpp
+++ b/test/distributed/index_map_kernels.cpp
@@ -18,7 +18,7 @@
 
 #include "core/distributed/partition_kernels.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 using comm_index_type = gko::experimental::distributed::comm_index_type;

From 5d717e26dcaa95a73f4598de6f4f14778c765680 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Wed, 12 Jun 2024 09:49:12 +0200
Subject: [PATCH 033/448] fixup! mean computation

---
 common/unified/matrix/dense_kernels.template.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp
index f5b3cc03059..155efc94db0 100644
--- a/common/unified/matrix/dense_kernels.template.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -261,7 +261,7 @@ void compute_mean(std::shared_ptr<const DefaultExecutor> exec,
             return x(i, j) * inv_total_size;
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        tmp, x, ValueType_nc{1.} / x->get_size()[0]);
+        tmp, x, ValueType_nc{1.} / std::max(1ul, x->get_size()[0]));
 }
 
 

From cff75df8f7821484354fd32e95d14f7a8caa92ef Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Mon, 3 Jun 2024 13:50:02 +0200
Subject: [PATCH 034/448] add results rows checks + early return

---
 reference/matrix/dense_kernels.cpp      | 4 ++++
 reference/test/matrix/dense_kernels.cpp | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 53773a131fe..06bea588d50 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -371,11 +371,15 @@ void compute_mean(std::shared_ptr<const ReferenceExecutor> exec,
                   const matrix::Dense<ValueType>* x,
                   matrix::Dense<ValueType>* result, array<char>&)
 {
+    GKO_ASSERT_EQ(result->get_size()[0], 1);
+
     using ValueType_nc = gko::remove_complex<ValueType>;
     for (size_type j = 0; j < x->get_size()[1]; ++j) {
         result->at(0, j) = zero<ValueType>();
     }
 
+    if (x->get_size()[0] == 0) return;
+
     for (size_type i = 0; i < x->get_size()[1]; ++i) {
         for (size_type j = 0; j < x->get_size()[0]; ++j) {
             result->at(0, i) += x->at(j, i);
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 41294c89d49..e7f95abf4f2 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -691,6 +691,14 @@ TYPED_TEST(Dense, ComputesMean)
     GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r<T>::value * 10);
 }
 
+TYPED_TEST(Dense, ComputesMeanFailsOnZeroRowResults)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto result = Mtx::create(this->exec, gko::dim<2>{0, 1});
+
+    ASSERT_THROW(this->mtx4->compute_mean(result), gko::ValueMismatch);
+}
 
 TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize)
 {

From 2bbc14258133691c3c35ac52e4ca9ace22b281d1 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Thu, 27 Jun 2024 09:12:06 +0200
Subject: [PATCH 035/448] Update
 common/unified/matrix/dense_kernels.template.cpp

Co-authored-by: Tobias Ribizel <mail@ribizel.de>
---
 common/unified/matrix/dense_kernels.template.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp
index 155efc94db0..f469bd997aa 100644
--- a/common/unified/matrix/dense_kernels.template.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -261,7 +261,7 @@ void compute_mean(std::shared_ptr<const DefaultExecutor> exec,
             return x(i, j) * inv_total_size;
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        tmp, x, ValueType_nc{1.} / std::max(1ul, x->get_size()[0]));
+        tmp, x, ValueType_nc{1.} / std::max<size_type>(1, x->get_size()[0]));
 }
 
 

From 6aa2fe519dde843cbd93c40d9a7f12b81df6af2c Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Thu, 27 Jun 2024 09:58:45 +0200
Subject: [PATCH 036/448] Apply review comments

    Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
    Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 reference/matrix/dense_kernels.cpp      | 2 --
 reference/test/matrix/dense_kernels.cpp | 8 --------
 2 files changed, 10 deletions(-)

diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 06bea588d50..40c3c40a3ae 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -371,8 +371,6 @@ void compute_mean(std::shared_ptr<const ReferenceExecutor> exec,
                   const matrix::Dense<ValueType>* x,
                   matrix::Dense<ValueType>* result, array<char>&)
 {
-    GKO_ASSERT_EQ(result->get_size()[0], 1);
-
     using ValueType_nc = gko::remove_complex<ValueType>;
     for (size_type j = 0; j < x->get_size()[1]; ++j) {
         result->at(0, j) = zero<ValueType>();
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index e7f95abf4f2..41294c89d49 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -691,14 +691,6 @@ TYPED_TEST(Dense, ComputesMean)
     GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r<T>::value * 10);
 }
 
-TYPED_TEST(Dense, ComputesMeanFailsOnZeroRowResults)
-{
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto result = Mtx::create(this->exec, gko::dim<2>{0, 1});
-
-    ASSERT_THROW(this->mtx4->compute_mean(result), gko::ValueMismatch);
-}
 
 TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize)
 {

From c123083b51e271ee3555494c9335d2e27e3183c3 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Tue, 2 Jul 2024 14:18:52 +0200
Subject: [PATCH 037/448] update citation file

Co-authored-by: Terry Cojean <terry.cojean@kit.edu>
Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 CITATION.cff | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index d3efc13e771..34accbe7c71 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -26,11 +26,12 @@ authors:
 - family-names: "Riemer"
   given-names: "Lukas"
 - family-names: "Tsai"
-  given-names: "Yuhsiang"
+  given-names: "Yu-Hsiang"
 title: "Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing"
-version: 1.5.0
-date-released: 2022-11-12
+version: 1.8.0
+date-released: 2024-06-13
 url: "https://github.com/ginkgo-project/ginkgo"
+license: BSD-3-Clause
 preferred-citation:
   type: article
   authors:

From 6a3283519b2c967683eb09a83d39e58cf5685b27 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 14:35:37 +0200
Subject: [PATCH 038/448] run script

---
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  52 ++
 ...hpp.inc => device_matrix_data_kernels.cpp} |  26 +
 ...ernel_launch.hpp.inc => kernel_launch.hpp} |  51 ++
 ...on.hpp.inc => kernel_launch_reduction.hpp} |  23 +
 ...olver.hpp.inc => kernel_launch_solver.hpp} |  19 +
 .../components/{atomic.hpp.inc => atomic.hpp} |  23 +
 ...pp.inc => diagonal_block_manipulation.hpp} |  26 +
 .../{intrinsics.hpp.inc => intrinsics.hpp}    |  20 +
 .../{merging.hpp.inc => merging.hpp}          |  23 +
 .../{prefix_sum.hpp.inc => prefix_sum.hpp}    |  25 +
 ...kernels.hpp.inc => prefix_sum_kernels.cpp} |  25 +
 .../{reduction.hpp.inc => reduction.hpp}      |  75 +++
 .../{searching.hpp.inc => searching.hpp}      |  21 +
 ...{segment_scan.hpp.inc => segment_scan.hpp} |  20 +
 .../{sorting.hpp.inc => sorting.hpp}          |  21 +
 .../{syncfree.hpp.inc => syncfree.hpp}        |  26 +
 .../{thread_ids.hpp.inc => thread_ids.hpp}    |  22 +
 .../{warp_blas.hpp.inc => warp_blas.hpp}      |  26 +
 ...rix_kernels.hpp.inc => matrix_kernels.cpp} |  32 ++
 ....hpp.inc => partition_helpers_kernels.cpp} |  22 +
 ..._kernels.hpp.inc => partition_kernels.cpp} |  26 +
 ...tor_kernels.hpp.inc => vector_kernels.cpp} |  26 +
 ...y_kernels.hpp.inc => cholesky_kernels.cpp} | 106 ++++
 ...nels.hpp.inc => factorization_kernels.cpp} |  36 ++
 .../{lu_kernels.hpp.inc => lu_kernels.cpp}    |  41 ++
 ..._ic_kernels.hpp.inc => par_ic_kernels.cpp} |  32 ++
 ...lu_kernels.hpp.inc => par_ilu_kernels.cpp} |  31 ++
 ...{batch_logger.hpp.inc => batch_logger.hpp} |  22 +
 ..._kernels.hpp.inc => batch_csr_kernels.cpp} |  51 ++
 ...ernels.hpp.inc => batch_dense_kernels.cpp} |  52 ++
 ..._kernels.hpp.inc => batch_ell_kernels.cpp} |  51 ++
 .../{coo_kernels.hpp.inc => coo_kernels.cpp}  |  42 ++
 ...ense_kernels.hpp.inc => dense_kernels.cpp} | 225 ++++++++
 ...l_kernels.hpp.inc => diagonal_kernels.cpp} |  32 ++
 .../cuda_hip/matrix/ell_kernels.cpp           | 148 ++++-
 common/cuda_hip/matrix/ell_kernels.hpp.inc    | 133 -----
 ...bcsr_kernels.hpp.inc => fbcsr_kernels.cpp} | 295 ++++++++++
 ...ellp_kernels.hpp.inc => sellp_kernels.cpp} |  37 ++
 .../cuda_hip/matrix/sparsity_csr_kernels.cpp  | 131 ++++-
 .../matrix/sparsity_csr_kernels.hpp.inc       | 111 ----
 .../{pgm_kernels.hpp.inc => pgm_kernels.cpp}  |  34 ++
 ...{isai_kernels.hpp.inc => isai_kernels.cpp} |  42 ++
 ...obi_kernels.hpp.inc => jacobi_kernels.cpp} |  44 ++
 .../{rcm_kernels.hpp.inc => rcm_kernels.cpp}  |  46 ++
 ...s_kernels.hpp.inc => cb_gmres_kernels.cpp} | 499 +++++++++++++++++
 .../cuda_hip/solver/idr_kernels.cpp           | 329 +++++++++++-
 common/cuda_hip/solver/idr_kernels.hpp.inc    | 318 -----------
 ..._kernels.hpp.inc => multigrid_kernels.cpp} |  34 ++
 ...ch_criteria.hpp.inc => batch_criteria.hpp} |  21 +
 cuda/base/batch_multi_vector_kernels.cu       |  56 --
 cuda/base/device_matrix_data_kernels.cu       |  31 --
 cuda/base/kernel_launch.cuh                   |  56 --
 cuda/base/kernel_launch_reduction.cuh         |  28 -
 cuda/base/kernel_launch_solver.cuh            |  24 -
 cuda/components/atomic.cuh                    |  28 -
 .../diagonal_block_manipulation.cuh           |  31 --
 cuda/components/intrinsics.cuh                |  25 -
 cuda/components/merging.cuh                   |  28 -
 cuda/components/prefix_sum.cuh                |  30 --
 cuda/components/prefix_sum_kernels.cu         |  30 --
 cuda/components/reduction.cuh                 |  80 ---
 cuda/components/searching.cuh                 |  26 -
 cuda/components/segment_scan.cuh              |  25 -
 cuda/components/sorting.cuh                   |  26 -
 cuda/components/syncfree.cuh                  |  31 --
 cuda/components/thread_ids.cuh                |  27 -
 cuda/components/warp_blas.cuh                 |  31 --
 cuda/distributed/matrix_kernels.cu            |  37 --
 cuda/distributed/partition_helpers_kernels.cu |  27 -
 cuda/distributed/partition_kernels.cu         |  31 --
 cuda/distributed/vector_kernels.cu            |  31 --
 cuda/factorization/cholesky_kernels.cu        | 111 ----
 cuda/factorization/factorization_kernels.cu   |  41 --
 cuda/factorization/lu_kernels.cu              |  46 --
 cuda/factorization/par_ic_kernels.cu          |  37 --
 cuda/factorization/par_ilu_kernels.cu         |  36 --
 cuda/log/batch_logger.cuh                     |  27 -
 cuda/matrix/batch_csr_kernels.cu              |  55 --
 cuda/matrix/batch_dense_kernels.cu            |  56 --
 cuda/matrix/batch_ell_kernels.cu              |  55 --
 cuda/matrix/coo_kernels.cu                    |  47 --
 cuda/matrix/dense_kernels.cu                  | 230 --------
 cuda/matrix/diagonal_kernels.cu               |  37 --
 cuda/matrix/fbcsr_kernels.template.cu         | 299 -----------
 cuda/matrix/sellp_kernels.cu                  |  42 --
 cuda/matrix/sparsity_csr_kernels.cu           | 223 --------
 cuda/multigrid/pgm_kernels.cu                 |  39 --
 cuda/preconditioner/isai_kernels.cu           |  47 --
 cuda/preconditioner/jacobi_kernels.cu         |  49 --
 cuda/reorder/rcm_kernels.cu                   |  51 --
 cuda/solver/cb_gmres_kernels.cu               | 504 ------------------
 cuda/solver/multigrid_kernels.cu              |  39 --
 cuda/stop/batch_criteria.cuh                  |  26 -
 hip/base/batch_multi_vector_kernels.hip.cpp   |  56 --
 hip/base/device_matrix_data_kernels.hip.cpp   |  31 --
 hip/base/kernel_launch.hip.hpp                |  56 --
 hip/base/kernel_launch_reduction.hip.hpp      |  28 -
 hip/base/kernel_launch_solver.hip.hpp         |  24 -
 hip/components/atomic.hip.hpp                 |  28 -
 .../diagonal_block_manipulation.hip.hpp       |  31 --
 hip/components/intrinsics.hip.hpp             |  25 -
 hip/components/merging.hip.hpp                |  28 -
 hip/components/prefix_sum.hip.hpp             |  30 --
 hip/components/prefix_sum_kernels.hip.cpp     |  30 --
 hip/components/reduction.hip.hpp              |  80 ---
 hip/components/searching.hip.hpp              |  26 -
 hip/components/segment_scan.hip.hpp           |  25 -
 hip/components/sorting.hip.hpp                |  26 -
 hip/components/syncfree.hip.hpp               |  31 --
 hip/components/thread_ids.hip.hpp             |  27 -
 hip/components/warp_blas.hip.hpp              |  31 --
 hip/distributed/matrix_kernels.hip.cpp        |  37 --
 .../partition_helpers_kernels.hip.cpp         |  27 -
 hip/distributed/partition_kernels.hip.cpp     |  31 --
 hip/distributed/vector_kernels.hip.cpp        |  31 --
 hip/factorization/cholesky_kernels.hip.cpp    | 111 ----
 .../factorization_kernels.hip.cpp             |  41 --
 hip/factorization/lu_kernels.hip.cpp          |  46 --
 hip/factorization/par_ic_kernels.hip.cpp      |  37 --
 hip/factorization/par_ilu_kernels.hip.cpp     |  36 --
 hip/log/batch_logger.hip.hpp                  |  26 -
 hip/matrix/batch_csr_kernels.hip.cpp          |  55 --
 hip/matrix/batch_dense_kernels.hip.cpp        |  56 --
 hip/matrix/batch_ell_kernels.hip.cpp          |  55 --
 hip/matrix/coo_kernels.hip.cpp                |  47 --
 hip/matrix/dense_kernels.hip.cpp              | 230 --------
 hip/matrix/diagonal_kernels.hip.cpp           |  37 --
 hip/matrix/ell_kernels.hip.cpp                | 270 ----------
 hip/matrix/fbcsr_kernels.template.hip.cpp     | 299 -----------
 hip/matrix/sellp_kernels.hip.cpp              |  42 --
 hip/multigrid/pgm_kernels.hip.cpp             |  39 --
 hip/preconditioner/isai_kernels.hip.cpp       |  47 --
 hip/preconditioner/jacobi_kernels.hip.cpp     |  49 --
 hip/reorder/rcm_kernels.hip.cpp               |  51 --
 hip/solver/cb_gmres_kernels.hip.cpp           | 504 ------------------
 hip/solver/idr_kernels.hip.cpp                | 340 ------------
 hip/solver/multigrid_kernels.hip.cpp          |  39 --
 hip/stop/batch_criteria.hip.hpp               |  26 -
 138 files changed, 2981 insertions(+), 6650 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.cpp} (89%)
 rename common/cuda_hip/base/{device_matrix_data_kernels.hpp.inc => device_matrix_data_kernels.cpp} (88%)
 rename common/cuda_hip/base/{kernel_launch.hpp.inc => kernel_launch.hpp} (58%)
 rename common/cuda_hip/base/{kernel_launch_reduction.hpp.inc => kernel_launch_reduction.hpp} (97%)
 rename common/cuda_hip/base/{kernel_launch_solver.hpp.inc => kernel_launch_solver.hpp} (77%)
 rename common/cuda_hip/components/{atomic.hpp.inc => atomic.hpp} (95%)
 rename common/cuda_hip/components/{diagonal_block_manipulation.hpp.inc => diagonal_block_manipulation.hpp} (81%)
 rename common/cuda_hip/components/{intrinsics.hpp.inc => intrinsics.hpp} (74%)
 rename common/cuda_hip/components/{merging.hpp.inc => merging.hpp} (95%)
 rename common/cuda_hip/components/{prefix_sum.hpp.inc => prefix_sum.hpp} (91%)
 rename common/cuda_hip/components/{prefix_sum_kernels.hpp.inc => prefix_sum_kernels.cpp} (80%)
 rename common/cuda_hip/components/{reduction.hpp.inc => reduction.hpp} (78%)
 rename common/cuda_hip/components/{searching.hpp.inc => searching.hpp} (95%)
 rename common/cuda_hip/components/{segment_scan.hpp.inc => segment_scan.hpp} (73%)
 rename common/cuda_hip/components/{sorting.hpp.inc => sorting.hpp} (96%)
 rename common/cuda_hip/components/{syncfree.hpp.inc => syncfree.hpp} (86%)
 rename common/cuda_hip/components/{thread_ids.hpp.inc => thread_ids.hpp} (94%)
 rename common/cuda_hip/components/{warp_blas.hpp.inc => warp_blas.hpp} (97%)
 rename common/cuda_hip/distributed/{matrix_kernels.hpp.inc => matrix_kernels.cpp} (91%)
 rename common/cuda_hip/distributed/{partition_helpers_kernels.hpp.inc => partition_helpers_kernels.cpp} (70%)
 rename common/cuda_hip/distributed/{partition_kernels.hpp.inc => partition_kernels.cpp} (89%)
 rename common/cuda_hip/distributed/{vector_kernels.hpp.inc => vector_kernels.cpp} (84%)
 rename common/cuda_hip/factorization/{cholesky_kernels.hpp.inc => cholesky_kernels.cpp} (78%)
 rename common/cuda_hip/factorization/{factorization_kernels.hpp.inc => factorization_kernels.cpp} (95%)
 rename common/cuda_hip/factorization/{lu_kernels.hpp.inc => lu_kernels.cpp} (92%)
 rename common/cuda_hip/factorization/{par_ic_kernels.hpp.inc => par_ic_kernels.cpp} (84%)
 rename common/cuda_hip/factorization/{par_ilu_kernels.hpp.inc => par_ilu_kernels.cpp} (84%)
 rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (67%)
 rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.cpp} (87%)
 rename common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.cpp} (89%)
 rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.cpp} (87%)
 rename common/cuda_hip/matrix/{coo_kernels.hpp.inc => coo_kernels.cpp} (91%)
 rename common/cuda_hip/matrix/{dense_kernels.hpp.inc => dense_kernels.cpp} (75%)
 rename common/cuda_hip/matrix/{diagonal_kernels.hpp.inc => diagonal_kernels.cpp} (73%)
 rename cuda/matrix/ell_kernels.cu => common/cuda_hip/matrix/ell_kernels.cpp (61%)
 delete mode 100644 common/cuda_hip/matrix/ell_kernels.hpp.inc
 rename common/cuda_hip/matrix/{fbcsr_kernels.hpp.inc => fbcsr_kernels.cpp} (57%)
 rename common/cuda_hip/matrix/{sellp_kernels.hpp.inc => sellp_kernels.cpp} (83%)
 rename hip/matrix/sparsity_csr_kernels.hip.cpp => common/cuda_hip/matrix/sparsity_csr_kernels.cpp (61%)
 delete mode 100644 common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
 rename common/cuda_hip/multigrid/{pgm_kernels.hpp.inc => pgm_kernels.cpp} (77%)
 rename common/cuda_hip/preconditioner/{isai_kernels.hpp.inc => isai_kernels.cpp} (95%)
 rename common/cuda_hip/preconditioner/{jacobi_kernels.hpp.inc => jacobi_kernels.cpp} (91%)
 rename common/cuda_hip/reorder/{rcm_kernels.hpp.inc => rcm_kernels.cpp} (95%)
 rename common/cuda_hip/solver/{cb_gmres_kernels.hpp.inc => cb_gmres_kernels.cpp} (50%)
 rename cuda/solver/idr_kernels.cu => common/cuda_hip/solver/idr_kernels.cpp (52%)
 delete mode 100644 common/cuda_hip/solver/idr_kernels.hpp.inc
 rename common/cuda_hip/solver/{multigrid_kernels.hpp.inc => multigrid_kernels.cpp} (89%)
 rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (75%)
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 cuda/base/device_matrix_data_kernels.cu
 delete mode 100644 cuda/base/kernel_launch.cuh
 delete mode 100644 cuda/base/kernel_launch_reduction.cuh
 delete mode 100644 cuda/base/kernel_launch_solver.cuh
 delete mode 100644 cuda/components/atomic.cuh
 delete mode 100644 cuda/components/diagonal_block_manipulation.cuh
 delete mode 100644 cuda/components/intrinsics.cuh
 delete mode 100644 cuda/components/merging.cuh
 delete mode 100644 cuda/components/prefix_sum.cuh
 delete mode 100644 cuda/components/prefix_sum_kernels.cu
 delete mode 100644 cuda/components/reduction.cuh
 delete mode 100644 cuda/components/searching.cuh
 delete mode 100644 cuda/components/segment_scan.cuh
 delete mode 100644 cuda/components/sorting.cuh
 delete mode 100644 cuda/components/syncfree.cuh
 delete mode 100644 cuda/components/thread_ids.cuh
 delete mode 100644 cuda/components/warp_blas.cuh
 delete mode 100644 cuda/distributed/matrix_kernels.cu
 delete mode 100644 cuda/distributed/partition_helpers_kernels.cu
 delete mode 100644 cuda/distributed/partition_kernels.cu
 delete mode 100644 cuda/distributed/vector_kernels.cu
 delete mode 100644 cuda/factorization/cholesky_kernels.cu
 delete mode 100644 cuda/factorization/factorization_kernels.cu
 delete mode 100644 cuda/factorization/lu_kernels.cu
 delete mode 100644 cuda/factorization/par_ic_kernels.cu
 delete mode 100644 cuda/factorization/par_ilu_kernels.cu
 delete mode 100644 cuda/log/batch_logger.cuh
 delete mode 100644 cuda/matrix/batch_csr_kernels.cu
 delete mode 100644 cuda/matrix/batch_dense_kernels.cu
 delete mode 100644 cuda/matrix/batch_ell_kernels.cu
 delete mode 100644 cuda/matrix/coo_kernels.cu
 delete mode 100644 cuda/matrix/dense_kernels.cu
 delete mode 100644 cuda/matrix/diagonal_kernels.cu
 delete mode 100644 cuda/matrix/fbcsr_kernels.template.cu
 delete mode 100644 cuda/matrix/sellp_kernels.cu
 delete mode 100644 cuda/matrix/sparsity_csr_kernels.cu
 delete mode 100644 cuda/multigrid/pgm_kernels.cu
 delete mode 100644 cuda/preconditioner/isai_kernels.cu
 delete mode 100644 cuda/preconditioner/jacobi_kernels.cu
 delete mode 100644 cuda/reorder/rcm_kernels.cu
 delete mode 100644 cuda/solver/cb_gmres_kernels.cu
 delete mode 100644 cuda/solver/multigrid_kernels.cu
 delete mode 100644 cuda/stop/batch_criteria.cuh
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp
 delete mode 100644 hip/base/device_matrix_data_kernels.hip.cpp
 delete mode 100644 hip/base/kernel_launch.hip.hpp
 delete mode 100644 hip/base/kernel_launch_reduction.hip.hpp
 delete mode 100644 hip/base/kernel_launch_solver.hip.hpp
 delete mode 100644 hip/components/atomic.hip.hpp
 delete mode 100644 hip/components/diagonal_block_manipulation.hip.hpp
 delete mode 100644 hip/components/intrinsics.hip.hpp
 delete mode 100644 hip/components/merging.hip.hpp
 delete mode 100644 hip/components/prefix_sum.hip.hpp
 delete mode 100644 hip/components/prefix_sum_kernels.hip.cpp
 delete mode 100644 hip/components/reduction.hip.hpp
 delete mode 100644 hip/components/searching.hip.hpp
 delete mode 100644 hip/components/segment_scan.hip.hpp
 delete mode 100644 hip/components/sorting.hip.hpp
 delete mode 100644 hip/components/syncfree.hip.hpp
 delete mode 100644 hip/components/thread_ids.hip.hpp
 delete mode 100644 hip/components/warp_blas.hip.hpp
 delete mode 100644 hip/distributed/matrix_kernels.hip.cpp
 delete mode 100644 hip/distributed/partition_helpers_kernels.hip.cpp
 delete mode 100644 hip/distributed/partition_kernels.hip.cpp
 delete mode 100644 hip/distributed/vector_kernels.hip.cpp
 delete mode 100644 hip/factorization/cholesky_kernels.hip.cpp
 delete mode 100644 hip/factorization/factorization_kernels.hip.cpp
 delete mode 100644 hip/factorization/lu_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ic_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilu_kernels.hip.cpp
 delete mode 100644 hip/log/batch_logger.hip.hpp
 delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp
 delete mode 100644 hip/matrix/coo_kernels.hip.cpp
 delete mode 100644 hip/matrix/dense_kernels.hip.cpp
 delete mode 100644 hip/matrix/diagonal_kernels.hip.cpp
 delete mode 100644 hip/matrix/ell_kernels.hip.cpp
 delete mode 100644 hip/matrix/fbcsr_kernels.template.hip.cpp
 delete mode 100644 hip/matrix/sellp_kernels.hip.cpp
 delete mode 100644 hip/multigrid/pgm_kernels.hip.cpp
 delete mode 100644 hip/preconditioner/isai_kernels.hip.cpp
 delete mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp
 delete mode 100644 hip/reorder/rcm_kernels.hip.cpp
 delete mode 100644 hip/solver/cb_gmres_kernels.hip.cpp
 delete mode 100644 hip/solver/idr_kernels.hip.cpp
 delete mode 100644 hip/solver/multigrid_kernels.hip.cpp
 delete mode 100644 hip/stop/batch_criteria.hip.hpp

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 89%
rename from common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 9b6301674be..0261dbb97ce 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,47 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -299,3 +340,14 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
         copy(src_b, dst_b);
     }
 }
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.cpp
similarity index 88%
rename from common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
rename to common/cuda_hip/base/device_matrix_data_kernels.cpp
index 70cbd9e7391..61a7a6281a9 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
+++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/base/device_matrix_data_kernels.hpp"
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace components {
+
+
 template <typename ValueType, typename IndexType>
 void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
@@ -99,3 +119,9 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
+
+
+}  // namespace components
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc b/common/cuda_hip/base/kernel_launch.hpp
similarity index 58%
rename from common/cuda_hip/base/kernel_launch.hpp.inc
rename to common/cuda_hip/base/kernel_launch.hpp
index c46e6c879cb..dd20eb5769f 100644
--- a/common/cuda_hip/base/kernel_launch.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch.hpp
@@ -2,6 +2,52 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
+#endif
+
+
+#include <thrust/tuple.h>
+
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+template <typename AccessorType>
+struct to_device_type_impl<gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+
+template <typename AccessorType>
+struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(const gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+
+
+namespace device_std = thrust;
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
     int64 size, KernelFunction fn, KernelArgs... args)
@@ -52,3 +98,8 @@ void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
             map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp
similarity index 97%
rename from common/cuda_hip/base/kernel_launch_reduction.hpp.inc
rename to common/cuda_hip/base/kernel_launch_reduction.hpp
index e5caedacb1f..86e082ac2c1 100644
--- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
           typename FinalizeOp, typename... KernelArgs>
 __global__ __launch_bounds__(
@@ -505,3 +523,8 @@ void run_kernel_col_reduction_cached(
         }
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp
similarity index 77%
rename from common/cuda_hip/base/kernel_launch_solver.hpp.inc
rename to common/cuda_hip/base/kernel_launch_solver.hpp
index cef3c8a3adc..742da85fd96 100644
--- a/common/cuda_hip/base/kernel_launch_solver.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_solver.hpp
@@ -2,6 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
     int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
@@ -32,3 +46,8 @@ void run_kernel_solver(std::shared_ptr<const DefaultExecutor> exec,
             static_cast<int64>(default_stride), fn, map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp
similarity index 95%
rename from common/cuda_hip/components/atomic.hpp.inc
rename to common/cuda_hip/components/atomic.hpp
index 60eaf5a9dd9..e0384222734 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp
@@ -2,6 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
+
+
+#include <type_traits>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -228,3 +243,11 @@ __forceinline__ __device__ thrust::complex<double> atomic_add(
     auto imag = atomic_add(addr + 1, val.imag());
     return {real, imag};
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc b/common/cuda_hip/components/diagonal_block_manipulation.hpp
similarity index 81%
rename from common/cuda_hip/components/diagonal_block_manipulation.hpp.inc
rename to common/cuda_hip/components/diagonal_block_manipulation.hpp
index a8e7004b5aa..5c0be150d21 100644
--- a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc
+++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp
@@ -2,6 +2,23 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
+
+
+#include <type_traits>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace csr {
+
+
 /**
  * @internal
  *
@@ -63,3 +80,12 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
         }
     }
 }
+
+
+}  // namespace csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/intrinsics.hpp.inc b/common/cuda_hip/components/intrinsics.hpp
similarity index 74%
rename from common/cuda_hip/components/intrinsics.hpp.inc
rename to common/cuda_hip/components/intrinsics.hpp
index 3fc28cee871..398e4325cc2 100644
--- a/common/cuda_hip/components/intrinsics.hpp.inc
+++ b/common/cuda_hip/components/intrinsics.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Returns the number of set bits in the given mask.
@@ -36,3 +48,11 @@ __forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); }
 
 /** @copydoc clz */
 __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/merging.hpp.inc b/common/cuda_hip/components/merging.hpp
similarity index 95%
rename from common/cuda_hip/components/merging.hpp.inc
rename to common/cuda_hip/components/merging.hpp
index d77707795a1..b1bca2a0c78 100644
--- a/common/cuda_hip/components/merging.hpp.inc
+++ b/common/cuda_hip/components/merging.hpp
@@ -2,6 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "core/base/utils.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -280,3 +295,11 @@ __forceinline__ __device__ void sequential_match(const ValueType* a,
                          return a_idx < a_size && b_idx < b_size;
                      });
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/prefix_sum.hpp.inc b/common/cuda_hip/components/prefix_sum.hpp
similarity index 91%
rename from common/cuda_hip/components/prefix_sum.hpp.inc
rename to common/cuda_hip/components/prefix_sum.hpp
index 474b0b88cd1..8fc5bbe63b0 100644
--- a/common/cuda_hip/components/prefix_sum.hpp.inc
+++ b/common/cuda_hip/components/prefix_sum.hpp
@@ -2,6 +2,23 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
+
+
+#include <type_traits>
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Computes the prefix sum and total sum of `element` over a subwarp.
@@ -158,3 +175,11 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum(
         elements[tidx] += prefix_block_sum;
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc b/common/cuda_hip/components/prefix_sum_kernels.cpp
similarity index 80%
rename from common/cuda_hip/components/prefix_sum_kernels.hpp.inc
rename to common/cuda_hip/components/prefix_sum_kernels.cpp
index c232e115a22..40cb1bc48fc 100644
--- a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc
+++ b/common/cuda_hip/components/prefix_sum_kernels.cpp
@@ -2,6 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/components/prefix_sum_kernels.hpp"
+
+#include <limits>
+
+#include <thrust/scan.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace components {
+
+
 template <typename IndexType>
 struct overflowing_sum {
     constexpr static IndexType max = std::numeric_limits<IndexType>::max();
@@ -56,3 +75,9 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL);
 // instantiate for size_type as well, as this is used in the Sellp format
 template void prefix_sum_nonnegative<size_type>(
     std::shared_ptr<const DefaultExecutor>, size_type*, size_type);
+
+
+}  // namespace components
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp
similarity index 78%
rename from common/cuda_hip/components/reduction.hpp.inc
rename to common/cuda_hip/components/reduction.hpp
index 1a6a64d6fb7..d2889bb9c7e 100644
--- a/common/cuda_hip/components/reduction.hpp.inc
+++ b/common/cuda_hip/components/reduction.hpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
+
+
+#include <type_traits>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/array_access.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+// Block size of the reduction kernels; also caps reduce_add_array's grid.
+constexpr int default_reduce_block_size = 512;
+
+
 /**
  * @internal
  *
@@ -222,3 +248,52 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_
         result[blockIdx.x] += block_sum[0];
     }
 }
+
+
+/**
+ * Sums the entries of an array with the reduce_add_array kernel.
+ * Inputs longer than default_reduce_block_size are reduced in two passes.
+ *
+ * @param exec  executor supplying the stream and owning the temporary arrays
+ * @param size  number of entries in `source`
+ * @param source  pointer to the entries to be summed
+ * @return the sum, read back from a one-element array via get_element
+ */
+template <typename ValueType>
+__host__ ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
+                                    size_type size, const ValueType* source)
+{
+    auto block_results_val = source;  // small inputs are reduced directly
+    size_type grid_dim = size;  // number of entries for the final pass
+    auto block_results = array<ValueType>(exec);
+    if (size > default_reduce_block_size) {
+        const auto n = ceildiv(size, default_reduce_block_size);
+        grid_dim =
+            (n <= default_reduce_block_size) ? n : default_reduce_block_size;
+
+        block_results.resize_and_reset(grid_dim);
+        // first pass: grid_dim blocks reduce `source` into block_results
+        reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
+                           exec->get_stream()>>>(
+            size, as_device_type(source),
+            as_device_type(block_results.get_data()));
+        // the final pass then reads block_results instead of `source`
+        block_results_val = block_results.get_const_data();
+    }
+
+    auto d_result = array<ValueType>(exec, 1);
+    // final pass: one block reduces the remaining grid_dim entries
+    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
+        grid_dim, as_device_type(block_results_val),
+        as_device_type(d_result.get_data()));
+    auto answer = get_element(d_result, 0);
+    return answer;
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp
similarity index 95%
rename from common/cuda_hip/components/searching.hpp.inc
rename to common/cuda_hip/components/searching.hpp
index a0f842dca35..599e7a8581c 100644
--- a/common/cuda_hip/components/searching.hpp.inc
+++ b/common/cuda_hip/components/searching.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Generic binary search that finds the first index where a predicate is true.
@@ -208,3 +221,11 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
     auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
     return offset + pos;
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp
similarity index 73%
rename from common/cuda_hip/components/segment_scan.hpp.inc
rename to common/cuda_hip/components/segment_scan.hpp
index 75cc0654531..d2f992850ef 100644
--- a/common/cuda_hip/components/segment_scan.hpp.inc
+++ b/common/cuda_hip/components/segment_scan.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
+
+
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  *
@@ -33,3 +45,11 @@ __device__ __forceinline__ bool segment_scan(
     }
     return head;
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/sorting.hpp.inc b/common/cuda_hip/components/sorting.hpp
similarity index 96%
rename from common/cuda_hip/components/sorting.hpp.inc
rename to common/cuda_hip/components/sorting.hpp
index 10db7eb6daa..ecc9c5289f9 100644
--- a/common/cuda_hip/components/sorting.hpp.inc
+++ b/common/cuda_hip/components/sorting.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -291,3 +304,11 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
             local_elements, false);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp
similarity index 86%
rename from common/cuda_hip/components/syncfree.hpp.inc
rename to common/cuda_hip/components/syncfree.hpp
index f0d0bbe4d22..3c82c916a21 100644
--- a/common/cuda_hip/components/syncfree.hpp.inc
+++ b/common/cuda_hip/components/syncfree.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
+
+
+#include <ginkgo/core/base/array.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "core/components/fill_array_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 struct syncfree_storage {
     using status_word = int;
 
@@ -110,3 +128,11 @@ class syncfree_scheduler {
     IndexType work_id;
     IndexType block_id;
 };
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/thread_ids.hpp.inc b/common/cuda_hip/components/thread_ids.hpp
similarity index 94%
rename from common/cuda_hip/components/thread_ids.hpp.inc
rename to common/cuda_hip/components/thread_ids.hpp
index 1befa428f3c..4fef650f51c 100644
--- a/common/cuda_hip/components/thread_ids.hpp.inc
+++ b/common/cuda_hip/components/thread_ids.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
+
+
+#include "common/cuda_hip/base/config.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace thread {
+
+
 /**
  * @internal
  *
@@ -242,3 +255,12 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat()
                   "subwarp_size must be a power of two");
     return blockDim.x / subwarp_size * static_cast<IndexType>(gridDim.x);
 }
+
+
+}  // namespace thread
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp
similarity index 97%
rename from common/cuda_hip/components/warp_blas.hpp.inc
rename to common/cuda_hip/components/warp_blas.hpp
index 61b2ae25e7f..1f25bb61634 100644
--- a/common/cuda_hip/components/warp_blas.hpp.inc
+++ b/common/cuda_hip/components/warp_blas.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
+
+
+#include <cassert>
+#include <type_traits>
+
+#include <ginkgo/config.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  *
@@ -409,3 +427,11 @@ __device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
     return reduce(group, sum,
                   [](result_type x, result_type y) { return max(x, y); });
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.cpp
similarity index 91%
rename from common/cuda_hip/distributed/matrix_kernels.hpp.inc
rename to common/cuda_hip/distributed/matrix_kernels.cpp
index 8848e490c18..6b5f997d153 100644
--- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/matrix_kernels.hpp"
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/unique.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_matrix {
+
+
 template <typename ValueType, typename GlobalIndexType>
 struct input_type {
     GlobalIndexType row;
@@ -170,3 +196,9 @@ void separate_local_nonlocal(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
+
+
+}  // namespace distributed_matrix
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
similarity index 70%
rename from common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
rename to common/cuda_hip/distributed/partition_helpers_kernels.cpp
index 88343370d99..cd1419230d2 100644
--- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
@@ -2,6 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+#include "common/cuda_hip/base/thrust.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace partition_helpers {
+
+
 template <typename GlobalIndexType>
 void sort_by_range_start(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -24,3 +40,9 @@ void sort_by_range_start(
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
     GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+
+
+}  // namespace partition_helpers
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/distributed/partition_kernels.hpp.inc b/common/cuda_hip/distributed/partition_kernels.cpp
similarity index 89%
rename from common/cuda_hip/distributed/partition_kernels.hpp.inc
rename to common/cuda_hip/distributed/partition_kernels.cpp
index 20f3ebd47dc..b4e051b97f5 100644
--- a/common/cuda_hip/distributed/partition_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/partition_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/partition_kernels.hpp"
+
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace partition {
+
+
 namespace kernel {
 
 
@@ -110,3 +130,9 @@ void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);
+
+
+}  // namespace partition
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.cpp
similarity index 84%
rename from common/cuda_hip/distributed/vector_kernels.hpp.inc
rename to common/cuda_hip/distributed/vector_kernels.cpp
index 6a0497db78a..91bd838497d 100644
--- a/common/cuda_hip/distributed/vector_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/vector_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/vector_kernels.hpp"
+
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scatter.h>
+#include <thrust/tuple.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_vector {
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void build_local(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -65,3 +85,9 @@ void build_local(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
+
+
+}  // namespace distributed_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.cpp
similarity index 78%
rename from common/cuda_hip/factorization/cholesky_kernels.hpp.inc
rename to common/cuda_hip/factorization/cholesky_kernels.cpp
index e6220019d22..6e6be7b81fd 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/cholesky_kernels.hpp"
+
+#include <algorithm>
+#include <memory>
+
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/syncfree.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/factorization/elimination_forest.hpp"
+#include "core/factorization/lu_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Cholesky namespace.
+ *
+ * @ingroup factor
+ */
+namespace cholesky {
+
+// Thread block size used for the kernel launches in this file.
+constexpr int default_block_size = 512;
+
+
 #include "core/factorization/elimination_forest.hpp"
 namespace kernel {
 
@@ -330,3 +373,66 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+
+// Computes row_nnz, the nonzero count of each row of L, from mtx and forest.
+template <typename ValueType, typename IndexType>
+void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* mtx,
+                    const factorization::elimination_forest<IndexType>& forest,
+                    IndexType* row_nnz, array<IndexType>& tmp_storage)
+{
+    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
+    if (num_rows == 0) {
+        return;
+    }
+    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
+    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
+    const auto postorder_cols = tmp_storage.get_data();  // mtx_nnz entries
+    const auto lower_ends = postorder_cols + mtx_nnz;    // num_rows entries
+    const auto row_ptrs = mtx->get_const_row_ptrs();
+    const auto cols = mtx->get_const_col_idxs();
+    const auto inv_postorder = forest.inv_postorder.get_const_data();
+    const auto postorder_parent = forest.postorder_parents.get_const_data();
+    // map col indices to postorder indices (presumably also sets lower_ends)
+    {
+        const auto num_blocks = ceildiv(num_rows, default_block_size);
+        kernel::build_postorder_cols<<<num_blocks, default_block_size, 0,
+                                       exec->get_stream()>>>(
+            num_rows, cols, row_ptrs, inv_postorder, postorder_cols,
+            lower_ends);
+    }
+    // sort postorder_cols within each row; the permutation output is unused
+    {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
+        array<IndexType> permutation_array(exec, mtx_nnz);
+        auto permutation = permutation_array.get_data();
+        components::fill_seq_array(exec, permutation, mtx_nnz);
+        size_type buffer_size{};
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+                                       row_ptrs, postorder_cols, buffer_size);
+        array<char> buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+                           postorder_cols, permutation, buffer);
+        sparselib::destroy(descr);  // NOTE(review): not reached on throw
+    }
+    // count nonzeros per row of L; grid sized for at least one warp per row
+    {
+        const auto num_blocks =
+            ceildiv(num_rows, default_block_size / config::warp_size);
+        kernel::symbolic_count<config::warp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
+                postorder_parent, row_nnz);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
+
+
+}  // namespace cholesky
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/factorization_kernels.hpp.inc b/common/cuda_hip/factorization/factorization_kernels.cpp
similarity index 95%
rename from common/cuda_hip/factorization/factorization_kernels.hpp.inc
rename to common/cuda_hip/factorization/factorization_kernels.cpp
index 806797e60d8..da2666feb25 100644
--- a/common/cuda_hip/factorization/factorization_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/factorization_kernels.hpp"
+
+#include <ginkgo/core/base/array.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+constexpr int default_block_size{512};
+
+
 namespace kernel {
 
 
@@ -520,3 +550,9 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.cpp
similarity index 92%
rename from common/cuda_hip/factorization/lu_kernels.hpp.inc
rename to common/cuda_hip/factorization/lu_kernels.cpp
index f8f317bc6a5..71d09e93ef7 100644
--- a/common/cuda_hip/factorization/lu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -2,6 +2,41 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/lu_kernels.hpp"
+
+#include <algorithm>
+#include <memory>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/syncfree.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/allocator.hpp"
+#include "core/matrix/csr_lookup.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The LU namespace.
+ *
+ * @ingroup factor
+ */
+namespace lu_factorization {
+
+
+constexpr static int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -301,3 +336,9 @@ void symbolic_factorize_simple_finalize(
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
     GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
+
+
+}  // namespace lu_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.cpp
similarity index 84%
rename from common/cuda_hip/factorization/par_ic_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ic_kernels.cpp
index dd30eb2fc1c..7102d782b94 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ic_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ic_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ic factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ic_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -111,3 +137,9 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ic_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.cpp
similarity index 84%
rename from common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilu_kernels.cpp
index 1029c0d08f6..447fdb99c2c 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp
@@ -2,6 +2,31 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ilu_kernels.hpp"
+
+#include <ginkgo/core/matrix/coo.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilu_factorization {
+
+
+constexpr int default_block_size{512};
+
+
 namespace kernel {
 
 
@@ -85,3 +110,9 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
+
+
+}  // namespace par_ilu_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp
similarity index 67%
rename from common/cuda_hip/log/batch_logger.hpp.inc
rename to common/cuda_hip/log/batch_logger.hpp
index 04b614b50f9..bca07fb9c37 100644
--- a/common/cuda_hip/log/batch_logger.hpp.inc
+++ b/common/cuda_hip/log/batch_logger.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_log {
+
+
 /**
  * @see reference/log/batch_logger.hpp
  */
@@ -28,3 +41,12 @@ class SimpleFinalLogger final {
     real_type* const final_residuals_;
     idx_type* const final_iters_;
 };
+
+
+}  // namespace batch_log
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp
similarity index 87%
rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.cpp
index e041dadaa3e..01edb0e1310 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The batch Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -196,3 +236,14 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp
similarity index 89%
rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.cpp
index f8abf9131a1..90cafc5d1ca 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -243,3 +283,15 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp
similarity index 87%
rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.cpp
index 0a6d1927c96..c5e27e9d1d1 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/batch_ell_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -205,3 +245,14 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.cpp
similarity index 91%
rename from common/cuda_hip/matrix/coo_kernels.hpp.inc
rename to common/cuda_hip/matrix/coo_kernels.cpp
index 98332f6cd7b..00ab983bc9f 100644
--- a/common/cuda_hip/matrix/coo_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/coo_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/matrix/dense_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Coordinate matrix format namespace.
+ *
+ * @ingroup coo
+ */
+namespace coo {
+
+
+constexpr int warps_in_block = 4;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+
+
 namespace {
 
 
@@ -304,3 +340,9 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
+
+
+}  // namespace coo
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.cpp
similarity index 75%
rename from common/cuda_hip/matrix/dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/dense_kernels.cpp
index b48d2c4ff4f..b44c0396823 100644
--- a/common/cuda_hip/matrix/dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/dense_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup dense
+ */
+namespace dense {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -619,3 +659,188 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
+
+
+// Dot product dispatch: single-column x and y of a BLAS-supported value type
+// are handled by blas::dot, every other case by the generic compute_dot.
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, array<char>& tmp)
+{
+    const bool single_column = x->get_size()[1] == 1 && y->get_size()[1] == 1;
+    if (single_column && blas::is_supported<ValueType>::value) {
+        // tmp is not used on this path
+        blas::dot(exec->get_blas_handle(), x->get_size()[0],
+                  x->get_const_values(), x->get_stride(),
+                  y->get_const_values(), y->get_stride(),
+                  result->get_values());
+    } else {
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+// Conjugated dot product dispatch: single-column x and y of a BLAS-supported
+// value type go to blas::conj_dot, all other cases to compute_conj_dot.
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               array<char>& tmp)
+{
+    const bool single_column = x->get_size()[1] == 1 && y->get_size()[1] == 1;
+    if (single_column && blas::is_supported<ValueType>::value) {
+        // tmp is not used on this path
+        blas::conj_dot(exec->get_blas_handle(), x->get_size()[0],
+                       x->get_const_values(), x->get_stride(),
+                       y->get_const_values(), y->get_stride(),
+                       result->get_values());
+    } else {
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+// Norm2 dispatch: a single-column x of a BLAS-supported value type is handled
+// by blas::norm2, every other case by the generic compute_norm2.
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            array<char>& tmp)
+{
+    const bool use_blas =
+        x->get_size()[1] == 1 && blas::is_supported<ValueType>::value;
+    if (!use_blas) {
+        compute_norm2(exec, x, result, tmp);
+        return;
+    }
+    blas::norm2(exec->get_blas_handle(), x->get_size()[0],
+                x->get_const_values(), x->get_stride(),
+                result->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
+// c = a * b via blas::gemm; an empty c is a no-op, a zero-column a zeroes c.
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const matrix::Dense<ValueType>* a,
+                  const matrix::Dense<ValueType>* b,
+                  matrix::Dense<ValueType>* c)
+{
+    if (!blas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else {
+        auto handle = exec->get_blas_handle();
+        const bool nonempty_c = c->get_size()[0] > 0 && c->get_size()[1] > 0;
+        if (nonempty_c && a->get_size()[1] > 0) {
+            blas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                       c->get_size()[0], a->get_size()[1], &alpha,
+                       b->get_const_values(), b->get_stride(),
+                       a->get_const_values(), a->get_stride(), &beta,
+                       c->get_values(), c->get_stride());
+        } else if (nonempty_c) {
+            dense::fill(exec, c, zero<ValueType>());
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+// c = alpha * a * b + beta * c via blas::gemm, with alpha and beta handed over
+// as pointers. An empty c is a no-op; a zero-column a only scales c by beta.
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const matrix::Dense<ValueType>* alpha,
+           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
+           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    if (!blas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
+        if (a->get_size()[1] > 0) {
+            blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                       c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                       alpha->get_const_values(), b->get_const_values(),
+                       b->get_stride(), a->get_const_values(),
+                       a->get_stride(), beta->get_const_values(),
+                       c->get_values(), c->get_stride());
+        } else {
+            dense::scale(exec, beta, c);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+
+
+// Stores the transpose of orig in trans through blas::geam with alpha = 1 and
+// beta = 0. Nothing is done if orig has no entries.
+template <typename ValueType>
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Dense<ValueType>* orig,
+               matrix::Dense<ValueType>* trans)
+{
+    if (!blas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
+        auto handle = exec->get_blas_handle();
+        blas::pointer_mode_guard pm_guard(handle);
+        auto alpha = one<ValueType>();
+        auto beta = zero<ValueType>();
+        blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                   orig->get_size()[1], &alpha, orig->get_const_values(),
+                   orig->get_stride(), &beta, trans->get_const_values(),
+                   trans->get_stride(), trans->get_values(),
+                   trans->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+
+
+template <typename ValueType>
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* orig,
+                    matrix::Dense<ValueType>* trans)
+{
+    if (!blas::is_supported<ValueType>::value) {
+        GKO_NOT_IMPLEMENTED;
+    } else {
+        auto handle = exec->get_blas_handle();
+        const auto& size = orig->get_size();
+        if (size[0] > 0 && size[1] > 0) {  // skip matrices without entries
+            blas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, size[0], size[1], &alpha,
+                       orig->get_const_values(), orig->get_stride(), &beta,
+                       trans->get_const_values(), trans->get_stride(),
+                       trans->get_values(), trans->get_stride());
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+
+
+}  // namespace dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc b/common/cuda_hip/matrix/diagonal_kernels.cpp
similarity index 73%
rename from common/cuda_hip/matrix/diagonal_kernels.hpp.inc
rename to common/cuda_hip/matrix/diagonal_kernels.cpp
index c3919fda079..a824abc6f7c 100644
--- a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/diagonal_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/diagonal_kernels.hpp"
+
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Diagonal matrix format namespace.
+ *
+ * @ingroup diagonal
+ */
+namespace diagonal {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -57,3 +83,9 @@ void apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
+
+
+}  // namespace diagonal
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/matrix/ell_kernels.cu b/common/cuda_hip/matrix/ell_kernels.cpp
similarity index 61%
rename from cuda/matrix/ell_kernels.cu
rename to common/cuda_hip/matrix/ell_kernels.cpp
index 5c81fa7c994..40f174a25c7 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -18,21 +18,21 @@
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The ELL matrix format namespace.
  *
@@ -75,7 +75,135 @@ constexpr int max_thread_per_worker = 32;
 using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 
-#include "common/cuda_hip/matrix/ell_kernels.hpp.inc"
+namespace kernel {
+
+
+template <int num_thread_per_worker, bool atomic, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType,
+          typename Closure>
+__device__ void spmv_kernel(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> val, const IndexType* __restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
+    const size_type c_stride, Closure op)  // op(row sum, old c entry)
+{
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto tidx = thread::get_thread_id_flat();
+    const decltype(tidx) column_id = blockIdx.y;  // column of b and c
+    if (num_thread_per_worker == 1) {
+        // num_thread_per_worker == 1: thread tidx walks the whole row tidx,
+        // so no shared memory, __syncthreads or atomic_add is needed
+        if (tidx < num_rows) {
+            auto temp = zero<arithmetic_type>();
+            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+                const auto ind = tidx + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx == invalid_index<IndexType>()) {
+                    break;  // invalid column index: stop reading
+                } else {
+                    temp += val(ind) * b(col_idx, column_id);
+                }
+            }
+            const auto c_ind = tidx * c_stride + column_id;
+            c[c_ind] = op(temp, c[c_ind]);
+        }
+    } else {
+        if (tidx < num_worker_per_row * num_rows) {
+            const auto idx_in_worker = threadIdx.y;
+            const auto x = tidx % num_rows;  // row index
+            const auto worker_id = tidx / num_rows;
+            const auto step_size = num_worker_per_row * num_thread_per_worker;
+            __shared__ uninitialized_array<
+                arithmetic_type, default_block_size / num_thread_per_worker>
+                storage;
+            if (idx_in_worker == 0) {
+                storage[threadIdx.x] = 0;  // reset the shared accumulator
+            }
+            __syncthreads();  // NOTE(review): barrier in tidx-dependent branch
+            auto temp = zero<arithmetic_type>();
+            for (size_type idx =
+                     worker_id * num_thread_per_worker + idx_in_worker;
+                 idx < num_stored_elements_per_row; idx += step_size) {
+                const auto ind = x + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx == invalid_index<IndexType>()) {
+                    break;  // invalid column index: stop reading
+                } else {
+                    temp += val(ind) * b(col_idx, column_id);
+                }
+            }
+            atomic_add(&storage[threadIdx.x], temp);  // merge partial sums
+            __syncthreads();
+            if (idx_in_worker == 0) {  // threadIdx.y == 0 writes the result
+                const auto c_ind = x * c_stride + column_id;
+                if (atomic) {
+                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
+                } else {
+                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
+                }
+            }
+        }
+    }
+}
+
+
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> val, const IndexType* __restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
+    const size_type c_stride)
+{
+    auto cast_sum = [](const auto& x, const OutputValueType& y) {
+        return static_cast<OutputValueType>(x);  // y (old c entry) is unused
+    };
+    spmv_kernel<num_thread_per_worker, atomic>(
+        num_rows, num_worker_per_row, val, col, stride,
+        num_stored_elements_per_row, b, c, c_stride, cast_sum);
+}
+
+
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
+    const IndexType* __restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
+    const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
+    const size_type c_stride)
+{
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto alpha_val = alpha(0);
+    const OutputValueType beta_val = beta[0];
+    // The atomic path changes the values of c during the computation, so
+    // alpha * a * b + beta * c cannot be formed here. beta * c has to be
+    // applied before this kernel is called; it then only adds alpha * a * b.
+    auto scaled = [&alpha_val](const auto& x, const OutputValueType& y) {
+        return static_cast<OutputValueType>(alpha_val * x);
+    };
+    // Without atomics the old entry of c enters the result directly.
+    auto scaled_plus_old = [&alpha_val, &beta_val](const auto& x,
+                                                   const OutputValueType& y) {
+        return static_cast<OutputValueType>(
+            alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
+    };
+    if (atomic) {
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, c, c_stride, scaled);
+    } else {
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, c, c_stride, scaled_plus_old);
+    }
+}
+
+
+}  // namespace kernel
 
 
 namespace {
@@ -156,7 +284,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
 
 template <typename ValueType, typename IndexType>
 std::array<int, 3> compute_thread_worker_and_atomicity(
-    std::shared_ptr<const CudaExecutor> exec,
+    std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Ell<ValueType, IndexType>* a)
 {
     int num_thread_per_worker = 1;
@@ -200,7 +328,7 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 
 template <typename InputValueType, typename MatrixValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Ell<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -232,7 +360,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 template <typename InputValueType, typename MatrixValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::Ell<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -265,6 +393,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace ell
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc
deleted file mode 100644
index a5fd37c1d05..00000000000
--- a/common/cuda_hip/matrix/ell_kernels.hpp.inc
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int num_thread_per_worker, bool atomic, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType,
-          typename Closure>
-__device__ void spmv_kernel(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> val, const IndexType* __restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
-    const size_type c_stride, Closure op)
-{
-    using arithmetic_type = typename a_accessor::arithmetic_type;
-    const auto tidx = thread::get_thread_id_flat();
-    const decltype(tidx) column_id = blockIdx.y;
-    if (num_thread_per_worker == 1) {
-        // Specialize the num_thread_per_worker = 1. It doesn't need the shared
-        // memory, __syncthreads, and atomic_add
-        if (tidx < num_rows) {
-            auto temp = zero<arithmetic_type>();
-            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
-                const auto ind = tidx + idx * stride;
-                const auto col_idx = col[ind];
-                if (col_idx == invalid_index<IndexType>()) {
-                    break;
-                } else {
-                    temp += val(ind) * b(col_idx, column_id);
-                }
-            }
-            const auto c_ind = tidx * c_stride + column_id;
-            c[c_ind] = op(temp, c[c_ind]);
-        }
-    } else {
-        if (tidx < num_worker_per_row * num_rows) {
-            const auto idx_in_worker = threadIdx.y;
-            const auto x = tidx % num_rows;
-            const auto worker_id = tidx / num_rows;
-            const auto step_size = num_worker_per_row * num_thread_per_worker;
-            __shared__ uninitialized_array<
-                arithmetic_type, default_block_size / num_thread_per_worker>
-                storage;
-            if (idx_in_worker == 0) {
-                storage[threadIdx.x] = 0;
-            }
-            __syncthreads();
-            auto temp = zero<arithmetic_type>();
-            for (size_type idx =
-                     worker_id * num_thread_per_worker + idx_in_worker;
-                 idx < num_stored_elements_per_row; idx += step_size) {
-                const auto ind = x + idx * stride;
-                const auto col_idx = col[ind];
-                if (col_idx == invalid_index<IndexType>()) {
-                    break;
-                } else {
-                    temp += val(ind) * b(col_idx, column_id);
-                }
-            }
-            atomic_add(&storage[threadIdx.x], temp);
-            __syncthreads();
-            if (idx_in_worker == 0) {
-                const auto c_ind = x * c_stride + column_id;
-                if (atomic) {
-                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
-                } else {
-                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
-                }
-            }
-        }
-    }
-}
-
-
-template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> val, const IndexType* __restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
-    const size_type c_stride)
-{
-    spmv_kernel<num_thread_per_worker, atomic>(
-        num_rows, num_worker_per_row, val, col, stride,
-        num_stored_elements_per_row, b, c, c_stride,
-        [](const auto& x, const OutputValueType& y) {
-            return static_cast<OutputValueType>(x);
-        });
-}
-
-
-template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
-    const IndexType* __restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
-    const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
-    const size_type c_stride)
-{
-    using arithmetic_type = typename a_accessor::arithmetic_type;
-    const auto alpha_val = alpha(0);
-    const OutputValueType beta_val = beta[0];
-    if (atomic) {
-        // Because the atomic operation changes the values of c during
-        // computation, it can not directly do alpha * a * b + beta * c
-        // operation. The beta * c needs to be done before calling this kernel.
-        // Then, this kernel only adds alpha * a * b when it uses atomic
-        // operation.
-        spmv_kernel<num_thread_per_worker, atomic>(
-            num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val](const auto& x, const OutputValueType& y) {
-                return static_cast<OutputValueType>(alpha_val * x);
-            });
-    } else {
-        spmv_kernel<num_thread_per_worker, atomic>(
-            num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) {
-                return static_cast<OutputValueType>(
-                    alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
-            });
-    }
-}
-
-
-}  // namespace kernel
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.cpp
similarity index 57%
rename from common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
rename to common/cuda_hip/matrix/fbcsr_kernels.cpp
index d801876adbc..f6276fdd056 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/fbcsr_kernels.cpp
@@ -2,6 +2,69 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/fbcsr_kernels.hpp"
+
+#include <algorithm>
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_block_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/base/array_access.hpp"
+#include "core/base/block_sizes.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+/**
+ * @brief The fixed-size block compressed sparse row matrix format namespace.
+ *
+ * @ingroup fbcsr
+ */
+namespace fbcsr {
+
+
+constexpr int default_block_size{512};
+
+
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
+
 namespace kernel {
 
 
@@ -341,3 +404,235 @@ template <typename ValueType, typename IndexType>
 void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Fbcsr<ValueType, IndexType>* orig,
                       matrix::Diagonal<ValueType>* diag) GKO_NOT_IMPLEMENTED;
+
+
+namespace {
+
+// Transposes the dense array orig into trans with one blas::geam call.
+template <typename ValueType>
+void dense_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                     const size_type nrows, const size_type ncols,
+                     const size_type orig_stride, const ValueType* const orig,
+                     const size_type trans_stride, ValueType* const trans)
+{
+    if (nrows == 0) {  // NOTE(review): ncols == 0 is not short-circuited
+        return;
+    }
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
+        {  // inner scope ends pm_guard's lifetime right after geam
+            blas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();  // given to geam by address
+            auto beta = zero<ValueType>();
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
+                       trans_stride);  // trans is passed twice
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;  // no BLAS binding for this ValueType
+    }
+}
+
+
+}  // namespace
+
+// SpMV c = a * b for Fbcsr, delegated to sparselib::bsrmv / sparselib::bsrmm.
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Fbcsr<ValueType, IndexType>* const a,
+          const matrix::Dense<ValueType>* const b,
+          matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: fill output with zero
+        dense::fill(exec, c, zero<ValueType>());
+        return;
+    }
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        const auto alpha = one<ValueType>();  // scalars passed by address
+        const auto beta = zero<ValueType>();
+        auto descr = sparselib::create_mat_descr();  // destroyed below
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];  // NOTE(review): unused below
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
+                             bs, b->get_const_values(), &beta, c->get_values());
+        } else {  // several rhs or non-unit stride: bsrmm, then transpose
+            const auto trans_stride = nrows;
+            auto trans_c = array<ValueType>(exec, nrows * nrhs);  // scratch
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             &alpha, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, &beta,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        sparselib::destroy(descr);  // NOTE(review): not reached on exceptions
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+// Fbcsr SpMV with scaling scalars alpha and beta, delegated to sparselib.
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Dense<ValueType>* const alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* const a,
+                   const matrix::Dense<ValueType>* const b,
+                   const matrix::Dense<ValueType>* const beta,
+                   matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: scale output
+        dense::scale(exec, beta, c);
+        return;
+    }
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();  // no pointer_mode_guard
+        const auto alphp = alpha->get_const_values();  // forwarded as pointers
+        const auto betap = beta->get_const_values();
+        auto descr = sparselib::create_mat_descr();  // destroyed below
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];  // NOTE(review): unused below
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), betap, c->get_values());
+        } else {  // c is transposed into trans_c first, then bsrmm updates it
+            const auto trans_stride = nrows;
+            auto trans_c = array<ValueType>(exec, nrows * nrhs);  // scratch
+            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
+                            trans_stride, trans_c.get_data());
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, betap,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        sparselib::destroy(descr);  // NOTE(review): not reached on exceptions
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+namespace {
+
+// Launches kernel::transpose_blocks<mat_blk_sz> on the values of mat.
+template <int mat_blk_sz, typename ValueType, typename IndexType>
+void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
+                           std::shared_ptr<const DefaultExecutor> exec,
+                           matrix::Fbcsr<ValueType, IndexType>* const mat)
+{
+    constexpr int subwarp_size = config::warp_size;
+    const auto nbnz = mat->get_num_stored_blocks();
+    const auto numthreads = nbnz * subwarp_size;
+    const auto block_size = default_block_size;
+    const auto grid_dim = ceildiv(numthreads, block_size);
+    if (grid_dim > 0) {  // no launch when there are no stored blocks
+        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
+            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                nbnz, mat->get_values());
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
+                                    transpose_blocks_impl);
+
+
+}  // namespace
+
+// Transposes orig into trans: bsr_transpose on CUDA, otherwise a fallback.
+template <typename ValueType, typename IndexType>
+void transpose(const std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Fbcsr<ValueType, IndexType>* const orig,
+               matrix::Fbcsr<ValueType, IndexType>* const trans)
+{
+#ifdef GKO_COMPILING_CUDA
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const int bs = orig->get_block_size();
+        const IndexType nnzb =
+            static_cast<IndexType>(orig->get_num_stored_blocks());
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
+        array<char> buffer_array(exec, buffer_size);  // workspace
+        auto buffer = buffer_array.get_data();
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
+            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
+            copyValues, idxBase, buffer);
+
+        // dispatch to the transpose_blocks instantiation whose size equals bs
+        select_transpose_blocks(
+            fixedblock::compiled_kernels(),
+            [bs](int compiled_block_size) { return bs == compiled_block_size; },
+            syn::value_list<int>(), syn::type_list<>(), exec, trans);
+    } else
+#endif
+    {  // non-CUDA build, or sparselib lacks this type combination
+        fallback_transpose(exec, orig, trans);
+    }
+}
+
+// Transposes orig into trans, then runs kernel::conjugate if complex.
+template <typename ValueType, typename IndexType>
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Fbcsr<ValueType, IndexType>* orig,
+                    matrix::Fbcsr<ValueType, IndexType>* trans)
+{
+    const int grid_size =
+        ceildiv(trans->get_num_stored_elements(), default_block_size);
+    transpose(exec, orig, trans);  // NOTE(review): grid_size computed before
+    if (grid_size > 0 && is_complex<ValueType>()) {
+        kernel::
+            conjugate<<<grid_size, default_block_size, 0, exec->get_stream()>>>(
+                trans->get_num_stored_elements(),
+                as_device_type(trans->get_values()));
+    }
+}
+
+
+}  // namespace fbcsr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.cpp
similarity index 83%
rename from common/cuda_hip/matrix/sellp_kernels.hpp.inc
rename to common/cuda_hip/matrix/sellp_kernels.cpp
index f4f0035c276..64c672b8d8d 100644
--- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/sellp_kernels.cpp
@@ -2,6 +2,37 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/sellp_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The SELL-P matrix format namespace.
+ *
+ * @ingroup sellp
+ */
+namespace sellp {
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv_kernel(
     size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
@@ -102,3 +133,9 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
+
+
+}  // namespace sellp
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
similarity index 61%
rename from hip/matrix/sparsity_csr_kernels.hip.cpp
rename to common/cuda_hip/matrix/sparsity_csr_kernels.cpp
index 7a7a4ba49d5..067b2749097 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
@@ -11,24 +11,24 @@
 #include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Compressed sparse row matrix format namespace.
  *
@@ -51,7 +51,114 @@ using classical_kernels = syn::value_list<int, 2>;
 
 
 #include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
+
+namespace kernel {
+
+// Per row: c = scale(sum over the row of val[0] * b(col, column_id), c).
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType,
+          typename Closure>
+__device__ void device_classical_spmv(const size_type num_rows,
+                                      const MatrixValueType* __restrict__ val,
+                                      const IndexType* __restrict__ col_idxs,
+                                      const IndexType* __restrict__ row_ptrs,
+                                      acc::range<input_accessor> b,
+                                      acc::range<output_accessor> c,
+                                      Closure scale)
+{
+    using arithmetic_type = typename output_accessor::arithmetic_type;
+    auto subwarp_tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
+    const auto subid = subwarp_tile.thread_rank();
+    const IndexType column_id = blockIdx.y;  // column of b and c handled here
+    const arithmetic_type value = val[0];  // the only matrix value ever read
+    auto row = thread::get_subwarp_id_flat<subwarp_size>();
+    for (; row < num_rows; row += subrow) {
+        const auto ind_end = row_ptrs[row + 1];
+        arithmetic_type temp_val = zero<arithmetic_type>();
+        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
+             ind += subwarp_size) {
+            temp_val += value * b(col_idxs[ind], column_id);
+        }
+        auto subwarp_result =
+            reduce(subwarp_tile, temp_val,
+                   [](const arithmetic_type& a, const arithmetic_type& b) {
+                       return a + b;
+                   });
+        if (subid == 0) {  // a single thread of the tile stores the row
+            c(row, column_id) = scale(subwarp_result, c(row, column_id));
+        }
+    }
+}
+
+// Kernel entry without scalars: the closure returns x, ignoring the old c.
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [](const type& x, const type& y) { return x; });
+}
+
+// Kernel entry with scalars: the closure yields alpha[0] * x + beta[0] * y.
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
+    const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    const typename output_accessor::storage_type* __restrict__ beta,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    const type alpha_val = alpha[0];  // converted to the arithmetic type once
+    const type beta_val = beta[0];
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [&alpha_val, &beta_val](const type& x, const type& y) {
+            return alpha_val * x + beta_val * y;
+        });
+}
+
+
+}  // namespace kernel
+
+// No implementation here: GKO_NOT_IMPLEMENTED stands in for the body.
+template <typename ValueType, typename IndexType>
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::SparsityCsr<ValueType, IndexType>* orig,
+               matrix::SparsityCsr<ValueType, IndexType>* trans)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
+
+// Orders the column indices of to_sort row by row using two Thrust sorts.
+template <typename ValueType, typename IndexType>
+void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::SparsityCsr<ValueType, IndexType>* to_sort)
+{
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto col_idxs = to_sort->get_col_idxs();
+    const auto nnz = to_sort->get_num_nonzeros();
+    const auto num_rows = to_sort->get_size()[0];
+    array<IndexType> row_idx_array(exec, nnz);  // one row index per nonzero
+    const auto row_idxs = row_idx_array.get_data();
+    components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs);
+    // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort
+    thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz,
+                        row_idxs);  // first key: column index
+    thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
+                               col_idxs);  // stable, so columns stay ordered
+}
 
 
 namespace host_kernel {
@@ -60,7 +167,7 @@ namespace host_kernel {
 template <int subwarp_size, typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
 void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const HipExecutor> exec,
+                    std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
                     const matrix::Dense<InputValueType>* b,
                     matrix::Dense<OutputValueType>* c,
@@ -129,7 +236,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -145,7 +252,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -218,6 +325,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace sparsity_csr
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
deleted file mode 100644
index aedf9638888..00000000000
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType,
-          typename Closure>
-__device__ void device_classical_spmv(const size_type num_rows,
-                                      const MatrixValueType* __restrict__ val,
-                                      const IndexType* __restrict__ col_idxs,
-                                      const IndexType* __restrict__ row_ptrs,
-                                      acc::range<input_accessor> b,
-                                      acc::range<output_accessor> c,
-                                      Closure scale)
-{
-    using arithmetic_type = typename output_accessor::arithmetic_type;
-    auto subwarp_tile =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
-    const auto subid = subwarp_tile.thread_rank();
-    const IndexType column_id = blockIdx.y;
-    const arithmetic_type value = val[0];
-    auto row = thread::get_subwarp_id_flat<subwarp_size>();
-    for (; row < num_rows; row += subrow) {
-        const auto ind_end = row_ptrs[row + 1];
-        arithmetic_type temp_val = zero<arithmetic_type>();
-        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
-             ind += subwarp_size) {
-            temp_val += value * b(col_idxs[ind], column_id);
-        }
-        auto subwarp_result =
-            reduce(subwarp_tile, temp_val,
-                   [](const arithmetic_type& a, const arithmetic_type& b) {
-                       return a + b;
-                   });
-        if (subid == 0) {
-            c(row, column_id) = scale(subwarp_result, c(row, column_id));
-        }
-    }
-}
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const MatrixValueType* __restrict__ val,
-    const IndexType* __restrict__ col_idxs,
-    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
-    acc::range<output_accessor> c)
-{
-    using type = typename output_accessor::arithmetic_type;
-    device_classical_spmv<subwarp_size>(
-        num_rows, val, col_idxs, row_ptrs, b, c,
-        [](const type& x, const type& y) { return x; });
-}
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
-    const MatrixValueType* __restrict__ val,
-    const IndexType* __restrict__ col_idxs,
-    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
-    const typename output_accessor::storage_type* __restrict__ beta,
-    acc::range<output_accessor> c)
-{
-    using type = typename output_accessor::arithmetic_type;
-    const type alpha_val = alpha[0];
-    const type beta_val = beta[0];
-    device_classical_spmv<subwarp_size>(
-        num_rows, val, col_idxs, row_ptrs, b, c,
-        [&alpha_val, &beta_val](const type& x, const type& y) {
-            return alpha_val * x + beta_val * y;
-        });
-}
-
-
-}  // namespace kernel
-
-
-template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::SparsityCsr<ValueType, IndexType>* orig,
-               matrix::SparsityCsr<ValueType, IndexType>* trans)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
-                   matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-{
-    const auto row_ptrs = to_sort->get_const_row_ptrs();
-    const auto col_idxs = to_sort->get_col_idxs();
-    const auto nnz = to_sort->get_num_nonzeros();
-    const auto num_rows = to_sort->get_size()[0];
-    array<IndexType> row_idx_array(exec, nnz);
-    const auto row_idxs = row_idx_array.get_data();
-    components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs);
-    // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort
-    thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz,
-                        row_idxs);
-    thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
-                               col_idxs);
-}
diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.cpp
similarity index 77%
rename from common/cuda_hip/multigrid/pgm_kernels.hpp.inc
rename to common/cuda_hip/multigrid/pgm_kernels.cpp
index 9b2a5735c71..a2c5d608a50 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/multigrid/pgm_kernels.hpp"
+
+#include <memory>
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The PGM multigrid namespace.
+ *
+ * @ingroup pgm
+ */
+namespace pgm {
+
+
 template <typename IndexType>
 void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
               IndexType* row_idxs, IndexType* col_idxs)
@@ -52,3 +80,9 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
+
+
+}  // namespace pgm
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.cpp
similarity index 95%
rename from common/cuda_hip/preconditioner/isai_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/isai_kernels.cpp
index 86d47680e0e..eda1f9a0661 100644
--- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/isai_kernels.cpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/preconditioner/isai_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Isai preconditioner namespace.
+ * @ref Isai
+ * @ingroup isai
+ */
+namespace isai {
+
+
+constexpr int subwarp_size{row_size_limit};
+constexpr int subwarps_per_block{2};
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
 namespace kernel {
 
 
@@ -559,3 +595,9 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
similarity index 91%
rename from common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_kernels.cpp
index e0d7cfef0e9..27069d2f693 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+// 1024 threads in total: 16 warps for HIP on HCC, 32 warps otherwise
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
+// Default grid size. Original rationale: at most 32 warps can be scheduled
+// per SM, and GPUs had at most 84 SMs when this was written.
+constexpr int default_grid_size = 32 * 32 * 128;
+
+
 namespace {
 
 
@@ -369,3 +407,9 @@ void convert_to_dense(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/reorder/rcm_kernels.hpp.inc b/common/cuda_hip/reorder/rcm_kernels.cpp
similarity index 95%
rename from common/cuda_hip/reorder/rcm_kernels.hpp.inc
rename to common/cuda_hip/reorder/rcm_kernels.cpp
index 05fe3bce07e..380ef69fac8 100644
--- a/common/cuda_hip/reorder/rcm_kernels.hpp.inc
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/reorder/rcm_kernels.hpp"
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The reordering namespace.
+ *
+ * @ingroup reorder
+ */
+namespace rcm {
+
+// Default thread-block size; presumably used by the kernel launches below.
+constexpr int default_block_size = 512;
+
+
 template <typename IndexType>
 array<IndexType> compute_node_degrees(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -613,3 +653,9 @@ void compute_permutation(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL);
+
+
+}  // namespace rcm
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc b/common/cuda_hip/solver/cb_gmres_kernels.cpp
similarity index 50%
rename from common/cuda_hip/solver/cb_gmres_kernels.hpp.inc
rename to common/cuda_hip/solver/cb_gmres_kernels.cpp
index 2a5a6c3f7f9..59c9812dc65 100644
--- a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -2,6 +2,51 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/cb_gmres_kernels.hpp"
+
+#include <algorithm>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/stopping_status.hpp>
+
+#include "accessor/cuda_hip_helper.hpp"
+#include "accessor/range.hpp"
+#include "accessor/reduced_row_major.hpp"
+#include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/solver/cb_gmres_accessor.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The CB_GMRES solver namespace.
+ *
+ * @ingroup cb_gmres
+ */
+namespace cb_gmres {
+
+
+constexpr int default_block_size = 512;
+// default_dot_dim cannot be 64 with HIP because a 64 * 64 thread block would
+// exceed its maximum block size.
+constexpr int default_dot_dim = 32;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;
+
+
 #include "common/cuda_hip/solver/common_gmres_kernels.hpp.inc"
 
 
@@ -551,3 +596,457 @@ __global__ __launch_bounds__(block_size) void calculate_Qy_kernel(
         before_preconditioner[global_id] = temp;
     }
 }
+
+
+template <typename ValueType>
+void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
+                 size_type n, size_type stride, ValueType* array)
+{
+    // enough blocks of default_block_size threads to cover n
+    const auto grid = ceildiv(n, default_block_size);
+    zero_matrix_kernel<<<grid, default_block_size, 0, exec->get_stream()>>>(
+        m, n, stride, as_device_type(array));
+}
+
+
+template <typename ValueType>
+void initialize(std::shared_ptr<const DefaultExecutor> exec,
+                const matrix::Dense<ValueType>* b,
+                matrix::Dense<ValueType>* residual,
+                matrix::Dense<ValueType>* givens_sin,
+                matrix::Dense<ValueType>* givens_cos,
+                array<stopping_status>* stop_status, size_type krylov_dim)
+{
+    const auto num_rows = b->get_size()[0];
+    const auto num_rhs = b->get_size()[1];
+    // one thread per entry of max(rows * stride of b, krylov_dim * num_rhs)
+    const auto work_items =
+        std::max(num_rows * b->get_stride(), krylov_dim * num_rhs);
+    const auto num_blocks = ceildiv(work_items, default_block_size);
+    initialize_kernel<default_block_size>
+        <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            num_rows, num_rhs, krylov_dim,
+            as_device_type(b->get_const_values()), b->get_stride(),
+            as_device_type(residual->get_values()), residual->get_stride(),
+            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
+            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
+            as_device_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+
+
+// Restart step: launches restart_1_kernel, computes the residual 2-norm via
+// dense::compute_norm2_dispatch, performs the scalar setup needed by scaled
+// accessors, then launches restart_2_kernel. Everything is enqueued on the
+// stream of exec in that order.
+template <typename ValueType, typename Accessor3d>
+void restart(std::shared_ptr<const DefaultExecutor> exec,
+             const matrix::Dense<ValueType>* residual,
+             matrix::Dense<remove_complex<ValueType>>* residual_norm,
+             matrix::Dense<ValueType>* residual_norm_collection,
+             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+             Accessor3d krylov_bases,
+             matrix::Dense<ValueType>* next_krylov_basis,
+             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
+             size_type krylov_dim)
+{
+    // scaled 3D accessors need the inf-norm / scalar setup further down
+    constexpr bool scaled =
+        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
+    constexpr auto threads = default_block_size;
+    const auto rows = residual->get_size()[0];
+    const auto rhs = residual->get_size()[1];
+    const auto bases_stride =
+        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
+            krylov_bases);
+    const auto norm_stride = arnoldi_norm->get_stride();
+
+    // first pass: the grid covers (krylov_dim + 1) * bases_stride[0] entries
+    const auto blocks_1 = ceildiv((krylov_dim + 1) * bases_stride[0], threads);
+    restart_1_kernel<threads><<<blocks_1, threads, 0, exec->get_stream()>>>(
+        rows, rhs, krylov_dim, acc::as_device_range(krylov_bases),
+        as_device_type(residual_norm_collection->get_values()),
+        residual_norm_collection->get_stride());
+    kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
+        exec, residual, residual_norm, reduction_tmp);
+
+    if (scaled) {
+        // row 2 of arnoldi_norm: zeroed, then filled by the inf-norm kernel
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + 2 * norm_stride,
+                               rhs, zero<remove_complex<ValueType>>());
+        const dim3 nrm_grid(ceildiv(rhs, default_dot_dim),
+                            exec->get_num_multiprocessor() * 2);
+        const dim3 nrm_block(default_dot_dim, default_dot_dim);
+        multinorminf_without_stop_kernel<<<nrm_grid, nrm_block, 0,
+                                           exec->get_stream()>>>(
+            rows, rhs, as_device_type(residual->get_const_values()),
+            residual->get_stride(),
+            as_device_type(arnoldi_norm->get_values() + 2 * norm_stride), 0);
+
+        // set_scalar_kernel receives both norms for krylov_dim + 1 vectors
+        const auto blocks_scalar = ceildiv(rhs * (krylov_dim + 1), threads);
+        set_scalar_kernel<threads>
+            <<<blocks_scalar, threads, 0, exec->get_stream()>>>(
+                rhs, krylov_dim + 1,
+                as_device_type(residual_norm->get_const_values()),
+                residual_norm->get_stride(),
+                as_device_type(arnoldi_norm->get_const_values() +
+                               2 * norm_stride),
+                norm_stride, acc::as_device_range(krylov_bases));
+    }
+
+    // second pass; max(rows, 1) avoids a zero factor when rows == 0
+    const auto blocks_2 =
+        ceildiv(std::max<size_type>(rows, 1) * bases_stride[1], threads);
+    restart_2_kernel<threads><<<blocks_2, threads, 0, exec->get_stream()>>>(
+        rows, rhs, as_device_type(residual->get_const_values()),
+        residual->get_stride(),
+        as_device_type(residual_norm->get_const_values()),
+        as_device_type(residual_norm_collection->get_values()),
+        acc::as_device_range(krylov_bases),
+        as_device_type(next_krylov_basis->get_values()),
+        next_krylov_basis->get_stride(),
+        as_device_type(final_iter_nums->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
+
+
+// Classical Gram-Schmidt step of the Arnoldi process: orthogonalizes
+// next_krylov_basis against krylov_bases, re-orthogonalizes at most twice
+// while check_arnoldi_norms reports vectors, then stores the new basis vector.
+template <typename ValueType, typename Accessor3dim>
+void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
+                        matrix::Dense<ValueType>* next_krylov_basis,
+                        Accessor3dim krylov_bases,
+                        matrix::Dense<ValueType>* hessenberg_iter,
+                        matrix::Dense<ValueType>* buffer_iter,
+                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+                        size_type iter, const stopping_status* stop_status,
+                        stopping_status* reorth_status,
+                        array<size_type>* num_reorth)
+{
+    const auto dim_size = next_krylov_basis->get_size();
+    if (dim_size[1] == 0) {
+        return;
+    }
+    using non_complex = remove_complex<ValueType>;
+    // optimization parameter
+    constexpr int singledot_block_size = default_dot_dim;
+    constexpr bool use_scalar =
+        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
+    const auto stride_next_krylov = next_krylov_basis->get_stride();
+    const auto stride_hessenberg = hessenberg_iter->get_stride();
+    const auto stride_buffer = buffer_iter->get_stride();
+    const auto stride_arnoldi = arnoldi_norm->get_stride();
+    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
+                                   exec->get_num_multiprocessor() * 2,
+                                   iter + 1);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
+    // Note: having iter first (instead of row_idx information) is likely
+    //       beneficial for avoiding atomic_add conflicts, but that needs
+    //       further investigation.
+    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
+                                      iter + 1);
+
+    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
+                           zero<non_complex>());
+    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
+        dim_size[0], dim_size[1],
+        as_device_type(next_krylov_basis->get_const_values()),
+        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
+        as_device_type(stop_status));
+    // nrmP = norm(next_krylov_basis)
+    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
+                hessenberg_iter->get_values());
+    if (dim_size[1] > 1) {
+        multidot_kernel<default_dot_dim>
+            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
+                dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, as_device_type(stop_status));
+    } else {
+        singledot_kernel<singledot_block_size>
+            <<<grid_size_iters_single, singledot_block_size, 0,
+               exec->get_stream()>>>(
+                dim_size[0],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, as_device_type(stop_status));
+    }
+    // for i in 1:iter
+    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
+    // end
+    update_next_krylov_kernel<default_block_size>
+        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+           default_block_size, 0, exec->get_stream()>>>(
+            iter + 1, dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
+            acc::as_device_range(krylov_bases),
+            as_device_type(hessenberg_iter->get_const_values()),
+            stride_hessenberg, as_device_type(stop_status));
+
+    // for i in 1:iter
+    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
+    // end
+    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
+                           dim_size[1], zero<non_complex>());
+    if (use_scalar) {
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
+                               dim_size[1], zero<non_complex>());
+    }
+    multinorm2_inf_kernel<use_scalar>
+        <<<grid_size, block_size, 0, exec->get_stream()>>>(
+            dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_const_values()),
+            stride_next_krylov,
+            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
+            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
+            as_device_type(stop_status));
+    // nrmN = norm(next_krylov_basis)
+    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
+    check_arnoldi_norms<default_block_size>
+        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
+           exec->get_stream()>>>(
+            dim_size[1], as_device_type(arnoldi_norm->get_values()),
+            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
+            as_device_type(stop_status), as_device_type(reorth_status),
+            as_device_type(num_reorth->get_data()));
+    size_type num_reorth_host = get_element(*num_reorth, 0);
+    // num_reorth_host := number of next_krylov vectors to be reorthogonalized
+    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
+        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
+                    buffer_iter->get_values());
+        if (dim_size[1] > 1) {
+            multidot_kernel<default_dot_dim>
+                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
+                    dim_size[0], dim_size[1],
+                    as_device_type(next_krylov_basis->get_const_values()),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
+                    as_device_type(buffer_iter->get_values()), stride_buffer,
+                    as_device_type(stop_status));
+        } else {
+            singledot_kernel<singledot_block_size>
+                <<<grid_size_iters_single, singledot_block_size, 0,
+                   exec->get_stream()>>>(
+                    dim_size[0],
+                    as_device_type(next_krylov_basis->get_const_values()),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
+                    as_device_type(buffer_iter->get_values()), stride_buffer,
+                    as_device_type(stop_status));
+        }
+        // for i in 1:iter
+        //     buffer(iter, i) = next_krylov_basis' * krylov_bases(:, i)
+        // end
+        update_next_krylov_and_add_kernel<default_block_size>
+            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+               default_block_size, 0, exec->get_stream()>>>(
+                iter + 1, dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg,
+                as_device_type(buffer_iter->get_const_values()), stride_buffer,
+                as_device_type(stop_status), as_device_type(reorth_status));
+        // presumably (the kernel is not part of this hunk), for i in 1:iter:
+        //     next_krylov_basis -= buffer(iter, i) * krylov_bases(:, i)
+        //     hessenberg(iter, i) += buffer(iter, i)
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + stride_arnoldi,
+                               dim_size[1], zero<non_complex>());
+        if (use_scalar) {
+            components::fill_array(
+                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
+                dim_size[1], zero<non_complex>());
+        }
+        multinorm2_inf_kernel<use_scalar>
+            <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov,
+                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
+                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
+                as_device_type(stop_status));
+        // nrmN = norm(next_krylov_basis)
+        components::fill_array(exec, num_reorth->get_data(), 1,
+                               zero<size_type>());
+        check_arnoldi_norms<default_block_size>
+            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
+               exec->get_stream()>>>(
+                dim_size[1], as_device_type(arnoldi_norm->get_values()),
+                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
+                as_device_type(stop_status), as_device_type(reorth_status),
+                as_device_type(num_reorth->get_data()));
+        num_reorth_host = get_element(*num_reorth, 0);
+        // num_reorth_host := number of vectors still to be reorthogonalized
+    }
+    update_krylov_next_krylov_kernel<default_block_size>
+        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+           default_block_size, 0, exec->get_stream()>>>(
+            iter, dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
+            acc::as_device_range(krylov_bases),
+            as_device_type(hessenberg_iter->get_const_values()),
+            stride_hessenberg, as_device_type(stop_status));
+    // next_krylov_basis /= hessenberg(iter, iter + 1)
+    // krylov_bases(:, iter + 1) = next_krylov_basis
+    // End of arnoldi
+}
+
+// Launches givens_rotation_kernel on the stream of exec with enough threads
+// to cover the columns of hessenberg_iter.
+template <typename ValueType>
+void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
+                     matrix::Dense<ValueType>* givens_sin,
+                     matrix::Dense<ValueType>* givens_cos,
+                     matrix::Dense<ValueType>* hessenberg_iter,
+                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
+                     matrix::Dense<ValueType>* residual_norm_collection,
+                     size_type iter, const array<stopping_status>* stop_status)
+{
+    // TODO: tune the thread count for optimal performance
+    constexpr auto threads = default_block_size;
+    const auto hess_size = hessenberg_iter->get_size();
+    const auto blocks =
+        static_cast<unsigned int>(ceildiv(hess_size[1], threads));
+    givens_rotation_kernel<threads>
+        <<<blocks, threads, 0, exec->get_stream()>>>(
+            hess_size[0], hess_size[1], iter,
+            as_device_type(hessenberg_iter->get_values()),
+            hessenberg_iter->get_stride(),
+            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
+            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
+            as_device_type(residual_norm->get_values()),
+            as_device_type(residual_norm_collection->get_values()),
+            residual_norm_collection->get_stride(),
+            stop_status->get_const_data());
+}
+
+
+template <typename ValueType, typename Accessor3d>
+void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
+             matrix::Dense<ValueType>* next_krylov_basis,
+             matrix::Dense<ValueType>* givens_sin,
+             matrix::Dense<ValueType>* givens_cos,
+             matrix::Dense<remove_complex<ValueType>>* residual_norm,
+             matrix::Dense<ValueType>* residual_norm_collection,
+             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
+             matrix::Dense<ValueType>* buffer_iter,
+             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+             size_type iter, array<size_type>* final_iter_nums,
+             const array<stopping_status>* stop_status,
+             array<stopping_status>* reorth_status,
+             array<size_type>* num_reorth)
+{
+    // advance the iteration counters, then orthogonalize and rotate
+    const auto stop = stop_status->get_const_data();
+    const auto num_counters = final_iter_nums->get_size();
+    increase_final_iteration_numbers_kernel<<<
+        static_cast<unsigned int>(ceildiv(num_counters, default_block_size)),
+        default_block_size, 0, exec->get_stream()>>>(
+        as_device_type(final_iter_nums->get_data()), stop, num_counters);
+    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
+                       buffer_iter, arnoldi_norm, iter, stop,
+                       reorth_status->get_data(), num_reorth);
+    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
+                    residual_norm, residual_norm_collection, iter, stop_status);
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
+
+
+// Launches solve_upper_triangular_kernel on the stream of exec.
+template <typename ValueType>
+void solve_upper_triangular(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Dense<ValueType>* residual_norm_collection,
+    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
+    const array<size_type>* final_iter_nums)
+{
+    // TODO: tune the thread count for optimal performance
+    constexpr auto threads = default_block_size;
+    // the grid covers the columns of residual_norm_collection
+    const auto rhs = residual_norm_collection->get_size()[1];
+    const auto blocks = static_cast<unsigned int>(ceildiv(rhs, threads));
+
+    solve_upper_triangular_kernel<threads>
+        <<<blocks, threads, 0, exec->get_stream()>>>(
+            hessenberg->get_size()[1], rhs,
+            as_device_type(residual_norm_collection->get_const_values()),
+            residual_norm_collection->get_stride(),
+            as_device_type(hessenberg->get_const_values()),
+            hessenberg->get_stride(), as_device_type(y->get_values()),
+            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
+}
+
+
+// Computes before_preconditioner = krylov_bases * y by launching
+// calculate_Qy_kernel on the stream of exec.
+// NOTE: num_krylov_bases is accepted but not used by this launcher.
+template <typename ValueType, typename ConstAccessor3d>
+void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
+                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
+                  const matrix::Dense<ValueType>* y,
+                  matrix::Dense<ValueType>* before_preconditioner,
+                  const array<size_type>* final_iter_nums)
+{
+    constexpr auto threads = default_block_size;
+    const auto rows = before_preconditioner->get_size()[0];
+    const auto cols = before_preconditioner->get_size()[1];
+    const auto out_stride = before_preconditioner->get_stride();
+
+    // the grid spans rows * stride, i.e. the padded output storage
+    const auto blocks = static_cast<unsigned int>(
+        ceildiv(rows * out_stride, threads));
+
+    calculate_Qy_kernel<threads>
+        <<<blocks, threads, 0, exec->get_stream()>>>(
+            rows, cols, acc::as_device_range(krylov_bases),
+            as_device_type(y->get_const_values()), y->get_stride(),
+            as_device_type(before_preconditioner->get_values()),
+            out_stride,
+            as_device_type(final_iter_nums->get_const_data()));
+}
+
+
+template <typename ValueType, typename ConstAccessor3d>
+void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
+                  const matrix::Dense<ValueType>* residual_norm_collection,
+                  ConstAccessor3d krylov_bases,
+                  const matrix::Dense<ValueType>* hessenberg,
+                  matrix::Dense<ValueType>* y,
+                  matrix::Dense<ValueType>* before_preconditioner,
+                  const array<size_type>* final_iter_nums)
+{
+    const auto num_rhs = before_preconditioner->get_size()[1];
+    // nothing to do without columns (this also guards the division below)
+    if (num_rhs == 0) {
+        return;
+    }
+    // hessenberg has dims iters x (iters * num_rhs), while krylov_bases has
+    // dims (iters + 1) x sysmtx[0] x num_rhs
+    const auto num_krylov_bases = hessenberg->get_size()[1] / num_rhs + 1;
+    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
+                           final_iter_nums);
+    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
+                 final_iter_nums);
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
+    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
+
+
+}  // namespace cb_gmres
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/solver/idr_kernels.cu b/common/cuda_hip/solver/idr_kernels.cpp
similarity index 52%
rename from cuda/solver/idr_kernels.cu
rename to common/cuda_hip/solver/idr_kernels.cpp
index 34aac3751d6..63c5f015f68 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -12,20 +12,20 @@
 
 #include "common/cuda_hip/base/blas_bindings.hpp"
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/randlib_bindings.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The IDR solver namespace.
  *
@@ -39,7 +39,320 @@ constexpr int default_dot_dim = 32;
 constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
 
-#include "common/cuda_hip/solver/idr_kernels.hpp.inc"
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_m_kernel(
+    size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values,
+    size_type m_stride, stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / m_stride;
+    const auto col = global_id % m_stride;
+    // the first nrhs threads additionally reset the stopping status entries
+    if (global_id < nrhs) {
+        stop_status[global_id].reset();
+    }
+    // M(row, col) = 1 if col lies in the row-th group of nrhs columns, else 0
+    if (row < subspace_dim && col < nrhs * subspace_dim) {
+        m_values[row * m_stride + col] =
+            (row == col / nrhs) ? one<ValueType>() : zero<ValueType>();
+    }
+}
+
+// Makes the rows of `values` orthonormal in place, one row after the other.
+template <size_type block_size, typename ValueType>
+__global__
+__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel(
+    size_type num_rows, size_type num_cols, ValueType* __restrict__ values,
+    size_type stride)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    // NOTE(review): presumably launched with one block of block_size threads
+    __shared__ uninitialized_array<ValueType, block_size>
+        reduction_helper_array;
+    // Both pointers view the same buffer; they are never used at the same time.
+    ValueType* reduction_helper = reduction_helper_array;
+    auto reduction_helper_real =
+        reinterpret_cast<remove_complex<ValueType>*>(reduction_helper);
+
+    for (size_type row = 0; row < num_rows; row++) {
+        for (size_type i = 0; i < row; i++) {
+            auto dot = zero<ValueType>();
+            for (size_type j = tidx; j < num_cols; j += block_size) {
+                dot += values[row * stride + j] * conj(values[i * stride + j]);
+            }
+
+            // Make sure all threads finished reading the shared buffer
+            __syncthreads();
+            reduction_helper[tidx] = dot;
+            reduce(
+                group::this_thread_block(), reduction_helper,
+                [](const ValueType& a, const ValueType& b) { return a + b; });
+            __syncthreads();
+            // dot is now the reduced sum; subtract dot * (row i) from row
+            dot = reduction_helper[0];
+            for (size_type j = tidx; j < num_cols; j += block_size) {
+                values[row * stride + j] -= dot * values[i * stride + j];
+            }
+        }
+
+        auto norm = zero<remove_complex<ValueType>>();
+        for (size_type j = tidx; j < num_cols; j += block_size) {
+            norm += squared_norm(values[row * stride + j]);
+        }
+        // Make sure all threads finished reading the shared buffer
+        __syncthreads();
+        reduction_helper_real[tidx] = norm;
+        reduce(group::this_thread_block(), reduction_helper_real,
+               [](const remove_complex<ValueType>& a,
+                  const remove_complex<ValueType>& b) { return a + b; });
+        __syncthreads();
+        // divide the row by sqrt(sum of squared_norm); norm == 0 is not handled
+        norm = sqrt(reduction_helper_real[0]);
+        for (size_type j = tidx; j < num_cols; j += block_size) {
+            values[row * stride + j] /= norm;
+        }
+    }
+}
+
+// Solves M * c = f by forward substitution, one thread per right-hand side.
+template <typename ValueType>
+__global__
+__launch_bounds__(default_block_size) void solve_lower_triangular_kernel(
+    size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ m_values, size_type m_stride,
+    const ValueType* __restrict__ f_values, size_type f_stride,
+    ValueType* __restrict__ c_values, size_type c_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+
+    if (global_id >= nrhs) {
+        return;
+    }
+    // right-hand sides that have already stopped keep their entries of c
+    if (!stop_status[global_id].has_stopped()) {
+        for (size_type row = 0; row < subspace_dim; row++) {
+            auto temp = f_values[row * f_stride + global_id];
+            for (size_type col = 0; col < row; col++) {
+                temp -= m_values[row * m_stride + col * nrhs + global_id] *
+                        c_values[col * c_stride + global_id];
+            }
+            c_values[row * c_stride + global_id] =
+                temp / m_values[row * m_stride + row * nrhs + global_id];
+        }
+    }
+}
+
+// Computes v = residual - sum_{j >= k} c_j * g_j, one thread per entry of v.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ residual_values, size_type residual_stride,
+    const ValueType* __restrict__ c_values, size_type c_stride,
+    const ValueType* __restrict__ g_values, size_type g_stride,
+    ValueType* __restrict__ v_values, size_type v_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / nrhs;
+    const auto col = global_id % nrhs;
+
+    if (row >= num_rows) {
+        return;
+    }
+    // columns whose stopping status is set are left untouched
+    if (!stop_status[col].has_stopped()) {
+        auto temp = residual_values[row * residual_stride + col];
+        for (size_type j = k; j < subspace_dim; j++) {
+            temp -= c_values[j * c_stride + col] *
+                    g_values[row * g_stride + j * nrhs + col];
+        }
+        v_values[row * v_stride + col] = temp;
+    }
+}
+
+// Computes u_k = omega * v + sum_{j >= k} c_j * u_j, one thread per entry.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ omega_values,
+    const ValueType* __restrict__ v_values, size_type v_stride,
+    const ValueType* __restrict__ c_values, size_type c_stride,
+    ValueType* __restrict__ u_values, size_type u_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / nrhs;
+    const auto col = global_id % nrhs;
+
+    if (row >= num_rows) {
+        return;
+    }
+    // the j == k term reads the old u_k entry before it is overwritten below
+    if (!stop_status[col].has_stopped()) {
+        auto temp = omega_values[col] * v_values[row * v_stride + col];
+        for (size_type j = k; j < subspace_dim; j++) {
+            temp += c_values[j * c_stride + col] *
+                    u_values[row * u_stride + j * nrhs + col];
+        }
+        u_values[row * u_stride + k * nrhs + col] = temp;
+    }
+}
+
+// Accumulates alpha[rhs] += sum_i p_i[i] * g_k(i, rhs) over this block's rows.
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
+    size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i,
+    const ValueType* __restrict__ g_k, size_type g_k_stride,
+    ValueType* __restrict__ alpha,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = threadIdx.x;
+    const auto tidy = threadIdx.y;
+    const auto rhs = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__
+        uninitialized_array<ValueType, default_dot_dim*(default_dot_dim + 1)>
+            reduction_helper_array;
+    ValueType* __restrict__ reduction_helper = reduction_helper_array;
+    // each thread sums rows start_row + tidy + m * default_dot_dim of its rhs
+    ValueType local_res = zero<ValueType>();
+    if (rhs < nrhs && !stop_status[rhs].has_stopped()) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
+            const auto g_idx = i * g_k_stride + rhs;
+            local_res += p_i[i] * g_k[g_idx];
+        }
+    }
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+    const auto tile_block =
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
+               [](const ValueType& a, const ValueType& b) { return a + b; });
+    const auto new_rhs = blockIdx.x * default_dot_dim + tidy;
+    if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) {
+        atomic_add(alpha + new_rhs, sum);
+    }
+}
+
+// Computes g_k -= fact * g_i and u_k -= fact * u_i with fact = alpha / M(i, i).
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel(
+    size_type k, size_type i, size_type size, size_type nrhs,
+    const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values,
+    size_type m_stride, const ValueType* __restrict__ g_values,
+    size_type g_stride, ValueType* __restrict__ g_k_values,
+    size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / g_k_stride;
+    const auto rhs = tidx % g_k_stride;
+    // threads with rhs in [nrhs, g_k_stride) have no entry to update
+    if (row >= size || rhs >= nrhs) {
+        return;
+    }
+
+    if (!stop_status[rhs].has_stopped()) {
+        const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs];
+        g_k_values[row * g_k_stride + rhs] -=
+            fact * g_values[row * g_stride + i * nrhs + rhs];
+        u_values[row * u_stride + k * nrhs + rhs] -=
+            fact * u_values[row * u_stride + i * nrhs + rhs];
+    }
+}
+
+// Copies g_k into the k-th group of nrhs columns of g for active systems.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_g_kernel(
+    size_type k, size_type size, size_type nrhs,
+    const ValueType* __restrict__ g_k_values, size_type g_k_stride,
+    ValueType* __restrict__ g_values, size_type g_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / g_k_stride;
+    const auto rhs = tidx % g_k_stride;
+    // split tidx by g_k_stride for both indices: one thread per (row, rhs)
+    if (row >= size || rhs >= nrhs) {
+        return;
+    }
+    // columns whose stopping status is set keep their old values in g
+    if (!stop_status[rhs].has_stopped()) {
+        g_values[row * g_stride + k * nrhs + rhs] =
+            g_k_values[row * g_k_stride + rhs];
+    }
+}
+
+// With beta = f_k / M(k, k): r -= beta * g_k, x += beta * u_k; also updates f.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel(
+    size_type k, size_type size, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ m_values, size_type m_stride,
+    const ValueType* __restrict__ g_values, size_type g_stride,
+    const ValueType* __restrict__ u_values, size_type u_stride,
+    ValueType* __restrict__ f_values, size_type f_stride,
+    ValueType* __restrict__ r_values, size_type r_stride,
+    ValueType* __restrict__ x_values, size_type x_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / x_stride;
+    const auto col = global_id % x_stride;
+
+    if (row >= size || col >= nrhs) {
+        return;
+    }
+
+    if (!stop_status[col].has_stopped()) {
+        const auto beta = f_values[k * f_stride + col] /
+                          m_values[k * m_stride + k * nrhs + col];
+        r_values[row * r_stride + col] -=
+            beta * g_values[row * g_stride + k * nrhs + col];
+        x_values[row * x_stride + col] +=
+            beta * u_values[row * u_stride + k * nrhs + col];
+        // rows k < row < subspace_dim also update f: f_row -= beta * M(row, k)
+        if (k < row && k + 1 < subspace_dim && row < subspace_dim) {
+            f_values[row * f_stride + col] -=
+                beta * m_values[row * m_stride + k * nrhs + col];
+        }
+    }
+}
+
+// Sets omega = omega / tht and rescales it by kappa / absrho if absrho < kappa.
+template <typename ValueType>
+__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel(
+    size_type nrhs, const remove_complex<ValueType> kappa,
+    const ValueType* __restrict__ tht,
+    const remove_complex<ValueType>* __restrict__ residual_norm,
+    ValueType* __restrict__ omega,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+
+    if (global_id >= nrhs) {
+        return;
+    }
+    // thr keeps the incoming omega value; omega itself becomes omega / tht
+    if (!stop_status[global_id].has_stopped()) {
+        auto thr = omega[global_id];
+        omega[global_id] /= tht[global_id];
+        auto absrho =
+            abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id]));
+
+        if (absrho < kappa) {
+            omega[global_id] *= kappa / absrho;
+        }
+    }
+}
 
 
 namespace {
@@ -335,6 +648,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/solver/idr_kernels.hpp.inc b/common/cuda_hip/solver/idr_kernels.hpp.inc
deleted file mode 100644
index 465417a6edb..00000000000
--- a/common/cuda_hip/solver/idr_kernels.hpp.inc
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_m_kernel(
-    size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values,
-    size_type m_stride, stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / m_stride;
-    const auto col = global_id % m_stride;
-
-    if (global_id < nrhs) {
-        stop_status[global_id].reset();
-    }
-
-    if (row < subspace_dim && col < nrhs * subspace_dim) {
-        m_values[row * m_stride + col] =
-            (row == col / nrhs) ? one<ValueType>() : zero<ValueType>();
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__
-__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel(
-    size_type num_rows, size_type num_cols, ValueType* __restrict__ values,
-    size_type stride)
-{
-    const auto tidx = thread::get_thread_id_flat();
-
-    __shared__ uninitialized_array<ValueType, block_size>
-        reduction_helper_array;
-    // they are not be used in the same time.
-    ValueType* reduction_helper = reduction_helper_array;
-    auto reduction_helper_real =
-        reinterpret_cast<remove_complex<ValueType>*>(reduction_helper);
-
-    for (size_type row = 0; row < num_rows; row++) {
-        for (size_type i = 0; i < row; i++) {
-            auto dot = zero<ValueType>();
-            for (size_type j = tidx; j < num_cols; j += block_size) {
-                dot += values[row * stride + j] * conj(values[i * stride + j]);
-            }
-
-            // Ensure already finish reading this shared memory
-            __syncthreads();
-            reduction_helper[tidx] = dot;
-            reduce(
-                group::this_thread_block(), reduction_helper,
-                [](const ValueType& a, const ValueType& b) { return a + b; });
-            __syncthreads();
-
-            dot = reduction_helper[0];
-            for (size_type j = tidx; j < num_cols; j += block_size) {
-                values[row * stride + j] -= dot * values[i * stride + j];
-            }
-        }
-
-        auto norm = zero<remove_complex<ValueType>>();
-        for (size_type j = tidx; j < num_cols; j += block_size) {
-            norm += squared_norm(values[row * stride + j]);
-        }
-        // Ensure already finish reading this shared memory
-        __syncthreads();
-        reduction_helper_real[tidx] = norm;
-        reduce(group::this_thread_block(), reduction_helper_real,
-               [](const remove_complex<ValueType>& a,
-                  const remove_complex<ValueType>& b) { return a + b; });
-        __syncthreads();
-
-        norm = sqrt(reduction_helper_real[0]);
-        for (size_type j = tidx; j < num_cols; j += block_size) {
-            values[row * stride + j] /= norm;
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size) void solve_lower_triangular_kernel(
-    size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ m_values, size_type m_stride,
-    const ValueType* __restrict__ f_values, size_type f_stride,
-    ValueType* __restrict__ c_values, size_type c_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-
-    if (global_id >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[global_id].has_stopped()) {
-        for (size_type row = 0; row < subspace_dim; row++) {
-            auto temp = f_values[row * f_stride + global_id];
-            for (size_type col = 0; col < row; col++) {
-                temp -= m_values[row * m_stride + col * nrhs + global_id] *
-                        c_values[col * c_stride + global_id];
-            }
-            c_values[row * c_stride + global_id] =
-                temp / m_values[row * m_stride + row * nrhs + global_id];
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ residual_values, size_type residual_stride,
-    const ValueType* __restrict__ c_values, size_type c_stride,
-    const ValueType* __restrict__ g_values, size_type g_stride,
-    ValueType* __restrict__ v_values, size_type v_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / nrhs;
-    const auto col = global_id % nrhs;
-
-    if (row >= num_rows) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        auto temp = residual_values[row * residual_stride + col];
-        for (size_type j = k; j < subspace_dim; j++) {
-            temp -= c_values[j * c_stride + col] *
-                    g_values[row * g_stride + j * nrhs + col];
-        }
-        v_values[row * v_stride + col] = temp;
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ omega_values,
-    const ValueType* __restrict__ v_values, size_type v_stride,
-    const ValueType* __restrict__ c_values, size_type c_stride,
-    ValueType* __restrict__ u_values, size_type u_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / nrhs;
-    const auto col = global_id % nrhs;
-
-    if (row >= num_rows) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        auto temp = omega_values[col] * v_values[row * v_stride + col];
-        for (size_type j = k; j < subspace_dim; j++) {
-            temp += c_values[j * c_stride + col] *
-                    u_values[row * u_stride + j * nrhs + col];
-        }
-        u_values[row * u_stride + k * nrhs + col] = temp;
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
-    size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i,
-    const ValueType* __restrict__ g_k, size_type g_k_stride,
-    ValueType* __restrict__ alpha,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = threadIdx.x;
-    const auto tidy = threadIdx.y;
-    const auto rhs = blockIdx.x * default_dot_dim + tidx;
-    const auto num = ceildiv(num_rows, gridDim.y);
-    const auto start_row = blockIdx.y * num;
-    const auto end_row =
-        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
-    // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__
-        uninitialized_array<ValueType, default_dot_dim*(default_dot_dim + 1)>
-            reduction_helper_array;
-    ValueType* __restrict__ reduction_helper = reduction_helper_array;
-
-    ValueType local_res = zero<ValueType>();
-    if (rhs < nrhs && !stop_status[rhs].has_stopped()) {
-        for (size_type i = start_row + tidy; i < end_row;
-             i += default_dot_dim) {
-            const auto g_idx = i * g_k_stride + rhs;
-            local_res += p_i[i] * g_k[g_idx];
-        }
-    }
-    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
-    __syncthreads();
-    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
-    const auto tile_block =
-        group::tiled_partition<default_dot_dim>(group::this_thread_block());
-    const auto sum =
-        reduce(tile_block, local_res,
-               [](const ValueType& a, const ValueType& b) { return a + b; });
-    const auto new_rhs = blockIdx.x * default_dot_dim + tidy;
-    if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) {
-        atomic_add(alpha + new_rhs, sum);
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel(
-    size_type k, size_type i, size_type size, size_type nrhs,
-    const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values,
-    size_type m_stride, const ValueType* __restrict__ g_values,
-    size_type g_stride, ValueType* __restrict__ g_k_values,
-    size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    const auto row = tidx / g_k_stride;
-    const auto rhs = tidx % g_k_stride;
-
-    if (row >= size || rhs >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[rhs].has_stopped()) {
-        const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs];
-        g_k_values[row * g_k_stride + rhs] -=
-            fact * g_values[row * g_stride + i * nrhs + rhs];
-        u_values[row * u_stride + k * nrhs + rhs] -=
-            fact * u_values[row * u_stride + i * nrhs + rhs];
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_g_kernel(
-    size_type k, size_type size, size_type nrhs,
-    const ValueType* __restrict__ g_k_values, size_type g_k_stride,
-    ValueType* __restrict__ g_values, size_type g_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    const auto row = tidx / g_k_stride;
-    const auto rhs = tidx % nrhs;
-
-    if (row >= size || rhs >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[rhs].has_stopped()) {
-        g_values[row * g_stride + k * nrhs + rhs] =
-            g_k_values[row * g_k_stride + rhs];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel(
-    size_type k, size_type size, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ m_values, size_type m_stride,
-    const ValueType* __restrict__ g_values, size_type g_stride,
-    const ValueType* __restrict__ u_values, size_type u_stride,
-    ValueType* __restrict__ f_values, size_type f_stride,
-    ValueType* __restrict__ r_values, size_type r_stride,
-    ValueType* __restrict__ x_values, size_type x_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / x_stride;
-    const auto col = global_id % x_stride;
-
-    if (row >= size || col >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        const auto beta = f_values[k * f_stride + col] /
-                          m_values[k * m_stride + k * nrhs + col];
-        r_values[row * r_stride + col] -=
-            beta * g_values[row * g_stride + k * nrhs + col];
-        x_values[row * x_stride + col] +=
-            beta * u_values[row * u_stride + k * nrhs + col];
-
-        if (k < row && k + 1 < subspace_dim && row < subspace_dim) {
-            f_values[row * f_stride + col] -=
-                beta * m_values[row * m_stride + k * nrhs + col];
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel(
-    size_type nrhs, const remove_complex<ValueType> kappa,
-    const ValueType* __restrict__ tht,
-    const remove_complex<ValueType>* __restrict__ residual_norm,
-    ValueType* __restrict__ omega,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-
-    if (global_id >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[global_id].has_stopped()) {
-        auto thr = omega[global_id];
-        omega[global_id] /= tht[global_id];
-        auto absrho =
-            abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id]));
-
-        if (absrho < kappa) {
-            omega[global_id] *= kappa / absrho;
-        }
-    }
-}
diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.cpp
similarity index 89%
rename from common/cuda_hip/solver/multigrid_kernels.hpp.inc
rename to common/cuda_hip/solver/multigrid_kernels.cpp
index 98b1fcfeff4..61b6ee44836 100644
--- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/solver/multigrid_kernels.hpp"
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The MULTIGRID solver namespace.
+ *
+ * @ingroup multigrid
+ */
+namespace multigrid {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -171,3 +199,9 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
+
+
+}  // namespace multigrid
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp
similarity index 75%
rename from common/cuda_hip/stop/batch_criteria.hpp.inc
rename to common/cuda_hip/stop/batch_criteria.hpp
index 38072467765..cecaa6b19d1 100644
--- a/common/cuda_hip/stop/batch_criteria.hpp.inc
+++ b/common/cuda_hip/stop/batch_criteria.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_stop {
+
+
 /**
  * @see reference/stop/batch_criteria.hpp
  */
@@ -49,3 +62,11 @@ class SimpleAbsResidual {
 private:
     const real_type abs_tol_;
 };
+
+
+}  // namespace batch_stop
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
\ No newline at end of file
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 704192d0bff..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu
deleted file mode 100644
index 678c121016c..00000000000
--- a/cuda/base/device_matrix_data_kernels.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/device_matrix_data_kernels.hpp"
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace components {
-
-
-#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
deleted file mode 100644
index 4b1d5ac05c3..00000000000
--- a/cuda/base/kernel_launch.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
-#endif
-
-
-#include <thrust/tuple.h>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-template <typename AccessorType>
-struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_device_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_device_range(range);
-    }
-};
-
-template <typename AccessorType>
-struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_device_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(const gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_device_range(range);
-    }
-};
-
-
-namespace device_std = thrust;
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/base/kernel_launch.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
deleted file mode 100644
index 817d19006bc..00000000000
--- a/cuda/base/kernel_launch_reduction.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
deleted file mode 100644
index 0d9eaeb2653..00000000000
--- a/cuda/base/kernel_launch_solver.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
deleted file mode 100644
index a9d63677267..00000000000
--- a/cuda/components/atomic.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_ATOMIC_CUH_
-#define GKO_CUDA_COMPONENTS_ATOMIC_CUH_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/atomic.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_ATOMIC_CUH_
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
deleted file mode 100644
index 7f19555ace5..00000000000
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
-#define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace csr {
-
-
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc"
-
-
-}  // namespace csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh
deleted file mode 100644
index d35043c34ce..00000000000
--- a/cuda/components/intrinsics.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
-#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/intrinsics.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh
deleted file mode 100644
index 3c7f5e52d47..00000000000
--- a/cuda/components/merging.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_
-#define GKO_CUDA_COMPONENTS_MERGING_CUH_
-
-
-#include "core/base/utils.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/searching.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/merging.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_MERGING_CUH_
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
deleted file mode 100644
index 6693bbfc326..00000000000
--- a/cuda/components/prefix_sum.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
-#define GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/prefix_sum.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu
deleted file mode 100644
index 60b406ff894..00000000000
--- a/cuda/components/prefix_sum_kernels.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/components/prefix_sum_kernels.hpp"
-
-#include <limits>
-
-#include <thrust/scan.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/name_demangling.hpp>
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace components {
-
-
-#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
deleted file mode 100644
index 1e4b7cb447c..00000000000
--- a/cuda/components/reduction.cuh
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_REDUCTION_CUH_
-#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_
-
-
-#include <type_traits>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-constexpr int default_reduce_block_size = 512;
-
-
-#include "common/cuda_hip/components/reduction.hpp.inc"
-
-
-/**
- * Compute a reduction using add operation (+).
- *
- * @param exec  Executor associated to the array
- * @param size  size of the array
- * @param source  the pointer of the array
- *
- * @return the reduction result
- */
-template <typename ValueType>
-__host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
-                                    size_type size, const ValueType* source)
-{
-    auto block_results_val = source;
-    size_type grid_dim = size;
-    auto block_results = array<ValueType>(exec);
-    if (size > default_reduce_block_size) {
-        const auto n = ceildiv(size, default_reduce_block_size);
-        grid_dim =
-            (n <= default_reduce_block_size) ? n : default_reduce_block_size;
-
-        block_results.resize_and_reset(grid_dim);
-
-        reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
-                           exec->get_stream()>>>(
-            size, as_device_type(source),
-            as_device_type(block_results.get_data()));
-
-        block_results_val = block_results.get_const_data();
-    }
-
-    auto d_result = array<ValueType>(exec, 1);
-
-    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
-        grid_dim, as_device_type(block_results_val),
-        as_device_type(d_result.get_data()));
-    auto answer = get_element(d_result, 0);
-    return answer;
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_REDUCTION_CUH_
diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh
deleted file mode 100644
index 5472ac46ed1..00000000000
--- a/cuda/components/searching.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_
-#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "cuda/components/intrinsics.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/searching.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SEARCHING_CUH_
diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh
deleted file mode 100644
index 6ffb8028334..00000000000
--- a/cuda/components/segment_scan.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
-#define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
-
-
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/segment_scan.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh
deleted file mode 100644
index 59e44d1bb82..00000000000
--- a/cuda/components/sorting.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_
-#define GKO_CUDA_COMPONENTS_SORTING_CUH_
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/sorting.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SORTING_CUH_
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
deleted file mode 100644
index 7d519891065..00000000000
--- a/cuda/components/syncfree.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
-#define GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
-
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/components/atomic.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/syncfree.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
deleted file mode 100644
index 1113ea75fc6..00000000000
--- a/cuda/components/thread_ids.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
-#define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
-
-
-#include "common/cuda_hip/base/config.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace thread {
-
-
-#include "common/cuda_hip/components/thread_ids.hpp.inc"
-
-
-}  // namespace thread
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh
deleted file mode 100644
index 8e0042cfdad..00000000000
--- a/cuda/components/warp_blas.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
-#define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
-
-
-#include <cassert>
-#include <type_traits>
-
-#include <ginkgo/config.hpp>
-
-#include "cuda/base/math.hpp"
-#include "cuda/components/reduction.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu
deleted file mode 100644
index 1cb939d40e7..00000000000
--- a/cuda/distributed/matrix_kernels.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/matrix_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace distributed_matrix {
-
-
-#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc"
-
-
-}  // namespace distributed_matrix
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu
deleted file mode 100644
index 738d478d99a..00000000000
--- a/cuda/distributed/partition_helpers_kernels.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_helpers_kernels.hpp"
-
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace partition_helpers {
-
-
-#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
-
-
-}  // namespace partition_helpers
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu
deleted file mode 100644
index 050d6d285d6..00000000000
--- a/cuda/distributed/partition_kernels.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_kernels.hpp"
-
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sort.h>
-
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace partition {
-
-
-#include "common/cuda_hip/distributed/partition_kernels.hpp.inc"
-
-
-}  // namespace partition
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu
deleted file mode 100644
index 60388150da4..00000000000
--- a/cuda/distributed/vector_kernels.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/vector_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scatter.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace distributed_vector {
-
-
-#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"
-
-
-}  // namespace distributed_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu
deleted file mode 100644
index 7d5fe2c3d08..00000000000
--- a/cuda/factorization/cholesky_kernels.cu
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/cholesky_kernels.hpp"
-
-#include <algorithm>
-#include <memory>
-
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/factorization/elimination_forest.hpp"
-#include "core/factorization/lu_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/syncfree.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Cholesky namespace.
- *
- * @ingroup factor
- */
-namespace cholesky {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* mtx,
-                    const factorization::elimination_forest<IndexType>& forest,
-                    IndexType* row_nnz, array<IndexType>& tmp_storage)
-{
-    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
-    if (num_rows == 0) {
-        return;
-    }
-    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
-    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
-    const auto postorder_cols = tmp_storage.get_data();
-    const auto lower_ends = postorder_cols + mtx_nnz;
-    const auto row_ptrs = mtx->get_const_row_ptrs();
-    const auto cols = mtx->get_const_col_idxs();
-    const auto inv_postorder = forest.inv_postorder.get_const_data();
-    const auto postorder_parent = forest.postorder_parents.get_const_data();
-    // transform col indices to postorder indices
-    {
-        const auto num_blocks = ceildiv(num_rows, default_block_size);
-        kernel::build_postorder_cols<<<num_blocks, default_block_size, 0,
-                                       exec->get_stream()>>>(
-            num_rows, cols, row_ptrs, inv_postorder, postorder_cols,
-            lower_ends);
-    }
-    // sort postorder_cols inside rows
-    {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, mtx_nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, mtx_nnz);
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                       row_ptrs, postorder_cols, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                           postorder_cols, permutation, buffer);
-        sparselib::destroy(descr);
-    }
-    // count nonzeros per row of L
-    {
-        const auto num_blocks =
-            ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::symbolic_count<config::warp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
-                postorder_parent, row_nnz);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-
-
-}  // namespace cholesky
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
deleted file mode 100644
index fcabf3676e6..00000000000
--- a/cuda/factorization/factorization_kernels.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/factorization_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The factorization namespace.
- *
- * @ingroup factor
- */
-namespace factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc"
-
-
-}  // namespace factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
deleted file mode 100644
index 57ed7ac8531..00000000000
--- a/cuda/factorization/lu_kernels.cu
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/lu_kernels.hpp"
-
-#include <algorithm>
-#include <memory>
-
-#include <thrust/copy.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/allocator.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/syncfree.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The LU namespace.
- *
- * @ingroup factor
- */
-namespace lu_factorization {
-
-
-constexpr static int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/lu_kernels.hpp.inc"
-
-
-}  // namespace lu_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
deleted file mode 100644
index 473272fe1fb..00000000000
--- a/cuda/factorization/par_ic_kernels.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ic_kernels.hpp"
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ic factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ic_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc"
-
-
-}  // namespace par_ic_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
deleted file mode 100644
index 1f023892afb..00000000000
--- a/cuda/factorization/par_ilu_kernels.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilu_kernels.hpp"
-
-#include <ginkgo/core/matrix/coo.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ilu factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilu_factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc"
-
-
-}  // namespace par_ilu_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
deleted file mode 100644
index 3e53d6ef0a6..00000000000
--- a/cuda/log/batch_logger.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_log {
-
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
deleted file mode 100644
index 4fc5137646c..00000000000
--- a/cuda/matrix/batch_csr_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
deleted file mode 100644
index e28d4f91670..00000000000
--- a/cuda/matrix/batch_dense_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
deleted file mode 100644
index 90caf963200..00000000000
--- a/cuda/matrix/batch_ell_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
deleted file mode 100644
index 1536e88345e..00000000000
--- a/cuda/matrix/coo_kernels.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/coo_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/segment_scan.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Coordinate matrix format namespace.
- *
- * @ingroup coo
- */
-namespace coo {
-
-
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-
-
-#include "common/cuda_hip/matrix/coo_kernels.hpp.inc"
-
-
-}  // namespace coo
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
deleted file mode 100644
index b2114f936e7..00000000000
--- a/cuda/matrix/dense_kernels.cu
+++ /dev/null
@@ -1,230 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/dense_kernels.hpp"
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/fbcsr.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/utils.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup dense
- */
-namespace dense {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                          const matrix::Dense<ValueType>* x,
-                          const matrix::Dense<ValueType>* y,
-                          matrix::Dense<ValueType>* result, array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::dot(handle, x->get_size()[0], x->get_const_values(),
-                      x->get_stride(), y->get_const_values(), y->get_stride(),
-                      result->get_values());
-        } else {
-            compute_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                               const matrix::Dense<ValueType>* x,
-                               const matrix::Dense<ValueType>* y,
-                               matrix::Dense<ValueType>* result,
-                               array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), y->get_const_values(),
-                           y->get_stride(), result->get_values());
-        } else {
-            compute_conj_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_conj_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                            const matrix::Dense<ValueType>* x,
-                            matrix::Dense<remove_complex<ValueType>>* result,
-                            array<char>& tmp)
-{
-    if (x->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), result->get_values());
-        } else {
-            compute_norm2(exec, x, result, tmp);
-        }
-    } else {
-        compute_norm2(exec, x, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* a,
-                  const matrix::Dense<ValueType>* b,
-                  matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::pointer_mode_guard pm_guard(handle);
-                auto alpha = one<ValueType>();
-                auto beta = zero<ValueType>();
-                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
-                           c->get_size()[0], a->get_size()[1], &alpha,
-                           b->get_const_values(), b->get_stride(),
-                           a->get_const_values(), a->get_stride(), &beta,
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::fill(exec, c, zero<ValueType>());
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void apply(std::shared_ptr<const DefaultExecutor> exec,
-           const matrix::Dense<ValueType>* alpha,
-           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
-           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
-                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                           alpha->get_const_values(), b->get_const_values(),
-                           b->get_stride(), a->get_const_values(),
-                           a->get_stride(), beta->get_const_values(),
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::scale(exec, beta, c);
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Dense<ValueType>* orig,
-               matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-};
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-
-
-}  // namespace dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu
deleted file mode 100644
index 78c0babe3a0..00000000000
--- a/cuda/matrix/diagonal_kernels.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/diagonal_kernels.hpp"
-
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Diagonal matrix format namespace.
- *
- * @ingroup diagonal
- */
-namespace diagonal {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc"
-
-
-}  // namespace diagonal
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu
deleted file mode 100644
index 120a81c247c..00000000000
--- a/cuda/matrix/fbcsr_kernels.template.cu
+++ /dev/null
@@ -1,299 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/fbcsr_kernels.hpp"
-
-#include <algorithm>
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/block_sizes.hpp"
-#include "core/base/device_matrix_data_kernels.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/cusparse_block_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-/**
- * @brief The fixed-size block compressed sparse row matrix format namespace.
- *
- * @ingroup fbcsr
- */
-namespace fbcsr {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
-                     const size_type nrows, const size_type ncols,
-                     const size_type orig_stride, const ValueType* const orig,
-                     const size_type trans_stride, ValueType* const trans)
-{
-    if (nrows == 0) {
-        return;
-    }
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
-                       orig_stride, &beta, trans, trans_stride, trans,
-                       trans_stride);
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: fill output with zero
-        dense::fill(exec, c, zero<ValueType>());
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto alpha = one<ValueType>();
-        const auto beta = zero<ValueType>();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
-                             bs, b->get_const_values(), &beta, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             &alpha, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, &beta,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: scale output
-        dense::scale(exec, beta, c);
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        const auto alphp = alpha->get_const_values();
-        const auto betap = beta->get_const_values();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), betap, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
-                            trans_stride, trans_c.get_data());
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, betap,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-namespace {
-
-
-template <int mat_blk_sz, typename ValueType, typename IndexType>
-void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
-                           std::shared_ptr<const DefaultExecutor> exec,
-                           matrix::Fbcsr<ValueType, IndexType>* const mat)
-{
-    constexpr int subwarp_size = config::warp_size;
-    const auto nbnz = mat->get_num_stored_blocks();
-    const auto numthreads = nbnz * subwarp_size;
-    const auto block_size = default_block_size;
-    const auto grid_dim = ceildiv(numthreads, block_size);
-    if (grid_dim > 0) {
-        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
-            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
-                nbnz, mat->get_values());
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
-                                    transpose_blocks_impl);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void transpose(const std::shared_ptr<const CudaExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
-{
-#ifdef GKO_COMPILING_CUDA
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const int bs = orig->get_block_size();
-        const IndexType nnzb =
-            static_cast<IndexType>(orig->get_num_stored_blocks());
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::bsr_transpose(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
-            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
-            copyValues, idxBase, buffer);
-
-        // transpose blocks
-        select_transpose_blocks(
-            fixedblock::compiled_kernels(),
-            [bs](int compiled_block_size) { return bs == compiled_block_size; },
-            syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else
-#endif
-    {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* orig,
-                    matrix::Fbcsr<ValueType, IndexType>* trans)
-{
-    const int grid_size =
-        ceildiv(trans->get_num_stored_elements(), default_block_size);
-    transpose(exec, orig, trans);
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::
-            conjugate<<<grid_size, default_block_size, 0, exec->get_stream()>>>(
-                trans->get_num_stored_elements(),
-                as_device_type(trans->get_values()));
-    }
-}
-
-
-}  // namespace fbcsr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
deleted file mode 100644
index 07f5d5d8ec0..00000000000
--- a/cuda/matrix/sellp_kernels.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sellp_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The SELL-P matrix format namespace.
- *
- * @ingroup sellp
- */
-namespace sellp {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc"
-
-
-}  // namespace sellp
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
deleted file mode 100644
index 17a1e004935..00000000000
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ /dev/null
@@ -1,223 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sparsity_csr_kernels.hpp"
-
-#include <thrust/sort.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Compressed sparse row matrix format namespace.
- *
- * @ingroup sparsity
- */
-namespace sparsity_csr {
-
-
-constexpr int classical_oversubscription = 32;
-constexpr int default_block_size = 512;
-#ifdef GKO_COMPILING_HIP
-constexpr int spmv_block_size = 256;
-#else
-constexpr int spmv_block_size = 128;
-#endif
-constexpr int warps_in_block = 4;
-
-
-using classical_kernels = syn::value_list<int, 2>;
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
-
-
-namespace host_kernel {
-
-
-template <int subwarp_size, typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const CudaExecutor> exec,
-                    const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-                    const matrix::Dense<InputValueType>* b,
-                    matrix::Dense<OutputValueType>* c,
-                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                    const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    using input_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
-    using output_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>;
-
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() *
-                        classical_oversubscription;
-    const auto gridx =
-        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
-                 int64(nwarps / warps_in_block));
-    const dim3 grid(gridx, b->get_size()[1]);
-    const auto block = spmv_block_size;
-
-    const auto b_vals = gko::acc::range<input_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-    auto c_vals = gko::acc::range<output_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(c->get_size()[0]),
-             static_cast<acc::size_type>(c->get_size()[1])}},
-        c->get_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(c->get_stride())}});
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (alpha == nullptr && beta == nullptr) {
-        kernel::abstract_classical_spmv<subwarp_size>
-            <<<grid, block, 0, exec->get_stream()>>>(
-                a->get_size()[0], as_device_type(a->get_const_value()),
-                a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_ptrs()),
-                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-    } else if (alpha != nullptr && beta != nullptr) {
-        kernel::abstract_classical_spmv<subwarp_size>
-            <<<grid, block, 0, exec->get_stream()>>>(
-                a->get_size()[0], as_device_type(alpha->get_const_values()),
-                as_device_type(a->get_const_value()), a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_ptrs()),
-                acc::as_device_range(b_vals),
-                as_device_type(beta->get_const_values()),
-                acc::as_device_range(c_vals));
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
-
-
-}  // namespace host_kernel
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    host_kernel::select_classical_spmv(
-        classical_kernels(), [](int compiled_info) { return true; },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    host_kernel::select_classical_spmv(
-        classical_kernels(), [](int compiled_info) { return true; },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-{
-    const auto nnz = static_cast<IndexType>(to_sort->get_num_nonzeros());
-    const auto num_rows = static_cast<IndexType>(to_sort->get_size()[0]);
-    const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
-    const auto row_ptrs = to_sort->get_const_row_ptrs();
-    const auto col_idxs = to_sort->get_col_idxs();
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation,
-                                   to_sort->get_num_nonzeros());
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
-                                       row_ptrs, col_idxs, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
-                           col_idxs, permutation, buffer);
-        sparselib::destroy(descr);
-    } else {
-        fallback_sort(exec, to_sort);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void is_sorted_by_column_index(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::SparsityCsr<ValueType, IndexType>* to_check, bool* is_sorted)
-{
-    *is_sorted = true;
-    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
-    auto gpu_array = array<bool>{exec, cpu_array};
-    const auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
-    auto num_blocks = ceildiv(num_rows, default_block_size);
-    if (num_blocks > 0) {
-        kernel::check_unsorted<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(
-            to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
-            num_rows, gpu_array.get_data());
-    }
-    cpu_array = gpu_array;
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
-
-
-}  // namespace sparsity_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu
deleted file mode 100644
index 399d8a06c1b..00000000000
--- a/cuda/multigrid/pgm_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/multigrid/pgm_kernels.hpp"
-
-#include <memory>
-
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The PGM solver namespace.
- *
- * @ingroup pgm
- */
-namespace pgm {
-
-
-#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc"
-
-
-}  // namespace pgm
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
deleted file mode 100644
index 8867bf643b0..00000000000
--- a/cuda/preconditioner/isai_kernels.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/isai_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/warp_blas.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Isai preconditioner namespace.
- * @ref Isai
- * @ingroup isai
- */
-namespace isai {
-
-
-constexpr int subwarp_size{row_size_limit};
-constexpr int subwarps_per_block{2};
-constexpr int default_block_size{subwarps_per_block * subwarp_size};
-
-
-#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
-
-
-}  // namespace isai
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
deleted file mode 100644
index 783de652733..00000000000
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-// a total of 32/16 warps (1024 threads)
-#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
-constexpr int default_num_warps = 16;
-#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
-// with current architectures, at most 32 warps can be scheduled per SM (and
-// current GPUs have at most 84 SMs)
-constexpr int default_grid_size = 32 * 32 * 128;
-
-
-#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu
deleted file mode 100644
index 8308cf88e60..00000000000
--- a/cuda/reorder/rcm_kernels.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/reorder/rcm_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/permutation_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/permutation.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-#include "common/cuda_hip/components/memory.hpp"
-#include "core/base/array_access.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The reordering namespace.
- *
- * @ingroup reorder
- */
-namespace rcm {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc"
-
-
-}  // namespace rcm
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
deleted file mode 100644
index 8b1a28d5581..00000000000
--- a/cuda/solver/cb_gmres_kernels.cu
+++ /dev/null
@@ -1,504 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/cb_gmres_kernels.hpp"
-
-#include <algorithm>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "accessor/range.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "accessor/scaled_reduced_row_major.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/solver/cb_gmres_accessor.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The CB_GMRES solver namespace.
- *
- * @ingroup cb_gmres
- */
-namespace cb_gmres {
-
-
-constexpr int default_block_size = 512;
-// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block
-// size limit.
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
-                 size_type n, size_type stride, ValueType* array)
-{
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(n, block_size);
-    zero_matrix_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        m, n, stride, as_device_type(array));
-}
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const matrix::Dense<ValueType>* b,
-                matrix::Dense<ValueType>* residual,
-                matrix::Dense<ValueType>* givens_sin,
-                matrix::Dense<ValueType>* givens_cos,
-                array<stopping_status>* stop_status, size_type krylov_dim)
-{
-    const auto num_threads = std::max(b->get_size()[0] * b->get_stride(),
-                                      krylov_dim * b->get_size()[1]);
-    const auto grid_dim = ceildiv(num_threads, default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
-
-
-template <typename ValueType, typename Accessor3d>
-void restart(std::shared_ptr<const DefaultExecutor> exec,
-             const matrix::Dense<ValueType>* residual,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             Accessor3d krylov_bases,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
-             size_type krylov_dim)
-{
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
-    const auto num_rows = residual->get_size()[0];
-    const auto num_rhs = residual->get_size()[1];
-    const auto krylov_stride =
-        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
-            krylov_bases);
-    const auto grid_dim_1 =
-        ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_device_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
-    kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm,
-                                                 reduction_tmp);
-
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               num_rhs, zero<remove_complex<ValueType>>());
-        const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim),
-                                 exec->get_num_multiprocessor() * 2);
-        const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
-        multinorminf_without_stop_kernel<<<grid_size_nrm, block_size_nrm, 0,
-                                           exec->get_stream()>>>(
-            num_rows, num_rhs, as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0);
-    }
-
-    if (gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value) {
-        set_scalar_kernel<default_block_size>
-            <<<ceildiv(num_rhs * (krylov_dim + 1), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                num_rhs, krylov_dim + 1,
-                as_device_type(residual_norm->get_const_values()),
-                residual_norm->get_stride(),
-                as_device_type(arnoldi_norm->get_const_values() +
-                               2 * stride_arnoldi),
-                stride_arnoldi, acc::as_device_range(krylov_bases));
-    }
-
-    const auto grid_dim_2 =
-        ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
-                default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_device_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
-
-
-template <typename ValueType, typename Accessor3dim>
-void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
-                        matrix::Dense<ValueType>* next_krylov_basis,
-                        Accessor3dim krylov_bases,
-                        matrix::Dense<ValueType>* hessenberg_iter,
-                        matrix::Dense<ValueType>* buffer_iter,
-                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-                        size_type iter, const stopping_status* stop_status,
-                        stopping_status* reorth_status,
-                        array<size_type>* num_reorth)
-{
-    const auto dim_size = next_krylov_basis->get_size();
-    if (dim_size[1] == 0) {
-        return;
-    }
-    using non_complex = remove_complex<ValueType>;
-    // optimization parameter
-    constexpr int singledot_block_size = default_dot_dim;
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
-    const auto stride_next_krylov = next_krylov_basis->get_stride();
-    const auto stride_hessenberg = hessenberg_iter->get_stride();
-    const auto stride_buffer = buffer_iter->get_stride();
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
-                         exec->get_num_multiprocessor() * 2);
-    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
-                                   exec->get_num_multiprocessor() * 2,
-                                   iter + 1);
-    const dim3 block_size(default_dot_dim, default_dot_dim);
-    // Note: having iter first (instead of row_idx information) is likely
-    //       beneficial for avoiding atomic_add conflicts, but that needs
-    //       further investigation.
-    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
-                                      iter + 1);
-    const auto block_size_iters_single = singledot_block_size;
-    size_type num_reorth_host;
-
-    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
-                           zero<non_complex>());
-    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        dim_size[0], dim_size[1],
-        as_device_type(next_krylov_basis->get_const_values()),
-        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
-        as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis)
-    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
-                hessenberg_iter->get_values());
-    if (dim_size[1] > 1) {
-        multidot_kernel<default_dot_dim>
-            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    } else {
-        singledot_kernel<singledot_block_size>
-            <<<grid_size_iters_single, block_size_iters_single, 0,
-               exec->get_stream()>>>(
-                dim_size[0],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    }
-    // for i in 1:iter
-    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-    // end
-    update_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter + 1, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_device_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-
-    // for i in 1:iter
-    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-    // end
-    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
-                           dim_size[1], zero<non_complex>());
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-    }
-    multinorm2_inf_kernel<use_scalar>
-        <<<grid_size, block_size, 0, exec->get_stream()>>>(
-            dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov,
-            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-            as_device_type(stop_status));
-    // nrmN = norm(next_krylov_basis)
-    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
-    check_arnoldi_norms<default_block_size>
-        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-           exec->get_stream()>>>(
-            dim_size[1], as_device_type(arnoldi_norm->get_values()),
-            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
-            as_device_type(stop_status), as_device_type(reorth_status),
-            as_device_type(num_reorth->get_data()));
-    num_reorth_host = get_element(*num_reorth, 0);
-    // num_reorth_host := number of next_krylov vector to be reorthogonalization
-    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
-        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
-                    buffer_iter->get_values());
-        if (dim_size[1] > 1) {
-            multidot_kernel<default_dot_dim>
-                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                    dim_size[0], dim_size[1],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_device_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        } else {
-            singledot_kernel<singledot_block_size>
-                <<<grid_size_iters_single, block_size_iters_single, 0,
-                   exec->get_stream()>>>(
-                    dim_size[0],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_device_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        }
-        // for i in 1:iter
-        //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-        // end
-        update_next_krylov_and_add_kernel<default_block_size>
-            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                iter + 1, dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg,
-                as_device_type(buffer_iter->get_const_values()), stride_buffer,
-                as_device_type(stop_status), as_device_type(reorth_status));
-        // for i in 1:iter
-        //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-        // end
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-        if (use_scalar) {
-            components::fill_array(
-                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                dim_size[1], zero<non_complex>());
-        }
-        multinorm2_inf_kernel<use_scalar>
-            <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-                as_device_type(stop_status));
-        // nrmN = norm(next_krylov_basis)
-        components::fill_array(exec, num_reorth->get_data(), 1,
-                               zero<size_type>());
-        check_arnoldi_norms<default_block_size>
-            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-               exec->get_stream()>>>(
-                dim_size[1], as_device_type(arnoldi_norm->get_values()),
-                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
-                as_device_type(stop_status), as_device_type(reorth_status),
-                num_reorth->get_data());
-        num_reorth_host = get_element(*num_reorth, 0);
-        // num_reorth_host := number of next_krylov vector to be
-        // reorthogonalization
-    }
-    update_krylov_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_device_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-    // next_krylov_basis /= hessenberg(iter, iter + 1)
-    // krylov_bases(:, iter + 1) = next_krylov_basis
-    // End of arnoldi
-}
-
-template <typename ValueType>
-void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
-                     matrix::Dense<ValueType>* givens_sin,
-                     matrix::Dense<ValueType>* givens_cos,
-                     matrix::Dense<ValueType>* hessenberg_iter,
-                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
-                     matrix::Dense<ValueType>* residual_norm_collection,
-                     size_type iter, const array<stopping_status>* stop_status)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_cols = hessenberg_iter->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType, typename Accessor3d>
-void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             matrix::Dense<ValueType>* givens_sin,
-             matrix::Dense<ValueType>* givens_cos,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
-             matrix::Dense<ValueType>* buffer_iter,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             size_type iter, array<size_type>* final_iter_nums,
-             const array<stopping_status>* stop_status,
-             array<stopping_status>* reorth_status,
-             array<size_type>* num_reorth)
-{
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
-    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
-                       buffer_iter, arnoldi_norm, iter,
-                       stop_status->get_const_data(), reorth_status->get_data(),
-                       num_reorth);
-    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
-                    residual_norm, residual_norm_collection, iter, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
-
-
-template <typename ValueType>
-void solve_upper_triangular(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Dense<ValueType>* residual_norm_collection,
-    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
-    const array<size_type>* final_iter_nums)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_rhs = residual_norm_collection->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
-
-    solve_upper_triangular_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg->get_size()[1], num_rhs,
-            as_device_type(residual_norm_collection->get_const_values()),
-            residual_norm_collection->get_stride(),
-            as_device_type(hessenberg->get_const_values()),
-            hessenberg->get_stride(), as_device_type(y->get_values()),
-            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
-                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
-                  const matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    const auto num_rows = before_preconditioner->get_size()[0];
-    const auto num_cols = before_preconditioner->get_size()[1];
-    const auto stride_before_preconditioner =
-        before_preconditioner->get_stride();
-
-    constexpr auto block_size = default_block_size;
-    const auto grid_dim = static_cast<unsigned int>(
-        ceildiv(num_rows * stride_before_preconditioner, block_size));
-    const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_device_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
-    // Calculate qy
-    // before_preconditioner = krylov_bases * y
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* residual_norm_collection,
-                  ConstAccessor3d krylov_bases,
-                  const matrix::Dense<ValueType>* hessenberg,
-                  matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    if (before_preconditioner->get_size()[1] == 0) {
-        return;
-    }
-    // since hessenberg has dims:  iters x iters * num_rhs
-    // krylov_bases has dims:  (iters + 1) x sysmtx[0] x num_rhs
-    const auto iters =
-        hessenberg->get_size()[1] / before_preconditioner->get_size()[1];
-    const auto num_krylov_bases = iters + 1;
-    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
-                           final_iter_nums);
-    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
-                 final_iter_nums);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
-    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
-
-
-}  // namespace cb_gmres
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
deleted file mode 100644
index 6001d42614d..00000000000
--- a/cuda/solver/multigrid_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/multigrid_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MULTIGRID solver namespace.
- *
- * @ingroup multigrid
- */
-namespace multigrid {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc"
-
-
-}  // namespace multigrid
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh
deleted file mode 100644
index f4f434dda11..00000000000
--- a/cuda/stop/batch_criteria.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 86b16c8975d..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp
deleted file mode 100644
index d63a8e27ed5..00000000000
--- a/hip/base/device_matrix_data_kernels.hip.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/device_matrix_data_kernels.hpp"
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace components {
-
-
-#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
deleted file mode 100644
index ff9f398c0bc..00000000000
--- a/hip/base/kernel_launch.hip.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
-#endif
-
-
-#include <thrust/tuple.h>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-template <typename AccessorType>
-struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_device_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_device_range(range);
-    }
-};
-
-template <typename AccessorType>
-struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_device_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(const gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_device_range(range);
-    }
-};
-
-
-namespace device_std = thrust;
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/base/kernel_launch.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp
deleted file mode 100644
index c32fb592de0..00000000000
--- a/hip/base/kernel_launch_reduction.hip.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
deleted file mode 100644
index eda18f35eab..00000000000
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
deleted file mode 100644
index 64d39a90d78..00000000000
--- a/hip/components/atomic.hip.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
-#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/atomic.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
deleted file mode 100644
index 7a3893fa031..00000000000
--- a/hip/components/diagonal_block_manipulation.hip.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
-#define GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace csr {
-
-
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc"
-
-
-}  // namespace csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp
deleted file mode 100644
index af849d4471a..00000000000
--- a/hip/components/intrinsics.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/intrinsics.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp
deleted file mode 100644
index 3f031947940..00000000000
--- a/hip/components/merging.hip.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
-
-
-#include "core/base/utils.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/merging.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
deleted file mode 100644
index deb78288e6c..00000000000
--- a/hip/components/prefix_sum.hip.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
-#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
-
-
-#include <type_traits>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/prefix_sum.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp
deleted file mode 100644
index 283e8c161a1..00000000000
--- a/hip/components/prefix_sum_kernels.hip.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/components/prefix_sum_kernels.hpp"
-
-#include <limits>
-
-#include <thrust/scan.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/name_demangling.hpp>
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace components {
-
-
-#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
deleted file mode 100644
index bc2594dd96d..00000000000
--- a/hip/components/reduction.hip.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
-#define GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
-
-
-#include <type_traits>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-constexpr int default_reduce_block_size = 512;
-
-
-#include "common/cuda_hip/components/reduction.hpp.inc"
-
-
-/**
- * Compute a reduction using add operation (+).
- *
- * @param exec  Executor associated to the array
- * @param size  size of the array
- * @param source  the pointer of the array
- *
- * @return the reduction result
- */
-template <typename ValueType>
-__host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
-                                    size_type size, const ValueType* source)
-{
-    auto block_results_val = source;
-    size_type grid_dim = size;
-    auto block_results = array<ValueType>(exec);
-    if (size > default_reduce_block_size) {
-        const auto n = ceildiv(size, default_reduce_block_size);
-        grid_dim =
-            (n <= default_reduce_block_size) ? n : default_reduce_block_size;
-
-        block_results.resize_and_reset(grid_dim);
-
-        reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
-                           exec->get_stream()>>>(
-            size, as_device_type(source),
-            as_device_type(block_results.get_data()));
-
-        block_results_val = block_results.get_const_data();
-    }
-
-    auto d_result = array<ValueType>(exec, 1);
-
-    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
-        grid_dim, as_device_type(block_results_val),
-        as_device_type(d_result.get_data()));
-    auto answer = get_element(d_result, 0);
-    return answer;
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp
deleted file mode 100644
index 9222de9e1d6..00000000000
--- a/hip/components/searching.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/searching.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp
deleted file mode 100644
index 93ebb35833a..00000000000
--- a/hip/components/segment_scan.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
-
-
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/segment_scan.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp
deleted file mode 100644
index 4a664aee453..00000000000
--- a/hip/components/sorting.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/sorting.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
deleted file mode 100644
index c174224c9c4..00000000000
--- a/hip/components/syncfree.hip.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
-
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/components/atomic.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/syncfree.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp
deleted file mode 100644
index 6f0bd44ba9c..00000000000
--- a/hip/components/thread_ids.hip.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
-
-
-#include "common/cuda_hip/base/config.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace thread {
-
-
-#include "common/cuda_hip/components/thread_ids.hpp.inc"
-
-
-}  // namespace thread
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp
deleted file mode 100644
index 9164a1914b3..00000000000
--- a/hip/components/warp_blas.hip.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
-
-
-#include <cassert>
-#include <type_traits>
-
-#include <ginkgo/config.hpp>
-
-#include "hip/base/math.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp
deleted file mode 100644
index 535fdaacb44..00000000000
--- a/hip/distributed/matrix_kernels.hip.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/matrix_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace distributed_matrix {
-
-
-#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc"
-
-
-}  // namespace distributed_matrix
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp
deleted file mode 100644
index a2083a55303..00000000000
--- a/hip/distributed/partition_helpers_kernels.hip.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_helpers_kernels.hpp"
-
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace partition_helpers {
-
-
-#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
-
-
-}  // namespace partition_helpers
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp
deleted file mode 100644
index c2c4a8f28ea..00000000000
--- a/hip/distributed/partition_kernels.hip.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_kernels.hpp"
-
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sort.h>
-
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace partition {
-
-
-#include "common/cuda_hip/distributed/partition_kernels.hpp.inc"
-
-
-}  // namespace partition
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp
deleted file mode 100644
index eff7936076d..00000000000
--- a/hip/distributed/vector_kernels.hip.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/vector_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scatter.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace distributed_vector {
-
-
-#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"
-
-
-}  // namespace distributed_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp
deleted file mode 100644
index 1c1ce1d3170..00000000000
--- a/hip/factorization/cholesky_kernels.hip.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/cholesky_kernels.hpp"
-
-#include <algorithm>
-#include <memory>
-
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/factorization/elimination_forest.hpp"
-#include "core/factorization/lu_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/syncfree.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Cholesky namespace.
- *
- * @ingroup factor
- */
-namespace cholesky {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* mtx,
-                    const factorization::elimination_forest<IndexType>& forest,
-                    IndexType* row_nnz, array<IndexType>& tmp_storage)
-{
-    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
-    if (num_rows == 0) {
-        return;
-    }
-    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
-    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
-    const auto postorder_cols = tmp_storage.get_data();
-    const auto lower_ends = postorder_cols + mtx_nnz;
-    const auto row_ptrs = mtx->get_const_row_ptrs();
-    const auto cols = mtx->get_const_col_idxs();
-    const auto inv_postorder = forest.inv_postorder.get_const_data();
-    const auto postorder_parent = forest.postorder_parents.get_const_data();
-    // transform col indices to postorder indices
-    {
-        const auto num_blocks = ceildiv(num_rows, default_block_size);
-        kernel::build_postorder_cols<<<num_blocks, default_block_size, 0,
-                                       exec->get_stream()>>>(
-            num_rows, cols, row_ptrs, inv_postorder, postorder_cols,
-            lower_ends);
-    }
-    // sort postorder_cols inside rows
-    {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, mtx_nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, mtx_nnz);
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                       row_ptrs, postorder_cols, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                           postorder_cols, permutation, buffer);
-        sparselib::destroy(descr);
-    }
-    // count nonzeros per row of L
-    {
-        const auto num_blocks =
-            ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::symbolic_count<config::warp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
-                postorder_parent, row_nnz);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-
-
-}  // namespace cholesky
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
deleted file mode 100644
index d6768e5e9c6..00000000000
--- a/hip/factorization/factorization_kernels.hip.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/factorization_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The factorization namespace.
- *
- * @ingroup factor
- */
-namespace factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc"
-
-
-}  // namespace factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
deleted file mode 100644
index 8e37d1a2445..00000000000
--- a/hip/factorization/lu_kernels.hip.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/lu_kernels.hpp"
-
-#include <algorithm>
-#include <memory>
-
-#include <thrust/copy.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/allocator.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/syncfree.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The LU namespace.
- *
- * @ingroup factor
- */
-namespace lu_factorization {
-
-
-constexpr static int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/lu_kernels.hpp.inc"
-
-
-}  // namespace lu_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
deleted file mode 100644
index f0e0cb0b632..00000000000
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ic_kernels.hpp"
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ic factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ic_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc"
-
-
-}  // namespace par_ic_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
deleted file mode 100644
index b4897a23cf9..00000000000
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilu_kernels.hpp"
-
-#include <ginkgo/core/matrix/coo.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ilu factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilu_factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc"
-
-
-}  // namespace par_ilu_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
deleted file mode 100644
index a2540f2bd9d..00000000000
--- a/hip/log/batch_logger.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_log {
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
deleted file mode 100644
index 4b0e6799834..00000000000
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
deleted file mode 100644
index 328f268251f..00000000000
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
deleted file mode 100644
index 01294ac3d63..00000000000
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
deleted file mode 100644
index fe78b938e3c..00000000000
--- a/hip/matrix/coo_kernels.hip.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/coo_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/segment_scan.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Coordinate matrix format namespace.
- *
- * @ingroup coo
- */
-namespace coo {
-
-
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-
-
-#include "common/cuda_hip/matrix/coo_kernels.hpp.inc"
-
-
-}  // namespace coo
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
deleted file mode 100644
index 82599050719..00000000000
--- a/hip/matrix/dense_kernels.hip.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/dense_kernels.hpp"
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/fbcsr.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/utils.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup dense
- */
-namespace dense {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                          const matrix::Dense<ValueType>* x,
-                          const matrix::Dense<ValueType>* y,
-                          matrix::Dense<ValueType>* result, array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::dot(handle, x->get_size()[0], x->get_const_values(),
-                      x->get_stride(), y->get_const_values(), y->get_stride(),
-                      result->get_values());
-        } else {
-            compute_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                               const matrix::Dense<ValueType>* x,
-                               const matrix::Dense<ValueType>* y,
-                               matrix::Dense<ValueType>* result,
-                               array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), y->get_const_values(),
-                           y->get_stride(), result->get_values());
-        } else {
-            compute_conj_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_conj_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                            const matrix::Dense<ValueType>* x,
-                            matrix::Dense<remove_complex<ValueType>>* result,
-                            array<char>& tmp)
-{
-    if (x->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), result->get_values());
-        } else {
-            compute_norm2(exec, x, result, tmp);
-        }
-    } else {
-        compute_norm2(exec, x, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* a,
-                  const matrix::Dense<ValueType>* b,
-                  matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::pointer_mode_guard pm_guard(handle);
-                auto alpha = one<ValueType>();
-                auto beta = zero<ValueType>();
-                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
-                           c->get_size()[0], a->get_size()[1], &alpha,
-                           b->get_const_values(), b->get_stride(),
-                           a->get_const_values(), a->get_stride(), &beta,
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::fill(exec, c, zero<ValueType>());
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void apply(std::shared_ptr<const DefaultExecutor> exec,
-           const matrix::Dense<ValueType>* alpha,
-           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
-           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
-                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                           alpha->get_const_values(), b->get_const_values(),
-                           b->get_stride(), a->get_const_values(),
-                           a->get_stride(), beta->get_const_values(),
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::scale(exec, beta, c);
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Dense<ValueType>* orig,
-               matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-};
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-
-
-}  // namespace dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp
deleted file mode 100644
index b9585db9b41..00000000000
--- a/hip/matrix/diagonal_kernels.hip.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/diagonal_kernels.hpp"
-
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Diagonal matrix format namespace.
- *
- * @ingroup diagonal
- */
-namespace diagonal {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc"
-
-
-}  // namespace diagonal
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
deleted file mode 100644
index cb8cca32d89..00000000000
--- a/hip/matrix/ell_kernels.hip.cpp
+++ /dev/null
@@ -1,270 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/ell_kernels.hpp"
-
-#include <array>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The ELL matrix format namespace.
- *
- * @ingroup ell
- */
-namespace ell {
-
-
-constexpr int default_block_size = 512;
-
-
-// TODO: num_threads_per_core and ratio are parameters should be tuned
-/**
- * num_threads_per_core is the oversubscribing parameter. There are
- * `num_threads_per_core` threads assigned to each physical core.
- */
-constexpr int num_threads_per_core = 4;
-
-
-/**
- * ratio is the parameter to decide when to use threads to do reduction on each
- * row. (#cols/#rows > ratio)
- */
-constexpr double ratio = 1e-2;
-
-
-/**
- * max_thread_per_worker is the max number of thread per worker. The
- * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker>
- */
-constexpr int max_thread_per_worker = 32;
-
-
-/**
- * A compile-time list of sub-warp sizes for which the spmv kernels should be
- * compiled.
- * 0 is a special case where it uses a sub-warp size of warp_size in
- * combination with atomic_adds.
- */
-using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
-
-
-#include "common/cuda_hip/matrix/ell_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int info, typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void abstract_spmv(syn::value_list<int, info>,
-                   std::shared_ptr<const DefaultExecutor> exec,
-                   int num_worker_per_row,
-                   const matrix::Ell<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   matrix::Dense<OutputValueType>* c,
-                   const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                   const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    using a_accessor =
-        acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
-    using b_accessor =
-        acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
-
-    const auto nrows = a->get_size()[0];
-    const auto stride = a->get_stride();
-    const auto num_stored_elements_per_row =
-        a->get_num_stored_elements_per_row();
-
-    constexpr int num_thread_per_worker =
-        (info == 0) ? max_thread_per_worker : info;
-    constexpr bool atomic = (info == 0);
-    const dim3 block_size(default_block_size / num_thread_per_worker,
-                          num_thread_per_worker, 1);
-    const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
-                         b->get_size()[1], 1);
-
-    const auto a_vals = acc::range<a_accessor>(
-        std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
-            num_stored_elements_per_row * stride)}},
-        a->get_const_values());
-    const auto b_vals = acc::range<b_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-
-    if (alpha == nullptr && beta == nullptr) {
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
-                    a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_device_range(b_vals),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = acc::range<a_accessor>(
-            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    stride, num_stored_elements_per_row,
-                    acc::as_device_range(b_vals),
-                    as_device_type(beta->get_const_values()),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
-
-
-template <typename ValueType, typename IndexType>
-std::array<int, 3> compute_thread_worker_and_atomicity(
-    std::shared_ptr<const HipExecutor> exec,
-    const matrix::Ell<ValueType, IndexType>* a)
-{
-    int num_thread_per_worker = 1;
-    int atomic = 0;
-    int num_worker_per_row = 1;
-
-    const auto nrows = a->get_size()[0];
-    const auto ell_ncols = a->get_num_stored_elements_per_row();
-    // TODO: num_threads_per_core should be tuned for AMD gpu
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() * num_threads_per_core;
-
-    // Use multithreads to perform the reduction on each row when the matrix is
-    // wide.
-    // To make every thread have computation, so pick the value which is the
-    // power of 2 less than max_thread_per_worker and is less than or equal to
-    // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and
-    // allow more than one worker to work on the same row, use atomic add to
-    // handle the worker write the value into the same position. The #worker is
-    // decided according to the number of worker allowed on GPU.
-    if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (num_thread_per_worker < max_thread_per_worker &&
-               (num_thread_per_worker << 1) <= ell_ncols) {
-            num_thread_per_worker <<= 1;
-        }
-        if (num_thread_per_worker == max_thread_per_worker) {
-            num_worker_per_row =
-                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
-            num_worker_per_row = std::max(num_worker_per_row, 1);
-        }
-        if (num_worker_per_row > 1) {
-            atomic = 1;
-        }
-    }
-    return {num_thread_per_worker, atomic, num_worker_per_row};
-}
-
-
-}  // namespace
-
-
-template <typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Ell<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    const auto data = compute_thread_worker_and_atomicity(exec, a);
-    const int num_thread_per_worker = std::get<0>(data);
-    const int atomic = std::get<1>(data);
-    const int num_worker_per_row = std::get<2>(data);
-
-    /**
-     * info is the parameter for selecting the device kernel.
-     * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
-     */
-    const int info = (!atomic) * num_thread_per_worker;
-    if (atomic) {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    select_abstract_spmv(
-        compiled_kernels(),
-        [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), exec, num_worker_per_row, a,
-        b, c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ELL_SPMV_KERNEL);
-
-
-template <typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::Ell<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    const auto data = compute_thread_worker_and_atomicity(exec, a);
-    const int num_thread_per_worker = std::get<0>(data);
-    const int atomic = std::get<1>(data);
-    const int num_worker_per_row = std::get<2>(data);
-
-    /**
-     * info is the parameter for selecting the device kernel.
-     * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
-     */
-    const int info = (!atomic) * num_thread_per_worker;
-    if (atomic) {
-        dense::scale(exec, beta, c);
-    }
-    select_abstract_spmv(
-        compiled_kernels(),
-        [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), exec, num_worker_per_row, a,
-        b, c, alpha, beta);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
-
-
-}  // namespace ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
deleted file mode 100644
index c5d49215042..00000000000
--- a/hip/matrix/fbcsr_kernels.template.hip.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/fbcsr_kernels.hpp"
-
-#include <algorithm>
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/block_sizes.hpp"
-#include "core/base/device_matrix_data_kernels.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/hipsparse_block_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-/**
- * @brief The fixed-size block compressed sparse row matrix format namespace.
- *
- * @ingroup fbcsr
- */
-namespace fbcsr {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void dense_transpose(std::shared_ptr<const HipExecutor> exec,
-                     const size_type nrows, const size_type ncols,
-                     const size_type orig_stride, const ValueType* const orig,
-                     const size_type trans_stride, ValueType* const trans)
-{
-    if (nrows == 0) {
-        return;
-    }
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
-                       orig_stride, &beta, trans, trans_stride, trans,
-                       trans_stride);
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: fill output with zero
-        dense::fill(exec, c, zero<ValueType>());
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto alpha = one<ValueType>();
-        const auto beta = zero<ValueType>();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
-                             bs, b->get_const_values(), &beta, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             &alpha, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, &beta,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: scale output
-        dense::scale(exec, beta, c);
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        const auto alphp = alpha->get_const_values();
-        const auto betap = beta->get_const_values();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), betap, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
-                            trans_stride, trans_c.get_data());
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, betap,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-namespace {
-
-
-template <int mat_blk_sz, typename ValueType, typename IndexType>
-void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
-                           std::shared_ptr<const DefaultExecutor> exec,
-                           matrix::Fbcsr<ValueType, IndexType>* const mat)
-{
-    constexpr int subwarp_size = config::warp_size;
-    const auto nbnz = mat->get_num_stored_blocks();
-    const auto numthreads = nbnz * subwarp_size;
-    const auto block_size = default_block_size;
-    const auto grid_dim = ceildiv(numthreads, block_size);
-    if (grid_dim > 0) {
-        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
-            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
-                nbnz, mat->get_values());
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
-                                    transpose_blocks_impl);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void transpose(const std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
-{
-#ifdef GKO_COMPILING_CUDA
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const int bs = orig->get_block_size();
-        const IndexType nnzb =
-            static_cast<IndexType>(orig->get_num_stored_blocks());
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::bsr_transpose(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
-            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
-            copyValues, idxBase, buffer);
-
-        // transpose blocks
-        select_transpose_blocks(
-            fixedblock::compiled_kernels(),
-            [bs](int compiled_block_size) { return bs == compiled_block_size; },
-            syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else
-#endif
-    {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const HipExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* orig,
-                    matrix::Fbcsr<ValueType, IndexType>* trans)
-{
-    const int grid_size =
-        ceildiv(trans->get_num_stored_elements(), default_block_size);
-    transpose(exec, orig, trans);
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::
-            conjugate<<<grid_size, default_block_size, 0, exec->get_stream()>>>(
-                trans->get_num_stored_elements(),
-                as_device_type(trans->get_values()));
-    }
-}
-
-
-}  // namespace fbcsr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
deleted file mode 100644
index 4caf83fdaa1..00000000000
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sellp_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The SELL-P matrix format namespace.
- *
- * @ingroup sellp
- */
-namespace sellp {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc"
-
-
-}  // namespace sellp
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp
deleted file mode 100644
index da5890315bc..00000000000
--- a/hip/multigrid/pgm_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/multigrid/pgm_kernels.hpp"
-
-#include <memory>
-
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The PGM solver namespace.
- *
- * @ingroup pgm
- */
-namespace pgm {
-
-
-#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc"
-
-
-}  // namespace pgm
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
deleted file mode 100644
index d3c2bd0fb1d..00000000000
--- a/hip/preconditioner/isai_kernels.hip.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/isai_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Isai preconditioner namespace.
- * @ref Isai
- * @ingroup isai
- */
-namespace isai {
-
-
-constexpr int subwarp_size{row_size_limit};
-constexpr int subwarps_per_block{2};
-constexpr int default_block_size{subwarps_per_block * subwarp_size};
-
-
-#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
-
-
-}  // namespace isai
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
deleted file mode 100644
index 122e53f636d..00000000000
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-// a total of 32/16 warps (1024 threads)
-#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
-constexpr int default_num_warps = 16;
-#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
-// with current architectures, at most 32 warps can be scheduled per SM (and
-// current GPUs have at most 84 SMs)
-constexpr int default_grid_size = 32 * 32 * 128;
-
-
-#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp
deleted file mode 100644
index 9ac6e44e173..00000000000
--- a/hip/reorder/rcm_kernels.hip.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/reorder/rcm_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/permutation_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/permutation.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-#include "common/cuda_hip/components/memory.hpp"
-#include "core/base/array_access.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The reordering namespace.
- *
- * @ingroup reorder
- */
-namespace rcm {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc"
-
-
-}  // namespace rcm
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
deleted file mode 100644
index fd046d000b4..00000000000
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ /dev/null
@@ -1,504 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/cb_gmres_kernels.hpp"
-
-#include <algorithm>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "accessor/range.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "accessor/scaled_reduced_row_major.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/solver/cb_gmres_accessor.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The CB_GMRES solver namespace.
- *
- * @ingroup cb_gmres
- */
-namespace cb_gmres {
-
-
-constexpr int default_block_size = 512;
-// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block
-// size limit.
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
-                 size_type n, size_type stride, ValueType* array)
-{
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(n, block_size);
-    zero_matrix_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        m, n, stride, as_device_type(array));
-}
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const matrix::Dense<ValueType>* b,
-                matrix::Dense<ValueType>* residual,
-                matrix::Dense<ValueType>* givens_sin,
-                matrix::Dense<ValueType>* givens_cos,
-                array<stopping_status>* stop_status, size_type krylov_dim)
-{
-    const auto num_threads = std::max(b->get_size()[0] * b->get_stride(),
-                                      krylov_dim * b->get_size()[1]);
-    const auto grid_dim = ceildiv(num_threads, default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
-
-
-template <typename ValueType, typename Accessor3d>
-void restart(std::shared_ptr<const DefaultExecutor> exec,
-             const matrix::Dense<ValueType>* residual,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             Accessor3d krylov_bases,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
-             size_type krylov_dim)
-{
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
-    const auto num_rows = residual->get_size()[0];
-    const auto num_rhs = residual->get_size()[1];
-    const auto krylov_stride =
-        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
-            krylov_bases);
-    const auto grid_dim_1 =
-        ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_device_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
-    kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm,
-                                                reduction_tmp);
-
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               num_rhs, zero<remove_complex<ValueType>>());
-        const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim),
-                                 exec->get_num_multiprocessor() * 2);
-        const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
-        multinorminf_without_stop_kernel<<<grid_size_nrm, block_size_nrm, 0,
-                                           exec->get_stream()>>>(
-            num_rows, num_rhs, as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0);
-    }
-
-    if (gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value) {
-        set_scalar_kernel<default_block_size>
-            <<<ceildiv(num_rhs * (krylov_dim + 1), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                num_rhs, krylov_dim + 1,
-                as_device_type(residual_norm->get_const_values()),
-                residual_norm->get_stride(),
-                as_device_type(arnoldi_norm->get_const_values() +
-                               2 * stride_arnoldi),
-                stride_arnoldi, acc::as_device_range(krylov_bases));
-    }
-
-    const auto grid_dim_2 =
-        ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
-                default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_device_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
-
-
-template <typename ValueType, typename Accessor3dim>
-void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
-                        matrix::Dense<ValueType>* next_krylov_basis,
-                        Accessor3dim krylov_bases,
-                        matrix::Dense<ValueType>* hessenberg_iter,
-                        matrix::Dense<ValueType>* buffer_iter,
-                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-                        size_type iter, const stopping_status* stop_status,
-                        stopping_status* reorth_status,
-                        array<size_type>* num_reorth)
-{
-    const auto dim_size = next_krylov_basis->get_size();
-    if (dim_size[1] == 0) {
-        return;
-    }
-    using non_complex = remove_complex<ValueType>;
-    // optimization parameter
-    constexpr int singledot_block_size = default_dot_dim;
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
-    const auto stride_next_krylov = next_krylov_basis->get_stride();
-    const auto stride_hessenberg = hessenberg_iter->get_stride();
-    const auto stride_buffer = buffer_iter->get_stride();
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
-                         exec->get_num_multiprocessor() * 2);
-    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
-                                   exec->get_num_multiprocessor() * 2,
-                                   iter + 1);
-    const dim3 block_size(default_dot_dim, default_dot_dim);
-    // Note: having iter first (instead of row_idx information) is likely
-    //       beneficial for avoiding atomic_add conflicts, but that needs
-    //       further investigation.
-    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
-                                      iter + 1);
-    const auto block_size_iters_single = singledot_block_size;
-    size_type num_reorth_host;
-
-    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
-                           zero<non_complex>());
-    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        dim_size[0], dim_size[1],
-        as_device_type(next_krylov_basis->get_const_values()),
-        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
-        as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis)
-    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
-                hessenberg_iter->get_values());
-    if (dim_size[1] > 1) {
-        multidot_kernel<default_dot_dim>
-            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    } else {
-        singledot_kernel<singledot_block_size>
-            <<<grid_size_iters_single, block_size_iters_single, 0,
-               exec->get_stream()>>>(
-                dim_size[0],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    }
-    // for i in 1:iter
-    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-    // end
-    update_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter + 1, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_device_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-
-    // for i in 1:iter
-    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-    // end
-    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
-                           dim_size[1], zero<non_complex>());
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-    }
-    multinorm2_inf_kernel<use_scalar>
-        <<<grid_size, block_size, 0, exec->get_stream()>>>(
-            dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov,
-            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-            as_device_type(stop_status));
-    // nrmN = norm(next_krylov_basis)
-    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
-    check_arnoldi_norms<default_block_size>
-        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-           exec->get_stream()>>>(
-            dim_size[1], as_device_type(arnoldi_norm->get_values()),
-            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
-            as_device_type(stop_status), as_device_type(reorth_status),
-            as_device_type(num_reorth->get_data()));
-    num_reorth_host = get_element(*num_reorth, 0);
-    // num_reorth_host := number of next_krylov vector to be reorthogonalization
-    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
-        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
-                    buffer_iter->get_values());
-        if (dim_size[1] > 1) {
-            multidot_kernel<default_dot_dim>
-                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                    dim_size[0], dim_size[1],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_device_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        } else {
-            singledot_kernel<singledot_block_size>
-                <<<grid_size_iters_single, block_size_iters_single, 0,
-                   exec->get_stream()>>>(
-                    dim_size[0],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_device_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        }
-        // for i in 1:iter
-        //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-        // end
-        update_next_krylov_and_add_kernel<default_block_size>
-            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                iter + 1, dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_device_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg,
-                as_device_type(buffer_iter->get_const_values()), stride_buffer,
-                as_device_type(stop_status), as_device_type(reorth_status));
-        // for i in 1:iter
-        //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-        // end
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-        if (use_scalar) {
-            components::fill_array(
-                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                dim_size[1], zero<non_complex>());
-        }
-        multinorm2_inf_kernel<use_scalar>
-            <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-                as_device_type(stop_status));
-        // nrmN = norm(next_krylov_basis)
-        components::fill_array(exec, num_reorth->get_data(), 1,
-                               zero<size_type>());
-        check_arnoldi_norms<default_block_size>
-            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-               exec->get_stream()>>>(
-                dim_size[1], as_device_type(arnoldi_norm->get_values()),
-                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
-                as_device_type(stop_status), as_device_type(reorth_status),
-                num_reorth->get_data());
-        num_reorth_host = get_element(*num_reorth, 0);
-        // num_reorth_host := number of next_krylov vector to be
-        // reorthogonalization
-    }
-    update_krylov_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_device_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-    // next_krylov_basis /= hessenberg(iter, iter + 1)
-    // krylov_bases(:, iter + 1) = next_krylov_basis
-    // End of arnoldi
-}
-
-template <typename ValueType>
-void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
-                     matrix::Dense<ValueType>* givens_sin,
-                     matrix::Dense<ValueType>* givens_cos,
-                     matrix::Dense<ValueType>* hessenberg_iter,
-                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
-                     matrix::Dense<ValueType>* residual_norm_collection,
-                     size_type iter, const array<stopping_status>* stop_status)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_cols = hessenberg_iter->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType, typename Accessor3d>
-void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             matrix::Dense<ValueType>* givens_sin,
-             matrix::Dense<ValueType>* givens_cos,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
-             matrix::Dense<ValueType>* buffer_iter,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             size_type iter, array<size_type>* final_iter_nums,
-             const array<stopping_status>* stop_status,
-             array<stopping_status>* reorth_status,
-             array<size_type>* num_reorth)
-{
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
-    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
-                       buffer_iter, arnoldi_norm, iter,
-                       stop_status->get_const_data(), reorth_status->get_data(),
-                       num_reorth);
-    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
-                    residual_norm, residual_norm_collection, iter, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
-
-
-template <typename ValueType>
-void solve_upper_triangular(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Dense<ValueType>* residual_norm_collection,
-    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
-    const array<size_type>* final_iter_nums)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_rhs = residual_norm_collection->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
-
-    solve_upper_triangular_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg->get_size()[1], num_rhs,
-            as_device_type(residual_norm_collection->get_const_values()),
-            residual_norm_collection->get_stride(),
-            as_device_type(hessenberg->get_const_values()),
-            hessenberg->get_stride(), as_device_type(y->get_values()),
-            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
-                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
-                  const matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    const auto num_rows = before_preconditioner->get_size()[0];
-    const auto num_cols = before_preconditioner->get_size()[1];
-    const auto stride_before_preconditioner =
-        before_preconditioner->get_stride();
-
-    constexpr auto block_size = default_block_size;
-    const auto grid_dim = static_cast<unsigned int>(
-        ceildiv(num_rows * stride_before_preconditioner, block_size));
-    const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_device_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
-    // Calculate qy
-    // before_preconditioner = krylov_bases * y
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* residual_norm_collection,
-                  ConstAccessor3d krylov_bases,
-                  const matrix::Dense<ValueType>* hessenberg,
-                  matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    if (before_preconditioner->get_size()[1] == 0) {
-        return;
-    }
-    // since hessenberg has dims:  iters x iters * num_rhs
-    // krylov_bases has dims:  (iters + 1) x sysmtx[0] x num_rhs
-    const auto iters =
-        hessenberg->get_size()[1] / before_preconditioner->get_size()[1];
-    const auto num_krylov_bases = iters + 1;
-    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
-                           final_iter_nums);
-    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
-                 final_iter_nums);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
-    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
-
-
-}  // namespace cb_gmres
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
deleted file mode 100644
index c516597bd2b..00000000000
--- a/hip/solver/idr_kernels.hip.cpp
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/idr_kernels.hpp"
-
-#include <ctime>
-#include <random>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/randlib_bindings.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The IDR solver namespace.
- *
- * @ingroup idr
- */
-namespace idr {
-
-
-constexpr int default_block_size = 512;
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/idr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void initialize_m(std::shared_ptr<const DefaultExecutor> exec,
-                  const size_type nrhs, matrix::Dense<ValueType>* m,
-                  array<stopping_status>* stop_status)
-{
-    const auto subspace_dim = m->get_size()[0];
-    const auto m_stride = m->get_stride();
-
-    const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size);
-    initialize_m_kernel<<<grid_dim, default_block_size, 0,
-                          exec->get_stream()>>>(
-        subspace_dim, nrhs, as_device_type(m->get_values()), m_stride,
-        as_device_type(stop_status->get_data()));
-}
-
-
-template <typename ValueType>
-void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
-                                 matrix::Dense<ValueType>* subspace_vectors,
-                                 bool deterministic)
-{
-    if (!deterministic) {
-        auto gen = randlib::rand_generator(std::random_device{}(),
-                                           RANDLIB_RNG_PSEUDO_DEFAULT,
-                                           exec->get_stream());
-        randlib::rand_vector(
-            gen,
-            subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
-            0.0, 1.0, subspace_vectors->get_values());
-        randlib::destroy(gen);
-    }
-}
-
-
-template <typename ValueType>
-void orthonormalize_subspace_vectors(
-    std::shared_ptr<const DefaultExecutor> exec,
-    matrix::Dense<ValueType>* subspace_vectors)
-{
-    orthonormalize_subspace_vectors_kernel<default_block_size>
-        <<<1, default_block_size, 0, exec->get_stream()>>>(
-            subspace_vectors->get_size()[0], subspace_vectors->get_size()[1],
-            as_device_type(subspace_vectors->get_values()),
-            subspace_vectors->get_stride());
-}
-
-
-template <typename ValueType>
-void solve_lower_triangular(std::shared_ptr<const DefaultExecutor> exec,
-                            const size_type nrhs,
-                            const matrix::Dense<ValueType>* m,
-                            const matrix::Dense<ValueType>* f,
-                            matrix::Dense<ValueType>* c,
-                            const array<stopping_status>* stop_status)
-{
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(nrhs, default_block_size);
-    solve_lower_triangular_kernel<<<grid_dim, default_block_size, 0,
-                                    exec->get_stream()>>>(
-        subspace_dim, nrhs, as_device_type(m->get_const_values()),
-        m->get_stride(), as_device_type(f->get_const_values()), f->get_stride(),
-        as_device_type(c->get_values()), c->get_stride(),
-        stop_status->get_const_data());
-}
-
-
-template <typename ValueType>
-void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
-                    const size_type nrhs, const size_type k,
-                    const matrix::Dense<ValueType>* p,
-                    const matrix::Dense<ValueType>* m,
-                    matrix::Dense<ValueType>* alpha,
-                    matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* g_k,
-                    matrix::Dense<ValueType>* u,
-                    const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto size = g->get_size()[0];
-    const auto p_stride = p->get_stride();
-
-    const dim3 grid_dim(ceildiv(nrhs, default_dot_dim),
-                        exec->get_num_multiprocessor() * 2);
-    const dim3 block_dim(default_dot_dim, default_dot_dim);
-
-    for (size_type i = 0; i < k; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
-        if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, alpha->get_values(), nrhs,
-                                   zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_values()), g_k->get_stride(),
-                as_device_type(alpha->get_values()),
-                stop_status->get_const_data());
-        } else {
-            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
-                      g_k->get_stride(), alpha->get_values());
-        }
-        update_g_k_and_u_kernel<default_block_size>
-            <<<ceildiv(size * g_k->get_stride(), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                k, i, size, nrhs, as_device_type(alpha->get_const_values()),
-                as_device_type(m->get_const_values()), m->get_stride(),
-                as_device_type(g->get_const_values()), g->get_stride(),
-                as_device_type(g_k->get_values()), g_k->get_stride(),
-                as_device_type(u->get_values()), u->get_stride(),
-                stop_status->get_const_data());
-    }
-    update_g_kernel<default_block_size>
-        <<<ceildiv(size * g_k->get_stride(), default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            k, size, nrhs, as_device_type(g_k->get_const_values()),
-            g_k->get_stride(), as_device_type(g->get_values()), g->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType>
-void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-              const size_type k, const matrix::Dense<ValueType>* p,
-              const matrix::Dense<ValueType>* g_k, matrix::Dense<ValueType>* m,
-              const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto size = g_k->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-    const auto p_stride = p->get_stride();
-    const auto m_stride = m->get_stride();
-
-    const dim3 grid_dim(ceildiv(nrhs, default_dot_dim),
-                        exec->get_num_multiprocessor() * 2);
-    const dim3 block_dim(default_dot_dim, default_dot_dim);
-
-    for (size_type i = k; i < subspace_dim; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
-        auto m_i = m->get_values() + i * m_stride + k * nrhs;
-        if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, m_i, nrhs, zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_const_values()), g_k->get_stride(),
-                as_device_type(m_i), stop_status->get_const_data());
-        } else {
-            blas::dot(exec->get_blas_handle(), size, p_i, 1,
-                      g_k->get_const_values(), g_k->get_stride(), m_i);
-        }
-    }
-}
-
-
-template <typename ValueType>
-void update_x_r_and_f(std::shared_ptr<const DefaultExecutor> exec,
-                      const size_type nrhs, const size_type k,
-                      const matrix::Dense<ValueType>* m,
-                      const matrix::Dense<ValueType>* g,
-                      const matrix::Dense<ValueType>* u,
-                      matrix::Dense<ValueType>* f, matrix::Dense<ValueType>* r,
-                      matrix::Dense<ValueType>* x,
-                      const array<stopping_status>* stop_status)
-{
-    const auto size = x->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size);
-    update_x_r_and_f_kernel<<<grid_dim, default_block_size, 0,
-                              exec->get_stream()>>>(
-        k, size, subspace_dim, nrhs, as_device_type(m->get_const_values()),
-        m->get_stride(), as_device_type(g->get_const_values()), g->get_stride(),
-        as_device_type(u->get_const_values()), u->get_stride(),
-        as_device_type(f->get_values()), f->get_stride(),
-        as_device_type(r->get_values()), r->get_stride(),
-        as_device_type(x->get_values()), x->get_stride(),
-        stop_status->get_const_data());
-    components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
-                           zero<ValueType>());
-}
-
-
-}  // namespace
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const size_type nrhs, matrix::Dense<ValueType>* m,
-                matrix::Dense<ValueType>* subspace_vectors, bool deterministic,
-                array<stopping_status>* stop_status)
-{
-    initialize_m(exec, nrhs, m, stop_status);
-    initialize_subspace_vectors(exec, subspace_vectors, deterministic);
-    orthonormalize_subspace_vectors(exec, subspace_vectors);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
-
-
-template <typename ValueType>
-void step_1(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* m,
-            const matrix::Dense<ValueType>* f,
-            const matrix::Dense<ValueType>* residual,
-            const matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* c,
-            matrix::Dense<ValueType>* v,
-            const array<stopping_status>* stop_status)
-{
-    solve_lower_triangular(exec, nrhs, m, f, c, stop_status);
-
-    const auto num_rows = v->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
-    step_1_kernel<<<grid_dim, default_block_size, 0, exec->get_stream()>>>(
-        k, num_rows, subspace_dim, nrhs,
-        as_device_type(residual->get_const_values()), residual->get_stride(),
-        as_device_type(c->get_const_values()), c->get_stride(),
-        as_device_type(g->get_const_values()), g->get_stride(),
-        as_device_type(v->get_values()), v->get_stride(),
-        stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
-
-
-template <typename ValueType>
-void step_2(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* omega,
-            const matrix::Dense<ValueType>* preconditioned_vector,
-            const matrix::Dense<ValueType>* c, matrix::Dense<ValueType>* u,
-            const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto num_rows = preconditioned_vector->get_size()[0];
-    const auto subspace_dim = u->get_size()[1] / nrhs;
-
-    const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
-    step_2_kernel<<<grid_dim, default_block_size, 0, exec->get_stream()>>>(
-        k, num_rows, subspace_dim, nrhs,
-        as_device_type(omega->get_const_values()),
-        as_device_type(preconditioned_vector->get_const_values()),
-        preconditioned_vector->get_stride(),
-        as_device_type(c->get_const_values()), c->get_stride(),
-        as_device_type(u->get_values()), u->get_stride(),
-        stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
-
-
-template <typename ValueType>
-void step_3(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* p,
-            matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* g_k,
-            matrix::Dense<ValueType>* u, matrix::Dense<ValueType>* m,
-            matrix::Dense<ValueType>* f, matrix::Dense<ValueType>* alpha,
-            matrix::Dense<ValueType>* residual, matrix::Dense<ValueType>* x,
-            const array<stopping_status>* stop_status)
-{
-    update_g_and_u(exec, nrhs, k, p, m, alpha, g, g_k, u, stop_status);
-    update_m(exec, nrhs, k, p, g_k, m, stop_status);
-    update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
-
-
-template <typename ValueType>
-void compute_omega(
-    std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-    const remove_complex<ValueType> kappa, const matrix::Dense<ValueType>* tht,
-    const matrix::Dense<remove_complex<ValueType>>* residual_norm,
-    matrix::Dense<ValueType>* omega, const array<stopping_status>* stop_status)
-{
-    const auto grid_dim = ceildiv(nrhs, config::warp_size);
-    compute_omega_kernel<<<grid_dim, config::warp_size, 0,
-                           exec->get_stream()>>>(
-        nrhs, as_device_type(kappa), as_device_type(tht->get_const_values()),
-        as_device_type(residual_norm->get_const_values()),
-        as_device_type(omega->get_values()), stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
-
-
-}  // namespace idr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
deleted file mode 100644
index 6e19606a78e..00000000000
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/multigrid_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MULTIGRID solver namespace.
- *
- * @ingroup multigrid
- */
-namespace multigrid {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc"
-
-
-}  // namespace multigrid
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp
deleted file mode 100644
index 1f721e36aaf..00000000000
--- a/hip/stop/batch_criteria.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_

From 403fcc3963911067b1fd9df637e0ca1e7dc82b65 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 15:55:45 +0200
Subject: [PATCH 039/448] unify missing files, CMake changes

---
 common/CMakeLists.txt                         |  2 +
 common/cuda_hip/CMakeLists.txt                | 51 +++++++++++++++++++
 .../cuda_hip/base/{math.hpp.inc => math.hpp}  | 18 +++++++
 ..._array.hpp.inc => uninitialized_array.hpp} | 20 ++++++++
 cuda/CMakeLists.txt                           | 29 +----------
 cuda/base/math.hpp                            | 23 ---------
 cuda/components/uninitialized_array.hpp       | 25 ---------
 hip/CMakeLists.txt                            | 29 +----------
 hip/base/math.hip.hpp                         | 23 ---------
 hip/components/uninitialized_array.hip.hpp    | 25 ---------
 10 files changed, 93 insertions(+), 152 deletions(-)
 create mode 100644 common/cuda_hip/CMakeLists.txt
 rename common/cuda_hip/base/{math.hpp.inc => math.hpp} (79%)
 rename common/cuda_hip/components/{uninitialized_array.hpp.inc => uninitialized_array.hpp} (82%)
 delete mode 100644 cuda/base/math.hpp
 delete mode 100644 cuda/components/uninitialized_array.hpp
 delete mode 100644 hip/base/math.hip.hpp
 delete mode 100644 hip/components/uninitialized_array.hip.hpp

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 77bdd7230b9..e84ff9f5660 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,2 +1,4 @@
 add_subdirectory(unified)
+add_subdirectory(cuda_hip)
 set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
new file mode 100644
index 00000000000..2cfbe6e9b0d
--- /dev/null
+++ b/common/cuda_hip/CMakeLists.txt
@@ -0,0 +1,51 @@
+include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
+    base/device_matrix_data_kernels.cpp
+    base/kernel_launch.hpp
+    base/kernel_launch_reduction.hpp
+    base/kernel_launch_solver.hpp
+    components/atomic.hpp
+    components/diagonal_block_manipulation.hpp
+    components/intrinsics.hpp
+    components/merging.hpp
+    components/prefix_sum.hpp
+    components/prefix_sum_kernels.cpp
+    components/reduction.hpp
+    components/searching.hpp
+    components/segment_scan.hpp
+    components/sorting.hpp
+    components/syncfree.hpp
+    components/thread_ids.hpp
+    components/warp_blas.hpp
+    distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
+    distributed/partition_kernels.cpp
+    distributed/vector_kernels.cpp
+    factorization/cholesky_kernels.cpp
+    factorization/factorization_kernels.cpp
+    factorization/lu_kernels.cpp
+    factorization/par_ic_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    log/batch_logger.hpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/diagonal_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/fbcsr_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    multigrid/pgm_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    reorder/rcm_kernels.cpp
+    solver/cb_gmres_kernels.cpp
+    solver/idr_kernels.cpp
+    solver/multigrid_kernels.cpp
+    stop/batch_criteria.hpp
+    )
+list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp
similarity index 79%
rename from common/cuda_hip/base/math.hpp.inc
rename to common/cuda_hip/base/math.hpp
index 430163f3791..44a26cadb53 100644
--- a/common/cuda_hip/base/math.hpp.inc
+++ b/common/cuda_hip/base/math.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
+
+
+#include <thrust/complex.h>
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+
+
 // We need this struct, because otherwise we would call a __host__ function in a
 // __device__ function (even though it is constexpr)
 template <typename T>
@@ -37,3 +49,9 @@ struct truncate_type_impl<thrust::complex<T>> {
 
 
 }  // namespace detail
+
+
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/components/uninitialized_array.hpp.inc b/common/cuda_hip/components/uninitialized_array.hpp
similarity index 82%
rename from common/cuda_hip/components/uninitialized_array.hpp.inc
rename to common/cuda_hip/components/uninitialized_array.hpp
index 932ae8a5caa..215c7f5751a 100644
--- a/common/cuda_hip/components/uninitialized_array.hpp.inc
+++ b/common/cuda_hip/components/uninitialized_array.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * Stores an array with uninitialized contents.
  *
@@ -63,3 +75,11 @@ class uninitialized_array {
 private:
     unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
 };
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 3d251ecfa82..505b222bb8d 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,9 +7,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE)
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
-    base/device_matrix_data_kernels.cu
     base/exception.cpp
     base/executor.cpp
     base/index_set_kernels.cpp
@@ -19,56 +17,31 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    components/prefix_sum_kernels.cu
     distributed/index_map_kernels.cu
-    distributed/matrix_kernels.cu
-    distributed/partition_helpers_kernels.cu
-    distributed/partition_kernels.cu
-    distributed/vector_kernels.cu
-    factorization/cholesky_kernels.cu
-    factorization/factorization_kernels.cu
     factorization/ic_kernels.cu
     factorization/ilu_kernels.cu
-    factorization/lu_kernels.cu
-    factorization/par_ic_kernels.cu
     factorization/par_ict_kernels.cu
-    factorization/par_ilu_kernels.cu
     factorization/par_ilut_approx_filter_kernels.cu
     factorization/par_ilut_filter_kernels.cu
     factorization/par_ilut_select_common.cu
     factorization/par_ilut_select_kernels.cu
     factorization/par_ilut_spgeam_kernels.cu
     factorization/par_ilut_sweep_kernels.cu
-    matrix/batch_csr_kernels.cu
-    matrix/batch_dense_kernels.cu
-    matrix/batch_ell_kernels.cu
-    matrix/coo_kernels.cu
     ${CSR_INSTANTIATE}
-    matrix/dense_kernels.cu
-    matrix/diagonal_kernels.cu
-    matrix/ell_kernels.cu
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
-    matrix/sellp_kernels.cu
-    matrix/sparsity_csr_kernels.cu
-    multigrid/pgm_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
-    preconditioner/isai_kernels.cu
     preconditioner/jacobi_advanced_apply_kernels.cu
     preconditioner/jacobi_generate_kernels.cu
-    preconditioner/jacobi_kernels.cu
     preconditioner/jacobi_simple_apply_kernels.cu
-    reorder/rcm_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
-    solver/cb_gmres_kernels.cu
-    solver/idr_kernels.cu
     solver/lower_trs_kernels.cu
-    solver/multigrid_kernels.cu
     solver/upper_trs_kernels.cu
     stop/criterion_kernels.cu
     stop/residual_norm_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}
+    ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 # override the default language mapping for the common files, set them to CUDA
 foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES)
diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp
deleted file mode 100644
index d9fa5165cf6..00000000000
--- a/cuda/base/math.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_BASE_MATH_HPP_
-#define GKO_CUDA_BASE_MATH_HPP_
-
-
-#include <thrust/complex.h>
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-
-
-#include "common/cuda_hip/base/math.hpp.inc"
-
-
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_BASE_MATH_HPP_
diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp
deleted file mode 100644
index b98c812c16d..00000000000
--- a/cuda/components/uninitialized_array.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
-#define GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index bf2d6a6cf58..19f4dd54b2a 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,9 +5,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANT
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
-    base/device_matrix_data_kernels.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
     base/index_set_kernels.hip.cpp
@@ -17,55 +15,30 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    components/prefix_sum_kernels.hip.cpp
     distributed/index_map_kernels.hip.cpp
-    distributed/matrix_kernels.hip.cpp
-    distributed/partition_helpers_kernels.hip.cpp
-    distributed/partition_kernels.hip.cpp
-    distributed/vector_kernels.hip.cpp
-    factorization/cholesky_kernels.hip.cpp
-    factorization/factorization_kernels.hip.cpp
     factorization/ic_kernels.hip.cpp
     factorization/ilu_kernels.hip.cpp
-    factorization/lu_kernels.hip.cpp
-    factorization/par_ic_kernels.hip.cpp
     factorization/par_ict_kernels.hip.cpp
-    factorization/par_ilu_kernels.hip.cpp
     factorization/par_ilut_approx_filter_kernels.hip.cpp
     factorization/par_ilut_filter_kernels.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
     factorization/par_ilut_select_kernels.hip.cpp
     factorization/par_ilut_spgeam_kernels.hip.cpp
     factorization/par_ilut_sweep_kernels.hip.cpp
-    matrix/batch_csr_kernels.hip.cpp
-    matrix/batch_dense_kernels.hip.cpp
-    matrix/batch_ell_kernels.hip.cpp
-    matrix/coo_kernels.hip.cpp
     ${CSR_INSTANTIATE}
-    matrix/dense_kernels.hip.cpp
-    matrix/diagonal_kernels.hip.cpp
-    matrix/ell_kernels.hip.cpp
     ${FBCSR_INSTANTIATE}
-    matrix/sellp_kernels.hip.cpp
-    matrix/sparsity_csr_kernels.hip.cpp
-    multigrid/pgm_kernels.hip.cpp
     preconditioner/batch_jacobi_kernels.hip.cpp
-    preconditioner/isai_kernels.hip.cpp
     preconditioner/jacobi_advanced_apply_kernels.hip.cpp
     preconditioner/jacobi_generate_kernels.hip.cpp
-    preconditioner/jacobi_kernels.hip.cpp
     preconditioner/jacobi_simple_apply_kernels.hip.cpp
-    reorder/rcm_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
-    solver/cb_gmres_kernels.hip.cpp
-    solver/idr_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
-    solver/multigrid_kernels.hip.cpp
     solver/upper_trs_kernels.hip.cpp
     stop/criterion_kernels.hip.cpp
     stop/residual_norm_kernels.hip.cpp
     ${GKO_UNIFIED_COMMON_SOURCES}
+    ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 
 if(hipfft_FOUND)
diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp
deleted file mode 100644
index 9f577812f3e..00000000000
--- a/hip/base/math.hip.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_BASE_MATH_HIP_HPP_
-#define GKO_HIP_BASE_MATH_HIP_HPP_
-
-
-#include <thrust/complex.h>
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-
-
-#include "common/cuda_hip/base/math.hpp.inc"
-
-
-}  // namespace gko
-
-
-#endif  // GKO_HIP_BASE_MATH_HIP_HPP_
diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp
deleted file mode 100644
index e59d2c21a63..00000000000
--- a/hip/components/uninitialized_array.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
-#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_

From 6144a60850e89a360754b84d0a36e5e8f5f28e00 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 15:56:35 +0200
Subject: [PATCH 040/448] adapt headers

---
 common/unified/base/kernel_launch.hpp          |  6 ++----
 .../unified/base/kernel_launch_reduction.hpp   |  6 ++----
 common/unified/base/kernel_launch_solver.hpp   |  6 ++----
 core/solver/batch_dispatch.hpp                 | 17 +++++++++--------
 cuda/CMakeLists.txt                            |  2 +-
 cuda/base/config.hpp                           |  2 +-
 cuda/base/cublas_bindings.hpp                  |  2 +-
 cuda/base/curand_bindings.hpp                  |  2 +-
 cuda/components/format_conversion.cuh          |  2 +-
 cuda/distributed/index_map_kernels.cu          |  4 ++--
 cuda/factorization/par_ict_kernels.cu          | 14 +++++++-------
 .../par_ilut_approx_filter_kernels.cu          | 12 ++++++------
 cuda/factorization/par_ilut_filter_kernels.cu  |  6 +++---
 cuda/factorization/par_ilut_select_common.cu   | 14 +++++++-------
 cuda/factorization/par_ilut_select_kernels.cu  | 14 +++++++-------
 cuda/factorization/par_ilut_spgeam_kernels.cu  | 12 ++++++------
 cuda/factorization/par_ilut_sweep_kernels.cu   | 14 +++++++-------
 cuda/matrix/csr_kernels.template.cu            | 18 +++++++++---------
 cuda/preconditioner/batch_jacobi_kernels.cu    |  4 ++--
 cuda/preconditioner/batch_preconditioners.cuh  |  2 +-
 ...acobi_advanced_apply_kernels.instantiate.cu |  6 +++---
 .../jacobi_generate_kernels.instantiate.cu     | 10 +++++-----
 .../jacobi_simple_apply_kernels.instantiate.cu |  6 +++---
 cuda/solver/batch_bicgstab_kernels.cu          |  6 +++---
 cuda/solver/batch_cg_kernels.cu                |  6 +++---
 cuda/solver/common_trs_kernels.cuh             |  8 ++++----
 cuda/solver/lower_trs_kernels.cu               |  2 +-
 cuda/solver/upper_trs_kernels.cu               |  2 +-
 cuda/stop/criterion_kernels.cu                 |  4 ++--
 cuda/stop/residual_norm_kernels.cu             |  4 ++--
 cuda/test/base/math.cu                         |  2 +-
 hip/base/config.hip.hpp                        |  2 +-
 hip/base/hipblas_bindings.hip.hpp              |  2 +-
 hip/base/hiprand_bindings.hip.hpp              |  2 +-
 hip/components/format_conversion.hip.hpp       |  2 +-
 hip/distributed/index_map_kernels.hip.cpp      |  4 ++--
 hip/factorization/par_ict_kernels.hip.cpp      | 14 +++++++-------
 .../par_ilut_approx_filter_kernels.hip.cpp     | 12 ++++++------
 .../par_ilut_filter_kernels.hip.cpp            |  6 +++---
 .../par_ilut_select_common.hip.cpp             | 14 +++++++-------
 .../par_ilut_select_kernels.hip.cpp            | 14 +++++++-------
 .../par_ilut_spgeam_kernels.hip.cpp            | 12 ++++++------
 .../par_ilut_sweep_kernels.hip.cpp             | 14 +++++++-------
 hip/matrix/csr_kernels.template.hip.cpp        | 18 +++++++++---------
 .../batch_jacobi_kernels.hip.cpp               | 10 +++++-----
 .../batch_preconditioners.hip.hpp              |  2 +-
 ..._advanced_apply_kernels.instantiate.hip.cpp |  6 +++---
 .../jacobi_generate_kernels.hip.cpp            | 10 +++++-----
 ...jacobi_generate_kernels.instantiate.hip.cpp | 10 +++++-----
 .../jacobi_simple_apply_kernels.hip.cpp        |  6 +++---
 ...bi_simple_apply_kernels.instantiate.hip.cpp |  6 +++---
 hip/solver/batch_bicgstab_kernels.hip.cpp      |  8 ++++----
 hip/solver/batch_cg_kernels.hip.cpp            |  8 ++++----
 hip/solver/common_trs_kernels.hip.hpp          |  2 +-
 hip/solver/lower_trs_kernels.hip.cpp           |  2 +-
 hip/solver/upper_trs_kernels.hip.cpp           |  2 +-
 hip/stop/criterion_kernels.hip.cpp             |  4 ++--
 hip/stop/residual_norm_kernels.hip.cpp         |  4 ++--
 hip/test/base/math.hip.cpp                     |  2 +-
 59 files changed, 204 insertions(+), 209 deletions(-)

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index fad327ae3b1..73d37eb2ac2 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -269,10 +269,8 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko
 
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/kernel_launch.cuh"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch.hip.hpp"
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
+#include "common/cuda_hip/base/kernel_launch.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp
index c3158d35a1c..b7b3e258dd4 100644
--- a/common/unified/base/kernel_launch_reduction.hpp
+++ b/common/unified/base/kernel_launch_reduction.hpp
@@ -19,10 +19,8 @@
     {}
 
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/kernel_launch_reduction.cuh"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch_reduction.hip.hpp"
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
+#include "common/cuda_hip/base/kernel_launch_reduction.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch_reduction.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp
index f4240805c64..14f2cbfeacf 100644
--- a/common/unified/base/kernel_launch_solver.hpp
+++ b/common/unified/base/kernel_launch_solver.hpp
@@ -107,10 +107,8 @@ const device_type<ValueType>* row_vector(const matrix::Dense<ValueType>* mtx)
 }  // namespace gko
 
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/kernel_launch_solver.cuh"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch_solver.hip.hpp"
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
+#include "common/cuda_hip/base/kernel_launch_solver.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch_solver.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 8a142a5224a..ce8c4d86e71 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -21,15 +21,15 @@
 #include "core/matrix/batch_struct.hpp"
 
 
-#if defined GKO_COMPILING_CUDA
+#if defined(GKO_COMPILING_CUDA)
 
 
+#include "common/cuda_hip/log/batch_logger.hpp"
+#include "common/cuda_hip/stop/batch_criteria.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/log/batch_logger.cuh"
 #include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/batch_preconditioners.cuh"
-#include "cuda/stop/batch_criteria.cuh"
 
 
 namespace gko {
@@ -37,11 +37,12 @@ namespace batch {
 namespace solver {
 
 
-namespace device = gko::kernels::cuda;
+namespace device = gko::kernels::GKO_DEVICE_NAMESPACE;
 
 
 template <typename ValueType>
-using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
+using DeviceValueType =
+    typename gko::kernels::GKO_DEVICE_NAMESPACE::device_type<ValueType>;
 
 
 }  // namespace solver
@@ -49,15 +50,15 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 }  // namespace gko
 
 
-#elif defined GKO_COMPILING_HIP
+#elif defined(GKO_COMPILING_HIP)
 
 
+#include "common/cuda_hip/log/batch_logger.hpp"
+#include "common/cuda_hip/stop/batch_criteria.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/log/batch_logger.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/batch_preconditioners.hip.hpp"
-#include "hip/stop/batch_criteria.hip.hpp"
 
 
 namespace gko {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 505b222bb8d..bef62c12a9a 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -44,7 +44,7 @@ target_sources(ginkgo_cuda
     ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 # override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES)
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp
index 1ff249066bd..fe280c76dec 100644
--- a/cuda/base/config.hpp
+++ b/cuda/base/config.hpp
@@ -8,7 +8,7 @@
 
 #include <ginkgo/core/base/types.hpp>
 
-#include "cuda/base/math.hpp"
+#include "common/cuda_hip/base/math.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index bc8da5851d5..ae5e66b6448 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -10,8 +10,8 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index 8d31ac2e90e..eb3dbee6b7b 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -10,8 +10,8 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
index 6690368cc4f..9ece2cdffe4 100644
--- a/cuda/components/format_conversion.cuh
+++ b/cuda/components/format_conversion.cuh
@@ -10,7 +10,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "cuda/components/thread_ids.cuh"
+#include "common/cuda_hip/components/thread_ids.hpp"
 
 
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu
index 42e8f118301..e55a4148e51 100644
--- a/cuda/distributed/index_map_kernels.cu
+++ b/cuda/distributed/index_map_kernels.cu
@@ -19,9 +19,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/searching.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/searching.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
index fb7a0b0370a..62964925aa4 100644
--- a/cuda/factorization/par_ict_kernels.cu
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -10,20 +10,20 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
index 51127ffd43b..93c0ef7fc95 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernels.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu
@@ -11,20 +11,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu
index e15c7ec4cf6..3d6b41f07e6 100644
--- a/cuda/factorization/par_ilut_filter_kernels.cu
+++ b/cuda/factorization/par_ilut_filter_kernels.cu
@@ -9,18 +9,18 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
index 3f910f4884e..e0b81a81a1c 100644
--- a/cuda/factorization/par_ilut_select_common.cu
+++ b/cuda/factorization/par_ilut_select_common.cu
@@ -4,15 +4,15 @@
 
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu
index ac37e3a7595..a2395a16aea 100644
--- a/cuda/factorization/par_ilut_select_kernels.cu
+++ b/cuda/factorization/par_ilut_select_kernels.cu
@@ -8,16 +8,16 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
index 83ec9c974b8..7277093314a 100644
--- a/cuda/factorization/par_ilut_spgeam_kernels.cu
+++ b/cuda/factorization/par_ilut_spgeam_kernels.cu
@@ -8,20 +8,20 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu
index 8bdf6c9380a..9e277549aa4 100644
--- a/cuda/factorization/par_ilut_sweep_kernels.cu
+++ b/cuda/factorization/par_ilut_sweep_kernels.cu
@@ -8,21 +8,21 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index 600f4ffb5a3..151351c9204 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -25,11 +25,20 @@
 
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -40,16 +49,7 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/segment_scan.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 1bc39df9781..e31e17dcafc 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -8,6 +8,8 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -18,8 +20,6 @@
 #include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/thread_ids.cuh"
 #include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index e83d6e04ee9..01001c036b2 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "cuda/components/reduction.cuh"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index e0b9145a0f7..60823cf6f4b 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -5,16 +5,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index c12df449e42..ff36c8efb1b 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -6,18 +6,18 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/diagonal_block_manipulation.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 45af2ec668f..d727c9439f9 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -5,16 +5,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 28efaf07475..8d76f865a20 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -14,14 +14,14 @@
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index cff72652629..2083cd98b5a 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -13,14 +13,14 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index a205f155487..7cedf2fbd2e 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -17,17 +17,17 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/array_access.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 
 
 namespace gko {
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 898ffb92552..b37f6536b0f 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -13,9 +13,9 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index b1f9e43ed2c..eb7d8386083 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -13,9 +13,9 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu
index 20538e87304..fa596f0c03f 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/cuda/stop/criterion_kernels.cu
@@ -8,9 +8,9 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
+#include "common/cuda_hip/components/thread_ids.hpp"
 
 
 namespace gko {
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index d59f937b918..e52a74cf422 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -8,10 +8,10 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/array_access.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index e3c1d78ed39..71532b45e80 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/base/math.hpp"
+#include "common/cuda_hip/base/math.hpp"
 
 #include <cmath>
 #include <complex>
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index e74153cc34e..114eb2f0f0a 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -8,8 +8,8 @@
 
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "hip/base/math.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index 21c44e664b8..4641b64277d 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -16,9 +16,9 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index a76274c45a7..7cd76b9d320 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -15,9 +15,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index d2cbc3062a5..2e6c4eb1236 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -11,7 +11,7 @@
 
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "hip/components/thread_ids.hip.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 
 
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp
index 536b09a1bb1..c722952f85d 100644
--- a/hip/distributed/index_map_kernels.hip.cpp
+++ b/hip/distributed/index_map_kernels.hip.cpp
@@ -19,9 +19,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/searching.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/searching.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
index 99b2f09274b..ed7b104471b 100644
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -10,20 +10,20 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
index b4fdd7e6e6d..31482cd4034 100644
--- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
@@ -11,21 +11,21 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
index 8f91e6f7087..bbe0b197d7c 100644
--- a/hip/factorization/par_ilut_filter_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp
@@ -9,18 +9,18 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index 098ce5c9887..89ceca0a024 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -10,15 +10,15 @@
 
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
index 55180bc3d05..2e75f7de81b 100644
--- a/hip/factorization/par_ilut_select_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernels.hip.cpp
@@ -8,16 +8,16 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
index 200a16ea849..5757e00d2a3 100644
--- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
@@ -8,20 +8,20 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
index b3994706567..de271d6eebd 100644
--- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
@@ -8,21 +8,21 @@
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index acd0b0144bb..f7766b8648b 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -25,11 +25,20 @@
 
 #include "accessor/cuda_hip_helper.hpp"
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -40,16 +49,7 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/segment_scan.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index db6e5a27b58..a112e3beb92 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -8,6 +8,11 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -15,13 +20,8 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index f3969c16b81..f62000ff46f 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "hip/components/reduction.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index 7e6311bcd52..d30f4edd787 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -5,17 +5,17 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 9f2d3238a83..3f6d3a4e91f 100644
--- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -6,19 +6,19 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index 3685df4aa0e..3c18703557d 100644
--- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -6,18 +6,18 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index d922d178f88..563f5829536 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -5,17 +5,17 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index baa847c58a5..7a6e2a46b04 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -5,16 +5,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 44e2f0f3c48..96587f8479e 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 450d02a302c..e12445b2c84 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index ce5cd4192a9..b029e09d400 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -20,13 +20,13 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 322c87d37b3..5eab76ed5fa 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -18,10 +18,10 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index 6be850959cb..fb480d9b22d 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -18,10 +18,10 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
index 8f856f0ed8d..0b8e300f978 100644
--- a/hip/stop/criterion_kernels.hip.cpp
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -8,9 +8,9 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 
 
 namespace gko {
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index eb6c89a2e2e..0a9af423128 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -8,11 +8,11 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/array_access.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 1a882989854..01fb96afa7c 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -8,7 +8,7 @@
 // clang-format on
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
 
 #include <cmath>
 #include <complex>

From e89b595f9cca516dbfa76a0cd6a260b1803f2400 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 16:10:10 +0200
Subject: [PATCH 041/448] fix fbcsr instantiation

---
 common/cuda_hip/CMakeLists.txt                | 18 -------
 .../matrix/fbcsr_kernels.instantiate.cpp      |  6 +--
 ...kernels.cpp => fbcsr_kernels.template.cpp} |  0
 cuda/CMakeLists.txt                           |  4 +-
 hip/CMakeLists.txt                            |  2 +-
 hip/matrix/fbcsr_kernels.instantiate.hip.cpp  | 47 -------------------
 6 files changed, 6 insertions(+), 71 deletions(-)
 rename cuda/matrix/fbcsr_kernels.instantiate.cu => common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp (90%)
 rename common/cuda_hip/matrix/{fbcsr_kernels.cpp => fbcsr_kernels.template.cpp} (100%)
 delete mode 100644 hip/matrix/fbcsr_kernels.instantiate.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 2cfbe6e9b0d..79af0c5fd0d 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -2,22 +2,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
     base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
-    base/kernel_launch.hpp
-    base/kernel_launch_reduction.hpp
-    base/kernel_launch_solver.hpp
-    components/atomic.hpp
-    components/diagonal_block_manipulation.hpp
-    components/intrinsics.hpp
-    components/merging.hpp
-    components/prefix_sum.hpp
     components/prefix_sum_kernels.cpp
-    components/reduction.hpp
-    components/searching.hpp
-    components/segment_scan.hpp
-    components/sorting.hpp
-    components/syncfree.hpp
-    components/thread_ids.hpp
-    components/warp_blas.hpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
@@ -27,7 +12,6 @@ set(CUDA_HIP_SOURCES
     factorization/lu_kernels.cpp
     factorization/par_ic_kernels.cpp
     factorization/par_ilu_kernels.cpp
-    log/batch_logger.hpp
     matrix/batch_csr_kernels.cpp
     matrix/batch_dense_kernels.cpp
     matrix/batch_ell_kernels.cpp
@@ -35,7 +19,6 @@ set(CUDA_HIP_SOURCES
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp
     matrix/ell_kernels.cpp
-    matrix/fbcsr_kernels.cpp
     matrix/sellp_kernels.cpp
     matrix/sparsity_csr_kernels.cpp
     multigrid/pgm_kernels.cpp
@@ -45,7 +28,6 @@ set(CUDA_HIP_SOURCES
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
     solver/multigrid_kernels.cpp
-    stop/batch_criteria.hpp
     )
 list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
 set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/cuda/matrix/fbcsr_kernels.instantiate.cu b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
similarity index 90%
rename from cuda/matrix/fbcsr_kernels.instantiate.cu
rename to common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
index f6165ac5e5c..a3beaac4a85 100644
--- a/cuda/matrix/fbcsr_kernels.instantiate.cu
+++ b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
@@ -2,12 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/matrix/fbcsr_kernels.template.cu"
+#include "common/cuda_hip/matrix/fbcsr_kernels.template.cpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The fixed-size block compressed sparse row matrix format namespace.
  *
@@ -42,6 +42,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace fbcsr
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
similarity index 100%
rename from common/cuda_hip/matrix/fbcsr_kernels.cpp
rename to common/cuda_hip/matrix/fbcsr_kernels.template.cpp
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bef62c12a9a..4dd7bccd2c9 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
-add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -44,7 +44,7 @@ target_sources(ginkgo_cuda
     ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 # override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES)
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES FBCSR_INSTANTIATE)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 19f4dd54b2a..3de4f4b4d65 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
-add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
diff --git a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp
deleted file mode 100644
index 54e90fc4297..00000000000
--- a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "hip/matrix/fbcsr_kernels.template.hip.cpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The fixed-size block compressed sparse row matrix format namespace.
- *
- * @ingroup fbcsr
- */
-namespace fbcsr {
-
-
-// begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
-// end
-
-
-}  // namespace fbcsr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From 2b71d3de0c9da74198037e842d9720ac3450b9a8 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 18:29:33 +0200
Subject: [PATCH 042/448] fix includes for thrust and sparselib block

---
 .../base/sparselib_block_bindings.hpp         | 18 +++++++++
 cuda/base/thrust.cuh                          | 31 --------------
 cuda/distributed/index_map_kernels.cu         |  2 +-
 cuda/matrix/csr_kernels.template.cu           |  2 +-
 hip/base/thrust.hip.hpp                       | 40 -------------------
 hip/distributed/index_map_kernels.hip.cpp     |  2 +-
 hip/matrix/csr_kernels.template.hip.cpp       |  2 +-
 7 files changed, 22 insertions(+), 75 deletions(-)
 create mode 100644 common/cuda_hip/base/sparselib_block_bindings.hpp
 delete mode 100644 cuda/base/thrust.cuh
 delete mode 100644 hip/base/thrust.hip.hpp

diff --git a/common/cuda_hip/base/sparselib_block_bindings.hpp b/common/cuda_hip/base/sparselib_block_bindings.hpp
new file mode 100644
index 00000000000..38bbebc6c14
--- /dev/null
+++ b/common/cuda_hip/base/sparselib_block_bindings.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/cusparse_block_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hipsparse_block_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
diff --git a/cuda/base/thrust.cuh b/cuda/base/thrust.cuh
deleted file mode 100644
index 5d5d58e0f33..00000000000
--- a/cuda/base/thrust.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_BASE_THRUST_CUH_
-#define GKO_CUDA_BASE_THRUST_CUH_
-
-
-#include <thrust/execution_policy.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-#include <ginkgo/core/base/executor.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-inline auto thrust_policy(std::shared_ptr<const CudaExecutor> exec)
-{
-    return thrust::cuda::par.on(exec->get_stream());
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_BASE_THRUST_CUH_
diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu
index e55a4148e51..3c23d098a0e 100644
--- a/cuda/distributed/index_map_kernels.cu
+++ b/cuda/distributed/index_map_kernels.cu
@@ -19,9 +19,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/searching.hpp"
-#include "cuda/base/thrust.cuh"
 
 
 namespace gko {
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index 151351c9204..c8d193e09af 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -28,6 +28,7 @@
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
@@ -49,7 +50,6 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/thrust.cuh"
 
 
 namespace gko {
diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp
deleted file mode 100644
index 2aecdd79328..00000000000
--- a/hip/base/thrust.hip.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_BASE_THRUST_HIP_HPP_
-#define GKO_HIP_BASE_THRUST_HIP_HPP_
-
-
-#include <thrust/execution_policy.h>
-
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#if GINKGO_HIP_PLATFORM_HCC
-#include <thrust/system/hip/detail/execution_policy.h>
-#else
-#include <thrust/system/cuda/detail/execution_policy.h>
-#endif
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
-{
-#if GINKGO_HIP_PLATFORM_HCC
-    return thrust::hip::par.on(exec->get_stream());
-#else
-    return thrust::cuda::par.on(exec->get_stream());
-#endif
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_BASE_THRUST_HIP_HPP_
diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp
index c722952f85d..67ff2f72857 100644
--- a/hip/distributed/index_map_kernels.hip.cpp
+++ b/hip/distributed/index_map_kernels.hip.cpp
@@ -19,9 +19,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/searching.hpp"
-#include "hip/base/thrust.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index f7766b8648b..473361029c8 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -29,6 +29,7 @@
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
@@ -49,7 +50,6 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/thrust.hip.hpp"
 
 
 namespace gko {

From 46d9259c008b588efd3d953969e2b726a6c951b7 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 18:30:50 +0200
Subject: [PATCH 043/448] revert batch changes

---
 core/solver/batch_dispatch.hpp                | 17 +++---
 cuda/base/batch_multi_vector_kernels.cu       | 56 +++++++++++++++++++
 cuda/log/batch_logger.cuh                     | 27 +++++++++
 cuda/matrix/batch_csr_kernels.cu              | 55 ++++++++++++++++++
 cuda/matrix/batch_dense_kernels.cu            | 56 +++++++++++++++++++
 cuda/matrix/batch_ell_kernels.cu              | 55 ++++++++++++++++++
 cuda/preconditioner/batch_jacobi_kernels.cu   |  4 +-
 cuda/preconditioner/batch_preconditioners.cuh |  2 +-
 cuda/solver/batch_bicgstab_kernels.cu         |  6 +-
 cuda/solver/batch_cg_kernels.cu               |  6 +-
 cuda/stop/batch_criteria.cuh                  | 26 +++++++++
 hip/base/batch_multi_vector_kernels.hip.cpp   | 56 +++++++++++++++++++
 hip/log/batch_logger.hip.hpp                  | 26 +++++++++
 hip/matrix/batch_csr_kernels.hip.cpp          | 55 ++++++++++++++++++
 hip/matrix/batch_dense_kernels.hip.cpp        | 56 +++++++++++++++++++
 hip/matrix/batch_ell_kernels.hip.cpp          | 55 ++++++++++++++++++
 .../batch_jacobi_kernels.hip.cpp              | 10 ++--
 .../batch_preconditioners.hip.hpp             |  2 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |  8 +--
 hip/solver/batch_cg_kernels.hip.cpp           |  8 +--
 hip/stop/batch_criteria.hip.hpp               | 26 +++++++++
 21 files changed, 580 insertions(+), 32 deletions(-)
 create mode 100644 cuda/base/batch_multi_vector_kernels.cu
 create mode 100644 cuda/log/batch_logger.cuh
 create mode 100644 cuda/matrix/batch_csr_kernels.cu
 create mode 100644 cuda/matrix/batch_dense_kernels.cu
 create mode 100644 cuda/matrix/batch_ell_kernels.cu
 create mode 100644 cuda/stop/batch_criteria.cuh
 create mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp
 create mode 100644 hip/log/batch_logger.hip.hpp
 create mode 100644 hip/matrix/batch_csr_kernels.hip.cpp
 create mode 100644 hip/matrix/batch_dense_kernels.hip.cpp
 create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp
 create mode 100644 hip/stop/batch_criteria.hip.hpp

diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index ce8c4d86e71..8a142a5224a 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -21,15 +21,15 @@
 #include "core/matrix/batch_struct.hpp"
 
 
-#if defined(GKO_COMPILING_CUDA)
+#if defined GKO_COMPILING_CUDA
 
 
-#include "common/cuda_hip/log/batch_logger.hpp"
-#include "common/cuda_hip/stop/batch_criteria.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/components/cooperative_groups.cuh"
+#include "cuda/log/batch_logger.cuh"
 #include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/batch_preconditioners.cuh"
+#include "cuda/stop/batch_criteria.cuh"
 
 
 namespace gko {
@@ -37,12 +37,11 @@ namespace batch {
 namespace solver {
 
 
-namespace device = gko::kernels::GKO_DEVICE_NAMESPACE;
+namespace device = gko::kernels::cuda;
 
 
 template <typename ValueType>
-using DeviceValueType =
-    typename gko::kernels::GKO_DEVICE_NAMESPACE::device_type<ValueType>;
+using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 
 
 }  // namespace solver
@@ -50,15 +49,15 @@ using DeviceValueType =
 }  // namespace gko
 
 
-#elif defined(GKO_COMPILING_HIP)
+#elif defined GKO_COMPILING_HIP
 
 
-#include "common/cuda_hip/log/batch_logger.hpp"
-#include "common/cuda_hip/stop/batch_criteria.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/log/batch_logger.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/batch_preconditioners.hip.hpp"
+#include "hip/stop/batch_criteria.hip.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
new file mode 100644
index 00000000000..704192d0bff
--- /dev/null
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
new file mode 100644
index 00000000000..3e53d6ef0a6
--- /dev/null
+++ b/cuda/log/batch_logger.cuh
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_log {
+
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
new file mode 100644
index 00000000000..4fc5137646c
--- /dev/null
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
new file mode 100644
index 00000000000..e28d4f91670
--- /dev/null
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
new file mode 100644
index 00000000000..90caf963200
--- /dev/null
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index e31e17dcafc..1bc39df9781 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -8,8 +8,6 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -20,6 +18,8 @@
 #include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/thread_ids.cuh"
 #include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index 01001c036b2..e83d6e04ee9 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "cuda/components/reduction.cuh"
 
 
 namespace gko {
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 8d76f865a20..28efaf07475 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -14,14 +14,14 @@
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 2083cd98b5a..cff72652629 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -13,14 +13,14 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/base/thrust.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh
new file mode 100644
index 00000000000..f4f434dda11
--- /dev/null
+++ b/cuda/stop/batch_criteria.cuh
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
+#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_stop {
+
+
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
+
+
+}  // namespace batch_stop
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
new file mode 100644
index 00000000000..86b16c8975d
--- /dev/null
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
new file mode 100644
index 00000000000..a2540f2bd9d
--- /dev/null
+++ b/hip/log/batch_logger.hip.hpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_log {
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
new file mode 100644
index 00000000000..4b0e6799834
--- /dev/null
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
new file mode 100644
index 00000000000..328f268251f
--- /dev/null
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
new file mode 100644
index 00000000000..01294ac3d63
--- /dev/null
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+#include <thrust/functional.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index a112e3beb92..db6e5a27b58 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -8,11 +8,6 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -20,8 +15,13 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/diagonal_block_manipulation.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/components/warp_blas.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index f62000ff46f..f3969c16b81 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "hip/components/reduction.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 96587f8479e..44e2f0f3c48 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index e12445b2c84..450d02a302c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp
new file mode 100644
index 00000000000..1f721e36aaf
--- /dev/null
+++ b/hip/stop/batch_criteria.hip.hpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
+#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_stop {
+
+
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
+
+
+}  // namespace batch_stop
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_

From 9d76d0f69ba894b8b9cffad18d36ff65f1510a2b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 19:26:35 +0200
Subject: [PATCH 044/448] fix batch

---
 common/cuda_hip/CMakeLists.txt                |  4 --
 ...cpp => batch_multi_vector_kernels.hpp.inc} | 52 -------------------
 ...{batch_logger.hpp => batch_logger.hpp.inc} | 22 --------
 ..._kernels.cpp => batch_csr_kernels.hpp.inc} | 51 ------------------
 ...ernels.cpp => batch_dense_kernels.hpp.inc} | 52 -------------------
 ..._kernels.cpp => batch_ell_kernels.hpp.inc} | 51 ------------------
 .../preconditioner/jacobi_kernels.cpp         |  2 +-
 ...ch_criteria.hpp => batch_criteria.hpp.inc} | 21 --------
 cuda/CMakeLists.txt                           |  6 ++-
 cuda/base/batch_multi_vector_kernels.cu       |  8 +--
 cuda/matrix/batch_csr_kernels.cu              |  8 +--
 cuda/matrix/batch_dense_kernels.cu            |  8 +--
 cuda/matrix/batch_ell_kernels.cu              |  8 +--
 cuda/preconditioner/batch_jacobi_kernels.cu   |  6 +--
 cuda/preconditioner/batch_preconditioners.cuh |  2 +-
 .../jacobi_advanced_apply_kernels.cu          |  2 +-
 ...cobi_advanced_apply_kernels.instantiate.cu |  2 +-
 .../preconditioner/jacobi_generate_kernels.cu |  2 +-
 .../jacobi_generate_kernels.instantiate.cu    |  2 +-
 .../jacobi_simple_apply_kernels.cu            |  2 +-
 ...jacobi_simple_apply_kernels.instantiate.cu |  2 +-
 cuda/solver/batch_bicgstab_kernels.cu         |  9 ++--
 cuda/solver/batch_cg_kernels.cu               |  9 ++--
 cuda/test/components/merging.cu               |  2 +-
 cuda/test/components/searching.cu             |  2 +-
 cuda/test/components/sorting.cu               |  2 +-
 hip/CMakeLists.txt                            |  8 ++-
 hip/base/batch_multi_vector_kernels.hip.cpp   |  8 +--
 hip/matrix/batch_csr_kernels.hip.cpp          |  8 +--
 hip/matrix/batch_dense_kernels.hip.cpp        |  8 +--
 hip/matrix/batch_ell_kernels.hip.cpp          |  8 +--
 .../batch_jacobi_kernels.hip.cpp              | 12 ++---
 .../batch_preconditioners.hip.hpp             |  2 +-
 .../jacobi_advanced_apply_kernels.hip.cpp     |  2 +-
 ...advanced_apply_kernels.instantiate.hip.cpp |  2 +-
 .../jacobi_generate_kernels.hip.cpp           |  2 +-
 ...acobi_generate_kernels.instantiate.hip.cpp |  2 +-
 .../jacobi_simple_apply_kernels.hip.cpp       |  2 +-
 ...i_simple_apply_kernels.instantiate.hip.cpp |  2 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     | 11 ++--
 hip/solver/batch_cg_kernels.hip.cpp           | 11 ++--
 hip/test/components/merging.hip.cpp           |  2 +-
 hip/test/components/searching.hip.cpp         |  2 +-
 hip/test/components/sorting.hip.cpp           |  2 +-
 44 files changed, 91 insertions(+), 340 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernels.cpp => batch_multi_vector_kernels.hpp.inc} (89%)
 rename common/cuda_hip/log/{batch_logger.hpp => batch_logger.hpp.inc} (67%)
 rename common/cuda_hip/matrix/{batch_csr_kernels.cpp => batch_csr_kernels.hpp.inc} (87%)
 rename common/cuda_hip/matrix/{batch_dense_kernels.cpp => batch_dense_kernels.hpp.inc} (89%)
 rename common/cuda_hip/matrix/{batch_ell_kernels.cpp => batch_ell_kernels.hpp.inc} (87%)
 rename common/cuda_hip/stop/{batch_criteria.hpp => batch_criteria.hpp.inc} (75%)

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 79af0c5fd0d..0225e3ad872 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,6 +1,5 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
-    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     components/prefix_sum_kernels.cpp
     distributed/matrix_kernels.cpp
@@ -12,9 +11,6 @@ set(CUDA_HIP_SOURCES
     factorization/lu_kernels.cpp
     factorization/par_ic_kernels.cpp
     factorization/par_ilu_kernels.cpp
-    matrix/batch_csr_kernels.cpp
-    matrix/batch_dense_kernels.cpp
-    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
similarity index 89%
rename from common/cuda_hip/base/batch_multi_vector_kernels.cpp
rename to common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
index 0261dbb97ce..9b6301674be 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -2,47 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -340,14 +299,3 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
         copy(src_b, dst_b);
     }
 }
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/log/batch_logger.hpp b/common/cuda_hip/log/batch_logger.hpp.inc
similarity index 67%
rename from common/cuda_hip/log/batch_logger.hpp
rename to common/cuda_hip/log/batch_logger.hpp.inc
index bca07fb9c37..04b614b50f9 100644
--- a/common/cuda_hip/log/batch_logger.hpp
+++ b/common/cuda_hip/log/batch_logger.hpp.inc
@@ -2,19 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-namespace batch_log {
-
-
 /**
  * @see reference/log/batch_logger.hpp
  */
@@ -41,12 +28,3 @@ class SimpleFinalLogger final {
     real_type* const final_residuals_;
     idx_type* const final_iters_;
 };
-
-
-}  // namespace batch_log
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
similarity index 87%
rename from common/cuda_hip/matrix/batch_csr_kernels.cpp
rename to common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
index 01edb0e1310..e041dadaa3e 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
@@ -2,46 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/matrix/batch_struct.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -236,14 +196,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
similarity index 89%
rename from common/cuda_hip/matrix/batch_dense_kernels.cpp
rename to common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
index 90cafc5d1ca..f8abf9131a1 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
@@ -2,46 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/matrix/batch_struct.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -283,15 +243,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
similarity index 87%
rename from common/cuda_hip/matrix/batch_ell_kernels.cpp
rename to common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
index c5e27e9d1d1..0a6d1927c96 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -2,46 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/matrix/batch_struct.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -245,14 +205,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index 27069d2f693..3c581546be2 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -12,10 +12,10 @@
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/common/cuda_hip/stop/batch_criteria.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc
similarity index 75%
rename from common/cuda_hip/stop/batch_criteria.hpp
rename to common/cuda_hip/stop/batch_criteria.hpp.inc
index cecaa6b19d1..38072467765 100644
--- a/common/cuda_hip/stop/batch_criteria.hpp
+++ b/common/cuda_hip/stop/batch_criteria.hpp.inc
@@ -2,19 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-namespace batch_stop {
-
-
 /**
  * @see reference/stop/batch_criteria.hpp
  */
@@ -62,11 +49,3 @@ class SimpleAbsResidual {
 private:
     const real_type abs_tol_;
 };
-
-
-}  // namespace batch_stop
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 4dd7bccd2c9..1552f4f3ee5 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,6 +7,7 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
+    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
@@ -27,6 +28,9 @@ target_sources(ginkgo_cuda
     factorization/par_ilut_select_kernels.cu
     factorization/par_ilut_spgeam_kernels.cu
     factorization/par_ilut_sweep_kernels.cu
+    matrix/batch_csr_kernels.cu
+    matrix/batch_dense_kernels.cu
+    matrix/batch_ell_kernels.cu
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
@@ -97,7 +101,7 @@ target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAM
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
-    PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
+    PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_link_libraries(ginkgo_cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cufft nvtx::nvtx)
 # NVTX3 is header-only and requires dlopen/dlclose in static builds
 target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
index 704192d0bff..3dad5ba94f1 100644
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -14,13 +14,13 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 
 
 namespace gko {
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
index 4fc5137646c..95b4f85cdfc 100644
--- a/cuda/matrix/batch_csr_kernels.cu
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
index e28d4f91670..10148ee242b 100644
--- a/cuda/matrix/batch_dense_kernels.cu
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 90caf963200..25281cf6f81 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 1bc39df9781..178b53d04ea 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -8,6 +8,8 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -18,10 +20,8 @@
 #include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/thread_ids.cuh"
 #include "cuda/matrix/batch_struct.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index e83d6e04ee9..01001c036b2 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "cuda/components/reduction.cuh"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
index 74c7dea9b6b..fca6b24ba05 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
@@ -7,7 +7,7 @@
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index 60823cf6f4b..80c3b5e1e73 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -15,7 +15,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
index 651dcec611a..e558594f5ce 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.cu
@@ -8,7 +8,7 @@
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index ff36c8efb1b..0dc21311af9 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -18,7 +18,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
index 5cac209b8b2..0bb09b1064a 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
@@ -7,7 +7,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index d727c9439f9..0721c03126b 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -15,7 +15,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/preconditioner/jacobi_common.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 28efaf07475..6b3dca28607 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -12,16 +12,16 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
@@ -44,7 +44,6 @@ namespace batch_bicgstab {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index cff72652629..746be0365e7 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -11,16 +11,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
@@ -43,7 +43,6 @@ namespace batch_cg {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu
index 2788767b078..0a66c92ca3a 100644
--- a/cuda/test/components/merging.cu
+++ b/cuda/test/components/merging.cu
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/components/merging.cuh"
+#include "common/cuda_hip/components/merging.hpp"
 
 #include <algorithm>
 #include <memory>
diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu
index afe7fb4b442..d0166418448 100644
--- a/cuda/test/components/searching.cu
+++ b/cuda/test/components/searching.cu
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/components/searching.cuh"
+#include "common/cuda_hip/components/searching.hpp"
 
 #include <memory>
 #include <numeric>
diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu
index e1524ce0078..0cc54e5904e 100644
--- a/cuda/test/components/sorting.cu
+++ b/cuda/test/components/sorting.cu
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/components/sorting.cuh"
+#include "common/cuda_hip/components/sorting.hpp"
 
 #include <memory>
 #include <random>
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 3de4f4b4d65..71d41ad47df 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,6 +5,7 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
+    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
@@ -25,6 +26,9 @@ set(GINKGO_HIP_SOURCES
     factorization/par_ilut_select_kernels.hip.cpp
     factorization/par_ilut_spgeam_kernels.hip.cpp
     factorization/par_ilut_sweep_kernels.hip.cpp
+    matrix/batch_csr_kernels.hip.cpp
+    matrix/batch_dense_kernels.hip.cpp
+    matrix/batch_ell_kernels.hip.cpp
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
@@ -83,14 +87,14 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
         ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
 endforeach()
 string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
-configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp)
+configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hpp)
 
 set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP)
 add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES})
 
 target_include_directories(ginkgo_hip
     PRIVATE
-        ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp
+        ${CMAKE_CURRENT_BINARY_DIR} # for generated headers like preconditioner/jacobi_common.hpp
         )
 target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
 
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
index 86b16c8975d..701f4655a9a 100644
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -14,13 +14,13 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
index 4b0e6799834..b77b9416505 100644
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
index 328f268251f..67dfd78e264 100644
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index 01294ac3d63..68b59c042f1 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -12,14 +12,14 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index db6e5a27b58..cfef615dcad 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -8,6 +8,11 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -15,15 +20,10 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index f3969c16b81..f62000ff46f 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -7,9 +7,9 @@
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "hip/components/reduction.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
index 0eccbb2d6eb..ce260ec1e16 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
@@ -7,7 +7,7 @@
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index d30f4edd787..9cc4978a1f8 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -16,7 +16,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 3f6d3a4e91f..673ca8c373e 100644
--- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -19,7 +19,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index 3c18703557d..a6be610a839 100644
--- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -18,7 +18,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index 563f5829536..72f2e4fe556 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -16,7 +16,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index 7a6e2a46b04..1ea34bff93f 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -15,7 +15,7 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
+#include "preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 44e2f0f3c48..92051a81640 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
@@ -43,7 +43,6 @@ namespace batch_bicgstab {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 450d02a302c..2df02a6f0a8 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -11,18 +11,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
@@ -43,7 +43,6 @@ namespace batch_cg {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index 7fc3b9a173a..5c8cc21bd5e 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -8,7 +8,7 @@
 // clang-format on
 
 
-#include "hip/components/merging.hip.hpp"
+#include "common/cuda_hip/components/merging.hpp"
 
 #include <algorithm>
 #include <memory>
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 85c54075231..d9dc6b47ab0 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -8,7 +8,7 @@
 // clang-format on
 
 
-#include "hip/components/searching.hip.hpp"
+#include "common/cuda_hip/components/searching.hpp"
 
 #include <memory>
 #include <numeric>
diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp
index 79de1dc2269..653a0f536eb 100644
--- a/hip/test/components/sorting.hip.cpp
+++ b/hip/test/components/sorting.hip.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "hip/components/sorting.hip.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
 
 #include <memory>
 #include <random>

From d53fa460c62c0c756f578ab46d93d2173310fb57 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 20:04:09 +0200
Subject: [PATCH 045/448] add newlines to the end of files where missing

---
 common/cuda_hip/base/device_matrix_data_kernels.cpp        | 2 +-
 common/cuda_hip/base/kernel_launch.hpp                     | 2 +-
 common/cuda_hip/base/kernel_launch_reduction.hpp           | 2 +-
 common/cuda_hip/base/kernel_launch_solver.hpp              | 2 +-
 common/cuda_hip/base/math.hpp                              | 2 +-
 common/cuda_hip/components/atomic.hpp                      | 2 +-
 common/cuda_hip/components/diagonal_block_manipulation.hpp | 2 +-
 common/cuda_hip/components/intrinsics.hpp                  | 2 +-
 common/cuda_hip/components/merging.hpp                     | 2 +-
 common/cuda_hip/components/prefix_sum.hpp                  | 2 +-
 common/cuda_hip/components/prefix_sum_kernels.cpp          | 2 +-
 common/cuda_hip/components/reduction.hpp                   | 2 +-
 common/cuda_hip/components/searching.hpp                   | 2 +-
 common/cuda_hip/components/segment_scan.hpp                | 2 +-
 common/cuda_hip/components/sorting.hpp                     | 2 +-
 common/cuda_hip/components/syncfree.hpp                    | 2 +-
 common/cuda_hip/components/thread_ids.hpp                  | 2 +-
 common/cuda_hip/components/uninitialized_array.hpp         | 2 +-
 common/cuda_hip/components/warp_blas.hpp                   | 2 +-
 common/cuda_hip/distributed/matrix_kernels.cpp             | 2 +-
 common/cuda_hip/distributed/partition_helpers_kernels.cpp  | 2 +-
 common/cuda_hip/distributed/partition_kernels.cpp          | 2 +-
 common/cuda_hip/distributed/vector_kernels.cpp             | 2 +-
 common/cuda_hip/factorization/cholesky_kernels.cpp         | 2 +-
 common/cuda_hip/factorization/factorization_kernels.cpp    | 2 +-
 common/cuda_hip/factorization/lu_kernels.cpp               | 2 +-
 common/cuda_hip/factorization/par_ic_kernels.cpp           | 2 +-
 common/cuda_hip/factorization/par_ilu_kernels.cpp          | 2 +-
 common/cuda_hip/matrix/coo_kernels.cpp                     | 2 +-
 common/cuda_hip/matrix/dense_kernels.cpp                   | 2 +-
 common/cuda_hip/matrix/diagonal_kernels.cpp                | 2 +-
 common/cuda_hip/matrix/ell_kernels.cpp                     | 2 +-
 common/cuda_hip/matrix/fbcsr_kernels.template.cpp          | 2 +-
 common/cuda_hip/matrix/sellp_kernels.cpp                   | 2 +-
 common/cuda_hip/matrix/sparsity_csr_kernels.cpp            | 2 +-
 common/cuda_hip/multigrid/pgm_kernels.cpp                  | 2 +-
 common/cuda_hip/preconditioner/isai_kernels.cpp            | 2 +-
 common/cuda_hip/preconditioner/jacobi_kernels.cpp          | 2 +-
 common/cuda_hip/reorder/rcm_kernels.cpp                    | 2 +-
 common/cuda_hip/solver/cb_gmres_kernels.cpp                | 2 +-
 common/cuda_hip/solver/idr_kernels.cpp                     | 2 +-
 common/cuda_hip/solver/multigrid_kernels.cpp               | 2 +-
 42 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp
index 61a7a6281a9..c5742653a93 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.cpp
+++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp
@@ -124,4 +124,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace components
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch.hpp b/common/cuda_hip/base/kernel_launch.hpp
index dd20eb5769f..04c54786422 100644
--- a/common/cuda_hip/base/kernel_launch.hpp
+++ b/common/cuda_hip/base/kernel_launch.hpp
@@ -102,4 +102,4 @@ void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
 
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp b/common/cuda_hip/base/kernel_launch_reduction.hpp
index 86e082ac2c1..4c4fb366802 100644
--- a/common/cuda_hip/base/kernel_launch_reduction.hpp
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp
@@ -527,4 +527,4 @@ void run_kernel_col_reduction_cached(
 
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp b/common/cuda_hip/base/kernel_launch_solver.hpp
index 742da85fd96..e32ba52e79a 100644
--- a/common/cuda_hip/base/kernel_launch_solver.hpp
+++ b/common/cuda_hip/base/kernel_launch_solver.hpp
@@ -50,4 +50,4 @@ void run_kernel_solver(std::shared_ptr<const DefaultExecutor> exec,
 
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 44a26cadb53..ea11c7d73a9 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -54,4 +54,4 @@ struct truncate_type_impl<thrust::complex<T>> {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index e0384222734..3279c9433f1 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -250,4 +250,4 @@ __forceinline__ __device__ thrust::complex<double> atomic_add(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp b/common/cuda_hip/components/diagonal_block_manipulation.hpp
index 5c0be150d21..890d080018e 100644
--- a/common/cuda_hip/components/diagonal_block_manipulation.hpp
+++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp
@@ -88,4 +88,4 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
diff --git a/common/cuda_hip/components/intrinsics.hpp b/common/cuda_hip/components/intrinsics.hpp
index 398e4325cc2..e8c236e22b1 100644
--- a/common/cuda_hip/components/intrinsics.hpp
+++ b/common/cuda_hip/components/intrinsics.hpp
@@ -55,4 +55,4 @@ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp
index b1bca2a0c78..4c1bfa4cd2d 100644
--- a/common/cuda_hip/components/merging.hpp
+++ b/common/cuda_hip/components/merging.hpp
@@ -302,4 +302,4 @@ __forceinline__ __device__ void sequential_match(const ValueType* a,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
diff --git a/common/cuda_hip/components/prefix_sum.hpp b/common/cuda_hip/components/prefix_sum.hpp
index 8fc5bbe63b0..a09eb8f17c5 100644
--- a/common/cuda_hip/components/prefix_sum.hpp
+++ b/common/cuda_hip/components/prefix_sum.hpp
@@ -182,4 +182,4 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
diff --git a/common/cuda_hip/components/prefix_sum_kernels.cpp b/common/cuda_hip/components/prefix_sum_kernels.cpp
index 40cb1bc48fc..ebf102a7181 100644
--- a/common/cuda_hip/components/prefix_sum_kernels.cpp
+++ b/common/cuda_hip/components/prefix_sum_kernels.cpp
@@ -80,4 +80,4 @@ template void prefix_sum_nonnegative<size_type>(
 }  // namespace components
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp
index d2889bb9c7e..582de3de1fb 100644
--- a/common/cuda_hip/components/reduction.hpp
+++ b/common/cuda_hip/components/reduction.hpp
@@ -296,4 +296,4 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
diff --git a/common/cuda_hip/components/searching.hpp b/common/cuda_hip/components/searching.hpp
index 599e7a8581c..61efde54197 100644
--- a/common/cuda_hip/components/searching.hpp
+++ b/common/cuda_hip/components/searching.hpp
@@ -228,4 +228,4 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
diff --git a/common/cuda_hip/components/segment_scan.hpp b/common/cuda_hip/components/segment_scan.hpp
index d2f992850ef..af3953a4176 100644
--- a/common/cuda_hip/components/segment_scan.hpp
+++ b/common/cuda_hip/components/segment_scan.hpp
@@ -52,4 +52,4 @@ __device__ __forceinline__ bool segment_scan(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp
index ecc9c5289f9..b3ce253b451 100644
--- a/common/cuda_hip/components/sorting.hpp
+++ b/common/cuda_hip/components/sorting.hpp
@@ -311,4 +311,4 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
diff --git a/common/cuda_hip/components/syncfree.hpp b/common/cuda_hip/components/syncfree.hpp
index 3c82c916a21..e1693fe4e4d 100644
--- a/common/cuda_hip/components/syncfree.hpp
+++ b/common/cuda_hip/components/syncfree.hpp
@@ -135,4 +135,4 @@ class syncfree_scheduler {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
diff --git a/common/cuda_hip/components/thread_ids.hpp b/common/cuda_hip/components/thread_ids.hpp
index 4fef650f51c..7d7c5e2bda3 100644
--- a/common/cuda_hip/components/thread_ids.hpp
+++ b/common/cuda_hip/components/thread_ids.hpp
@@ -263,4 +263,4 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat()
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp
index 215c7f5751a..d4a2b5939af 100644
--- a/common/cuda_hip/components/uninitialized_array.hpp
+++ b/common/cuda_hip/components/uninitialized_array.hpp
@@ -82,4 +82,4 @@ class uninitialized_array {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp
index 1f25bb61634..cfa46b8a045 100644
--- a/common/cuda_hip/components/warp_blas.hpp
+++ b/common/cuda_hip/components/warp_blas.hpp
@@ -434,4 +434,4 @@ __device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 6b5f997d153..88988febbb0 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -201,4 +201,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 }  // namespace distributed_matrix
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.cpp b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
index cd1419230d2..e733c9217b1 100644
--- a/common/cuda_hip/distributed/partition_helpers_kernels.cpp
+++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
@@ -45,4 +45,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace partition_helpers
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/partition_kernels.cpp b/common/cuda_hip/distributed/partition_kernels.cpp
index b4e051b97f5..7f623b423fb 100644
--- a/common/cuda_hip/distributed/partition_kernels.cpp
+++ b/common/cuda_hip/distributed/partition_kernels.cpp
@@ -135,4 +135,4 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
 }  // namespace partition
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/vector_kernels.cpp b/common/cuda_hip/distributed/vector_kernels.cpp
index 91bd838497d..1bacc93489a 100644
--- a/common/cuda_hip/distributed/vector_kernels.cpp
+++ b/common/cuda_hip/distributed/vector_kernels.cpp
@@ -90,4 +90,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 }  // namespace distributed_vector
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
index 6e6be7b81fd..e5f2bf5e5e5 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.cpp
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -435,4 +435,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace cholesky
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index da2666feb25..3a38175ab70 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -555,4 +555,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index 71d09e93ef7..aa432bf711c 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -341,4 +341,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace lu_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp
index 7102d782b94..ee8b7c97f64 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ic_kernels.cpp
@@ -142,4 +142,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace par_ic_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp
index 447fdb99c2c..8bf71c471a8 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp
@@ -115,4 +115,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace par_ilu_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp
index 00ab983bc9f..cffe18d981b 100644
--- a/common/cuda_hip/matrix/coo_kernels.cpp
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -345,4 +345,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace coo
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp
index b44c0396823..d8391ace023 100644
--- a/common/cuda_hip/matrix/dense_kernels.cpp
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -843,4 +843,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 }  // namespace dense
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/diagonal_kernels.cpp b/common/cuda_hip/matrix/diagonal_kernels.cpp
index a824abc6f7c..e12d3ed4f9f 100644
--- a/common/cuda_hip/matrix/diagonal_kernels.cpp
+++ b/common/cuda_hip/matrix/diagonal_kernels.cpp
@@ -88,4 +88,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace diagonal
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp
index 40f174a25c7..bfdd3f21e51 100644
--- a/common/cuda_hip/matrix/ell_kernels.cpp
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -395,4 +395,4 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 }  // namespace ell
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
index f6276fdd056..960708378e1 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
@@ -635,4 +635,4 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
 }  // namespace fbcsr
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/sellp_kernels.cpp b/common/cuda_hip/matrix/sellp_kernels.cpp
index 64c672b8d8d..3e8fba395b3 100644
--- a/common/cuda_hip/matrix/sellp_kernels.cpp
+++ b/common/cuda_hip/matrix/sellp_kernels.cpp
@@ -138,4 +138,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace sellp
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
index 067b2749097..269708e19ae 100644
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
@@ -327,4 +327,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace sparsity_csr
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp
index a2c5d608a50..d3c44cf540e 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.cpp
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -85,4 +85,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace pgm
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/preconditioner/isai_kernels.cpp b/common/cuda_hip/preconditioner/isai_kernels.cpp
index eda1f9a0661..d6fdd6389fc 100644
--- a/common/cuda_hip/preconditioner/isai_kernels.cpp
+++ b/common/cuda_hip/preconditioner/isai_kernels.cpp
@@ -600,4 +600,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace isai
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index 3c581546be2..f614070f65e 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -412,4 +412,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace jacobi
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp
index 380ef69fac8..3206fb28c8b 100644
--- a/common/cuda_hip/reorder/rcm_kernels.cpp
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -658,4 +658,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL);
 }  // namespace rcm
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp
index 59c9812dc65..02d45a8d31e 100644
--- a/common/cuda_hip/solver/cb_gmres_kernels.cpp
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -1049,4 +1049,4 @@ GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
 }  // namespace cb_gmres
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp
index 63c5f015f68..a0f605134eb 100644
--- a/common/cuda_hip/solver/idr_kernels.cpp
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -650,4 +650,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 }  // namespace idr
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp
index 61b6ee44836..b9e411bd5f8 100644
--- a/common/cuda_hip/solver/multigrid_kernels.cpp
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -204,4 +204,4 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
 }  // namespace multigrid
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko

From aa0f23f37f79854a66bab23cae7a95fbd0bc3a95 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 20:06:24 +0200
Subject: [PATCH 046/448] unify index_map

---
 common/cuda_hip/CMakeLists.txt                |  1 +
 ..._kernels.hpp.inc => index_map_kernels.cpp} | 34 ++++++++++++++++
 cuda/CMakeLists.txt                           |  1 -
 cuda/distributed/index_map_kernels.cu         | 39 -------------------
 hip/CMakeLists.txt                            |  1 -
 hip/distributed/index_map_kernels.hip.cpp     | 39 -------------------
 6 files changed, 35 insertions(+), 80 deletions(-)
 rename common/cuda_hip/distributed/{index_map_kernels.hpp.inc => index_map_kernels.cpp} (92%)
 delete mode 100644 cuda/distributed/index_map_kernels.cu
 delete mode 100644 hip/distributed/index_map_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 0225e3ad872..af6a8c24503 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -2,6 +2,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
     base/device_matrix_data_kernels.cpp
     components/prefix_sum_kernels.cpp
+    distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
diff --git a/common/cuda_hip/distributed/index_map_kernels.hpp.inc b/common/cuda_hip/distributed/index_map_kernels.cpp
similarity index 92%
rename from common/cuda_hip/distributed/index_map_kernels.hpp.inc
rename to common/cuda_hip/distributed/index_map_kernels.cpp
index 9d312cc43aa..744d0f5581f 100644
--- a/common/cuda_hip/distributed/index_map_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/index_map_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/distributed/index_map_kernels.hpp"
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/unique.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace index_map {
+
+
 /**
  * This struct is necessary, since the `transform_output_iterator` seemingly
  * doesn't support non-copyable tranfsorm function (this excludes lambdas)
@@ -266,3 +294,9 @@ void map_to_local(
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);
+
+
+}  // namespace index_map
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 1552f4f3ee5..a068eb727b7 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -18,7 +18,6 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    distributed/index_map_kernels.cu
     factorization/ic_kernels.cu
     factorization/ilu_kernels.cu
     factorization/par_ict_kernels.cu
diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu
deleted file mode 100644
index 3c23d098a0e..00000000000
--- a/cuda/distributed/index_map_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/index_map_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace index_map {
-
-
-#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc"
-
-
-}  // namespace index_map
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 71d41ad47df..d83e5e28d21 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -16,7 +16,6 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    distributed/index_map_kernels.hip.cpp
     factorization/ic_kernels.hip.cpp
     factorization/ilu_kernels.hip.cpp
     factorization/par_ict_kernels.hip.cpp
diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp
deleted file mode 100644
index 67ff2f72857..00000000000
--- a/hip/distributed/index_map_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/index_map_kernels.hpp"
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace index_map {
-
-
-#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc"
-
-
-}  // namespace index_map
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From 6e7c0964ffe003da0f72a347f90348cde7af786e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 29 Jun 2024 20:12:13 +0200
Subject: [PATCH 047/448] unify stopping criteria

---
 common/cuda_hip/CMakeLists.txt                |   2 +
 .../cuda_hip/stop/criterion_kernels.cpp       |   7 +-
 .../cuda_hip/stop/residual_norm_kernels.cpp   |   8 +-
 cuda/CMakeLists.txt                           |   2 -
 cuda/stop/residual_norm_kernels.cu            | 179 ------------------
 hip/CMakeLists.txt                            |   2 -
 hip/stop/criterion_kernels.hip.cpp            |  58 ------
 7 files changed, 10 insertions(+), 248 deletions(-)
 rename cuda/stop/criterion_kernels.cu => common/cuda_hip/stop/criterion_kernels.cpp (89%)
 rename hip/stop/residual_norm_kernels.hip.cpp => common/cuda_hip/stop/residual_norm_kernels.cpp (96%)
 delete mode 100644 cuda/stop/residual_norm_kernels.cu
 delete mode 100644 hip/stop/criterion_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index af6a8c24503..c18755ab164 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -25,6 +25,8 @@ set(CUDA_HIP_SOURCES
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
     solver/multigrid_kernels.cpp
+    stop/criterion_kernels.cpp
+    stop/residual_norm_kernels.cpp
     )
 list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
 set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/cuda/stop/criterion_kernels.cu b/common/cuda_hip/stop/criterion_kernels.cpp
similarity index 89%
rename from cuda/stop/criterion_kernels.cu
rename to common/cuda_hip/stop/criterion_kernels.cpp
index fa596f0c03f..8e3a69f725e 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/common/cuda_hip/stop/criterion_kernels.cpp
@@ -9,13 +9,14 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 #include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Set all statuses namespace.
  * @ref set_status
@@ -38,7 +39,7 @@ __global__ __launch_bounds__(default_block_size) void set_all_statuses(
 }
 
 
-void set_all_statuses(std::shared_ptr<const CudaExecutor> exec,
+void set_all_statuses(std::shared_ptr<const DefaultExecutor> exec,
                       uint8 stoppingId, bool setFinalized,
                       array<stopping_status>* stop_status)
 {
@@ -54,6 +55,6 @@ void set_all_statuses(std::shared_ptr<const CudaExecutor> exec,
 
 
 }  // namespace set_all_statuses
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/common/cuda_hip/stop/residual_norm_kernels.cpp
similarity index 96%
rename from hip/stop/residual_norm_kernels.hip.cpp
rename to common/cuda_hip/stop/residual_norm_kernels.cpp
index 0a9af423128..9d6db5211e8 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/common/cuda_hip/stop/residual_norm_kernels.cpp
@@ -17,7 +17,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Residual norm stopping criterion namespace.
  * @ref resnorm
@@ -61,7 +61,7 @@ __global__ __launch_bounds__(1) void init_kernel(
 
 
 template <typename ValueType>
-void residual_norm(std::shared_ptr<const HipExecutor> exec,
+void residual_norm(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* tau,
                    const matrix::Dense<ValueType>* orig_tau,
                    ValueType rel_residual_goal, uint8 stoppingId,
@@ -143,7 +143,7 @@ __global__ __launch_bounds__(1) void init_kernel(
 
 template <typename ValueType>
 void implicit_residual_norm(
-    std::shared_ptr<const HipExecutor> exec,
+    std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Dense<ValueType>* tau,
     const matrix::Dense<remove_complex<ValueType>>* orig_tau,
     remove_complex<ValueType> rel_residual_goal, uint8 stoppingId,
@@ -175,6 +175,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index a068eb727b7..b44fe665153 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -41,8 +41,6 @@ target_sources(ginkgo_cuda
     solver/batch_cg_kernels.cu
     solver/lower_trs_kernels.cu
     solver/upper_trs_kernels.cu
-    stop/criterion_kernels.cu
-    stop/residual_norm_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}
     ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
deleted file mode 100644
index e52a74cf422..00000000000
--- a/cuda/stop/residual_norm_kernels.cu
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/stop/residual_norm_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/stop/residual_norm.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/base/array_access.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Residual norm stopping criterion namespace.
- * @ref resnorm
- * @ingroup resnorm
- */
-namespace residual_norm {
-
-
-constexpr int default_block_size = 512;
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void residual_norm_kernel(
-    size_type num_cols, ValueType rel_residual_goal,
-    const ValueType* __restrict__ tau, const ValueType* __restrict__ orig_tau,
-    uint8 stoppingId, bool setFinalized,
-    stopping_status* __restrict__ stop_status,
-    bool* __restrict__ device_storage)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx < num_cols) {
-        if (tau[tidx] <= rel_residual_goal * orig_tau[tidx]) {
-            stop_status[tidx].converge(stoppingId, setFinalized);
-            device_storage[1] = true;
-        }
-        // because only false is written to all_converged, write conflicts
-        // should not cause any problem
-        else if (!stop_status[tidx].has_stopped()) {
-            device_storage[0] = false;
-        }
-    }
-}
-
-
-__global__ __launch_bounds__(1) void init_kernel(
-    bool* __restrict__ device_storage)
-{
-    device_storage[0] = true;
-    device_storage[1] = false;
-}
-
-
-template <typename ValueType>
-void residual_norm(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType>* tau,
-                   const matrix::Dense<ValueType>* orig_tau,
-                   ValueType rel_residual_goal, uint8 stoppingId,
-                   bool setFinalized, array<stopping_status>* stop_status,
-                   array<bool>* device_storage, bool* all_converged,
-                   bool* one_changed)
-{
-    static_assert(is_complex_s<ValueType>::value == false,
-                  "ValueType must not be complex in this function!");
-    init_kernel<<<1, 1, 0, exec->get_stream()>>>(
-        as_device_type(device_storage->get_data()));
-
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(tau->get_size()[1], block_size);
-
-    if (grid_size > 0) {
-        residual_norm_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-            tau->get_size()[1], as_device_type(rel_residual_goal),
-            as_device_type(tau->get_const_values()),
-            as_device_type(orig_tau->get_const_values()), stoppingId,
-            setFinalized, as_device_type(stop_status->get_data()),
-            as_device_type(device_storage->get_data()));
-    }
-
-    /* Represents all_converged, one_changed */
-    *all_converged = get_element(*device_storage, 0);
-    *one_changed = get_element(*device_storage, 1);
-}
-
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
-    GKO_DECLARE_RESIDUAL_NORM_KERNEL);
-
-
-}  // namespace residual_norm
-
-
-/**
- * @brief The Implicit Residual norm stopping criterion.
- * @ref implicit_resnorm
- * @ingroup resnorm
- */
-namespace implicit_residual_norm {
-
-
-constexpr int default_block_size = 512;
-
-
-template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
-    size_type num_cols, remove_complex<ValueType> rel_residual_goal,
-    const ValueType* __restrict__ tau,
-    const remove_complex<ValueType>* __restrict__ orig_tau, uint8 stoppingId,
-    bool setFinalized, stopping_status* __restrict__ stop_status,
-    bool* __restrict__ device_storage)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx < num_cols) {
-        if (sqrt(abs(tau[tidx])) <= rel_residual_goal * orig_tau[tidx]) {
-            stop_status[tidx].converge(stoppingId, setFinalized);
-            device_storage[1] = true;
-        }
-        // because only false is written to all_converged, write conflicts
-        // should not cause any problem
-        else if (!stop_status[tidx].has_stopped()) {
-            device_storage[0] = false;
-        }
-    }
-}
-
-
-__global__ __launch_bounds__(1) void init_kernel(
-    bool* __restrict__ device_storage)
-{
-    device_storage[0] = true;
-    device_storage[1] = false;
-}
-
-
-template <typename ValueType>
-void implicit_residual_norm(
-    std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Dense<ValueType>* tau,
-    const matrix::Dense<remove_complex<ValueType>>* orig_tau,
-    remove_complex<ValueType> rel_residual_goal, uint8 stoppingId,
-    bool setFinalized, array<stopping_status>* stop_status,
-    array<bool>* device_storage, bool* all_converged, bool* one_changed)
-{
-    init_kernel<<<1, 1, 0, exec->get_stream()>>>(
-        as_device_type(device_storage->get_data()));
-
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(tau->get_size()[1], block_size);
-
-    if (grid_size > 0) {
-        implicit_residual_norm_kernel<<<grid_size, block_size, 0,
-                                        exec->get_stream()>>>(
-            tau->get_size()[1], as_device_type(rel_residual_goal),
-            as_device_type(tau->get_const_values()),
-            as_device_type(orig_tau->get_const_values()), stoppingId,
-            setFinalized, as_device_type(stop_status->get_data()),
-            as_device_type(device_storage->get_data()));
-    }
-
-    /* Represents all_converged, one_changed */
-    *all_converged = get_element(*device_storage, 0);
-    *one_changed = get_element(*device_storage, 1);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
-
-
-}  // namespace implicit_residual_norm
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index d83e5e28d21..abc3d6b5bcf 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -38,8 +38,6 @@ set(GINKGO_HIP_SOURCES
     solver/batch_cg_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
     solver/upper_trs_kernels.hip.cpp
-    stop/criterion_kernels.hip.cpp
-    stop/residual_norm_kernels.hip.cpp
     ${GKO_UNIFIED_COMMON_SOURCES}
     ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
deleted file mode 100644
index 0b8e300f978..00000000000
--- a/hip/stop/criterion_kernels.hip.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/stop/criterion_kernels.hpp"
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Set all statuses namespace.
- * @ref set_status
- * @ingroup set_all_statuses
- */
-namespace set_all_statuses {
-
-
-constexpr int default_block_size = 512;
-
-
-__global__ __launch_bounds__(default_block_size) void set_all_statuses(
-    size_type num_elems, uint8 stoppingId, bool setFinalized,
-    stopping_status* stop_status)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx < num_elems) {
-        stop_status[tidx].stop(stoppingId, setFinalized);
-    }
-}
-
-
-void set_all_statuses(std::shared_ptr<const HipExecutor> exec, uint8 stoppingId,
-                      bool setFinalized, array<stopping_status>* stop_status)
-{
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(stop_status->get_size(), block_size);
-
-    if (grid_size > 0) {
-        set_all_statuses<<<grid_size, block_size, 0, exec->get_stream()>>>(
-            stop_status->get_size(), stoppingId, setFinalized,
-            as_device_type(stop_status->get_data()));
-    }
-}
-
-
-}  // namespace set_all_statuses
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From b618a7e3e87e0e6d98269f9cb6aa7bc68ed407a1 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 1 Jul 2024 23:44:06 +0200
Subject: [PATCH 048/448] fix include guard naming

---
 common/cuda_hip/base/math.hpp                              | 6 +++---
 common/cuda_hip/components/atomic.hpp                      | 6 +++---
 common/cuda_hip/components/diagonal_block_manipulation.hpp | 6 +++---
 common/cuda_hip/components/intrinsics.hpp                  | 6 +++---
 common/cuda_hip/components/merging.hpp                     | 6 +++---
 common/cuda_hip/components/prefix_sum.hpp                  | 6 +++---
 common/cuda_hip/components/reduction.hpp                   | 6 +++---
 common/cuda_hip/components/searching.hpp                   | 6 +++---
 common/cuda_hip/components/segment_scan.hpp                | 6 +++---
 common/cuda_hip/components/sorting.hpp                     | 6 +++---
 common/cuda_hip/components/syncfree.hpp                    | 6 +++---
 common/cuda_hip/components/thread_ids.hpp                  | 6 +++---
 common/cuda_hip/components/uninitialized_array.hpp         | 6 +++---
 common/cuda_hip/components/warp_blas.hpp                   | 6 +++---
 14 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index ea11c7d73a9..ee8612a691a 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
 
 
 #include <thrust/complex.h>
@@ -54,4 +54,4 @@ struct truncate_type_impl<thrust::complex<T>> {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index 3279c9433f1..cb9e5b00e67 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
 
 
 #include <type_traits>
@@ -250,4 +250,4 @@ __forceinline__ __device__ thrust::complex<double> atomic_add(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp b/common/cuda_hip/components/diagonal_block_manipulation.hpp
index 890d080018e..e00e11f1eea 100644
--- a/common/cuda_hip/components/diagonal_block_manipulation.hpp
+++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
 
 
 #include <type_traits>
@@ -88,4 +88,4 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
diff --git a/common/cuda_hip/components/intrinsics.hpp b/common/cuda_hip/components/intrinsics.hpp
index e8c236e22b1..df3b5ad4c7f 100644
--- a/common/cuda_hip/components/intrinsics.hpp
+++ b/common/cuda_hip/components/intrinsics.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
 
 
 #include <ginkgo/core/base/types.hpp>
@@ -55,4 +55,4 @@ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp
index 4c1bfa4cd2d..ab070741fbd 100644
--- a/common/cuda_hip/components/merging.hpp
+++ b/common/cuda_hip/components/merging.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
 
 
 #include "common/cuda_hip/base/math.hpp"
@@ -302,4 +302,4 @@ __forceinline__ __device__ void sequential_match(const ValueType* a,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
diff --git a/common/cuda_hip/components/prefix_sum.hpp b/common/cuda_hip/components/prefix_sum.hpp
index a09eb8f17c5..ceed6b89a93 100644
--- a/common/cuda_hip/components/prefix_sum.hpp
+++ b/common/cuda_hip/components/prefix_sum.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
 
 
 #include <type_traits>
@@ -182,4 +182,4 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp
index 582de3de1fb..fd9d34ed73c 100644
--- a/common/cuda_hip/components/reduction.hpp
+++ b/common/cuda_hip/components/reduction.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
 
 
 #include <type_traits>
@@ -296,4 +296,4 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
diff --git a/common/cuda_hip/components/searching.hpp b/common/cuda_hip/components/searching.hpp
index 61efde54197..cb219c58b0b 100644
--- a/common/cuda_hip/components/searching.hpp
+++ b/common/cuda_hip/components/searching.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
 
 
 #include "common/cuda_hip/base/config.hpp"
@@ -228,4 +228,4 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
diff --git a/common/cuda_hip/components/segment_scan.hpp b/common/cuda_hip/components/segment_scan.hpp
index af3953a4176..0ab34fd093b 100644
--- a/common/cuda_hip/components/segment_scan.hpp
+++ b/common/cuda_hip/components/segment_scan.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
 
 
 #include "common/cuda_hip/components/cooperative_groups.hpp"
@@ -52,4 +52,4 @@ __device__ __forceinline__ bool segment_scan(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp
index b3ce253b451..7603d41a8ba 100644
--- a/common/cuda_hip/components/sorting.hpp
+++ b/common/cuda_hip/components/sorting.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
 
 
 #include "common/cuda_hip/base/config.hpp"
@@ -311,4 +311,4 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
diff --git a/common/cuda_hip/components/syncfree.hpp b/common/cuda_hip/components/syncfree.hpp
index e1693fe4e4d..f2fb82366a2 100644
--- a/common/cuda_hip/components/syncfree.hpp
+++ b/common/cuda_hip/components/syncfree.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
 
 
 #include <ginkgo/core/base/array.hpp>
@@ -135,4 +135,4 @@ class syncfree_scheduler {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
diff --git a/common/cuda_hip/components/thread_ids.hpp b/common/cuda_hip/components/thread_ids.hpp
index 7d7c5e2bda3..e73296f92a9 100644
--- a/common/cuda_hip/components/thread_ids.hpp
+++ b/common/cuda_hip/components/thread_ids.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
 
 
 #include "common/cuda_hip/base/config.hpp"
@@ -263,4 +263,4 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat()
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp
index d4a2b5939af..44fcbfd0d85 100644
--- a/common/cuda_hip/components/uninitialized_array.hpp
+++ b/common/cuda_hip/components/uninitialized_array.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
 
 
 #include <ginkgo/core/base/types.hpp>
@@ -82,4 +82,4 @@ class uninitialized_array {
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp
index cfa46b8a045..116b963ad11 100644
--- a/common/cuda_hip/components/warp_blas.hpp
+++ b/common/cuda_hip/components/warp_blas.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
-#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_
 
 
 #include <cassert>
@@ -434,4 +434,4 @@ __device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
 }  // namespace gko
 
 
-#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_

From d16dd18d45d41a43c5b738455b377a1e9a0601ab Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 1 Jul 2024 23:44:33 +0200
Subject: [PATCH 049/448] fix formatting

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
---
 common/cuda_hip/base/math.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index ee8612a691a..8c655174524 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -49,8 +49,6 @@ struct truncate_type_impl<thrust::complex<T>> {
 
 
 }  // namespace detail
-
-
 }  // namespace gko
 
 

From 6762a902939db8ee853168dfb23e58ee4e70f4a7 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 2 Jul 2024 15:23:41 +0200
Subject: [PATCH 050/448] add unification script

---
 dev_tools/scripts/unify_cuda_hip.py | 135 ++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 dev_tools/scripts/unify_cuda_hip.py

diff --git a/dev_tools/scripts/unify_cuda_hip.py b/dev_tools/scripts/unify_cuda_hip.py
new file mode 100644
index 00000000000..e359a69d1ff
--- /dev/null
+++ b/dev_tools/scripts/unify_cuda_hip.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+import sys
+import os
+import difflib
+import subprocess
+
+common_filename = sys.argv[1]
+base_filename = common_filename.replace("common/cuda_hip/", "").replace(".hpp.inc", "")
+cuda_filename = next(
+    f"cuda/{base_filename}{extension}"
+    for extension in [".cu", ".cuh", ".cpp", ".hpp", ".template.cu"]
+    if os.path.exists(f"cuda/{base_filename}{extension}")
+)
+hip_filename = next(
+    f"hip/{base_filename}{extension}"
+    for extension in [".hip.cpp", ".hip.hpp", ".template.hip.cpp"]
+    if os.path.exists(f"hip/{base_filename}{extension}")
+)
+output_filename = f"common/cuda_hip/{base_filename}{'.cpp' if cuda_filename.endswith('.cu') else '.hpp'}"
+
+common_lines = list(open(common_filename))[3:]  # remove license header
+cuda_lines = list(open(cuda_filename))
+hip_lines = list(open(hip_filename))
+
+cuda_file_guard = f"GKO_{cuda_filename.upper().replace('/', '_').replace('.','_')}_"
+hip_file_guard = f"GKO_{hip_filename.upper().replace('/', '_').replace('.','_')}_"
+common_file_guard = f"GKO_{common_filename.upper().replace('/', '_').replace('.','_')}_"
+
+cuda_lines = [
+    line.replace(cuda_file_guard, common_file_guard)
+    .replace("namespace cuda", "namespace GKO_DEVICE_NAMESPACE")
+    .replace("CudaExecutor", "DefaultExecutor")
+    for line in cuda_lines
+]
+hip_lines = [
+    line.replace(hip_file_guard, common_file_guard)
+    .replace("namespace hip", "namespace GKO_DEVICE_NAMESPACE")
+    .replace("HipExecutor", "DefaultExecutor")
+    for line in hip_lines
+]
+
+for i in range(len(cuda_lines)):
+    if cuda_lines[i].startswith('#include "'):
+        cuda_lines[i] = (
+            cuda_lines[i]
+            .replace('#include "cuda/', '#include "common/cuda_hip/')
+            .replace(".cuh", ".hpp")
+            .replace("cublas", "blas")
+            .replace("cusparse", "sparselib")
+            .replace("curand", "randlib")
+        )
+    cuda_lines[i] = (
+        cuda_lines[i]
+        .replace("cuda_range", "device_range")
+        .replace("cuda::", "GKO_DEVICE_NAMESPACE::")
+    )
+for i in range(len(hip_lines)):
+    if hip_lines[i].startswith('#include "'):
+        hip_lines[i] = (
+            hip_lines[i]
+            .replace('#include "hip/', '#include "common/cuda_hip/')
+            .replace(".hip.hpp", ".hpp")
+            .replace("hipblas", "blas")
+            .replace("hipsparse", "sparselib")
+            .replace("hiprand", "randlib")
+        )
+    hip_lines[i] = (
+        hip_lines[i]
+        .replace("hip_range", "device_range")
+        .replace("hip::", "GKO_DEVICE_NAMESPACE::")
+    )
+
+cuda_location = next(
+    i
+    for i, line in enumerate(cuda_lines)
+    if line.startswith(f'#include "{common_filename}"')
+)
+hip_location = next(
+    i
+    for i, line in enumerate(hip_lines)
+    if line.startswith(f'#include "{common_filename}"')
+)
+cuda_replaced = (
+    cuda_lines[:cuda_location] + common_lines + cuda_lines[cuda_location + 1 :]
+)
+hip_replaced = hip_lines[:hip_location] + common_lines + hip_lines[hip_location + 1 :]
+
+cuda_replaced = (
+    subprocess.run(
+        args=[
+            "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format",
+            f"-assume-filename={output_filename}",
+            "-",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        input=bytes("".join(cuda_replaced), "utf-8"),
+    )
+    .stdout.decode()
+    .splitlines()
+)
+hip_replaced = (
+    subprocess.run(
+        args=[
+            "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format",
+            f"-assume-filename={output_filename}",
+            "-",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        input=bytes("".join(hip_replaced), "utf-8"),
+    )
+    .stdout.decode()
+    .splitlines()
+)
+
+if cuda_replaced == hip_replaced:
+    with open(output_filename, "w") as file:
+        file.write("\n".join(cuda_replaced))
+    os.remove(common_filename)
+    os.remove(cuda_filename)
+    os.remove(hip_filename)
+    with open("cuda_source_delete.sed", "a") as file:
+        file.write("/" + cuda_filename[5:].replace("/", "\\/") + "/d;")
+    with open("hip_source_delete.sed", "a") as file:
+        file.write("/" + hip_filename[4:].replace("/", "\\/") + "/d;")
+    with open("source_add.cmake", "a") as file:
+        file.write(f"{output_filename}\n")
+    sys.exit(0)
+else:
+    print(common_filename)
+    print(cuda_filename)
+    print(hip_filename)
+    print("\n".join(difflib.unified_diff(cuda_replaced, hip_replaced)))
+    sys.exit(1)

From c8c7051c2e3c224532a7d87d76575340d8ca2bf7 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 15:51:47 +0200
Subject: [PATCH 051/448] mirror cuda_hip in generated headers

---
 common/cuda_hip/preconditioner/jacobi_kernels.cpp              | 3 ++-
 cuda/CMakeLists.txt                                            | 2 +-
 cuda/preconditioner/batch_jacobi_kernels.cu                    | 3 ++-
 cuda/preconditioner/jacobi_advanced_apply_kernels.cu           | 3 ++-
 .../jacobi_advanced_apply_kernels.instantiate.cu               | 3 ++-
 cuda/preconditioner/jacobi_generate_kernels.cu                 | 3 ++-
 cuda/preconditioner/jacobi_generate_kernels.instantiate.cu     | 3 ++-
 cuda/preconditioner/jacobi_simple_apply_kernels.cu             | 3 ++-
 cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu | 3 ++-
 hip/CMakeLists.txt                                             | 2 +-
 hip/preconditioner/batch_jacobi_kernels.hip.cpp                | 3 ++-
 hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp       | 3 ++-
 .../jacobi_advanced_apply_kernels.instantiate.hip.cpp          | 3 ++-
 hip/preconditioner/jacobi_generate_kernels.hip.cpp             | 3 ++-
 hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp | 3 ++-
 hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp         | 3 ++-
 .../jacobi_simple_apply_kernels.instantiate.hip.cpp            | 3 ++-
 17 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index f614070f65e..8cf5ad1e9fd 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -15,7 +15,8 @@
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index b44fe665153..30b3f2747e6 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -74,7 +74,7 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES)
 endforeach()
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
 string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
-configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp)
+configure_file(preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
     # remove false positive CUDA warnings when calling one<T>() and zero<T>()
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 178b53d04ea..716c158ffff 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -21,7 +21,8 @@
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/matrix/batch_struct.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
index fca6b24ba05..a37296abf40 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
@@ -7,7 +7,8 @@
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index 80c3b5e1e73..fcf238d038f 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -15,7 +15,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
index e558594f5ce..d51f1947b7a 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.cu
@@ -8,7 +8,8 @@
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index 0dc21311af9..aa8807728a8 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -18,7 +18,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
index 0bb09b1064a..62e49d30618 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
@@ -7,7 +7,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 0721c03126b..d51b63487fe 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -15,7 +15,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index abc3d6b5bcf..23584c2742a 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -84,7 +84,7 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
         ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
 endforeach()
 string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
-configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hpp)
+configure_file(preconditioner/jacobi_common.hip.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
 
 set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP)
 add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES})
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index cfef615dcad..e86bc86390a 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -23,7 +23,8 @@
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
index ce260ec1e16..371a10051fc 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
@@ -7,7 +7,8 @@
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index 9cc4978a1f8..42c542c228b 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -16,7 +16,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 673ca8c373e..d295ebb046e 100644
--- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -19,7 +19,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index a6be610a839..698efe6a858 100644
--- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -18,7 +18,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index 72f2e4fe556..16ca805a42c 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -16,7 +16,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index 1ea34bff93f..d666a698b5e 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -15,7 +15,8 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "preconditioner/jacobi_common.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
 
 namespace gko {

From 65aa14fb75c72042297f70b85935e7b009d5e6d8 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 16:07:32 +0200
Subject: [PATCH 052/448] fix HIP warnings

The % in the computation gets misinterpreted by the printf used as a fallback for GKO_ASSERT
---
 hip/solver/batch_bicgstab_kernels.hip.cpp | 7 +++++--
 hip/solver/batch_cg_kernels.hip.cpp       | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 92051a81640..95a49953b3e 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -125,8 +125,9 @@ class kernel_caller {
             exec_->get_device_id()));
         const int block_size =
             get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
+        bool is_block_size_aligned = block_size % config::warp_size == 0;
         GKO_ASSERT(block_size >= 2 * config::warp_size);
-        GKO_ASSERT(block_size % config::warp_size == 0);
+        GKO_ASSERT(is_block_size_aligned);
 
         // Returns amount required in bytes
         const size_t prec_size = PrecType::dynamic_work_size(
@@ -142,7 +143,9 @@ class kernel_caller {
         auto workspace = gko::array<value_type>(
             exec_,
             sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+        bool is_stride_aligned =
+            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
+        GKO_ASSERT(is_stride_aligned);
 
         value_type* const workspace_data = workspace.get_data();
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 2df02a6f0a8..6102749b988 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -125,8 +125,9 @@ class kernel_caller {
             exec_->get_device_id()));
         const int block_size =
             get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
+        bool is_block_size_aligned = block_size % config::warp_size == 0;
         GKO_ASSERT(block_size >= 2 * config::warp_size);
-        GKO_ASSERT(block_size % config::warp_size == 0);
+        GKO_ASSERT(is_block_size_aligned);
 
         // Returns amount required in bytes
         const size_t prec_size = PrecType::dynamic_work_size(
@@ -142,7 +143,9 @@ class kernel_caller {
         auto workspace = gko::array<value_type>(
             exec_,
             sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+        bool is_stride_aligned =
+            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
+        GKO_ASSERT(is_stride_aligned);
 
         value_type* const workspace_data = workspace.get_data();
 

From 674c54a3c83dca988974e37c42ac25ee25e7bede Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 16:10:09 +0200
Subject: [PATCH 053/448] review updates

Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 common/cuda_hip/components/atomic.hpp             | 2 --
 common/cuda_hip/components/reduction.hpp          | 4 ++--
 common/cuda_hip/distributed/index_map_kernels.cpp | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index cb9e5b00e67..2fbb1664165 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -15,8 +15,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-
-
 namespace detail {
 
 
diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp
index fd9d34ed73c..1968a6d30b6 100644
--- a/common/cuda_hip/components/reduction.hpp
+++ b/common/cuda_hip/components/reduction.hpp
@@ -260,8 +260,8 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_
  * @return the reduction result
  */
 template <typename ValueType>
-__host__ ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
-                                    size_type size, const ValueType* source)
+ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
+                           size_type size, const ValueType* source)
 {
     auto block_results_val = source;
     size_type grid_dim = size;
diff --git a/common/cuda_hip/distributed/index_map_kernels.cpp b/common/cuda_hip/distributed/index_map_kernels.cpp
index 744d0f5581f..e27c5221013 100644
--- a/common/cuda_hip/distributed/index_map_kernels.cpp
+++ b/common/cuda_hip/distributed/index_map_kernels.cpp
@@ -299,4 +299,4 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
 }  // namespace index_map
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko

From 53be0aa665401be322c76f0d6eb645315ce83ba3 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 09:00:15 +0200
Subject: [PATCH 054/448] Revert "add unification script"

This reverts commit a0f5a82289392e2281b8fe2fc0713cf4043a480c.
---
 dev_tools/scripts/unify_cuda_hip.py | 135 ----------------------------
 1 file changed, 135 deletions(-)
 delete mode 100644 dev_tools/scripts/unify_cuda_hip.py

diff --git a/dev_tools/scripts/unify_cuda_hip.py b/dev_tools/scripts/unify_cuda_hip.py
deleted file mode 100644
index e359a69d1ff..00000000000
--- a/dev_tools/scripts/unify_cuda_hip.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-import difflib
-import subprocess
-
-common_filename = sys.argv[1]
-base_filename = common_filename.replace("common/cuda_hip/", "").replace(".hpp.inc", "")
-cuda_filename = next(
-    f"cuda/{base_filename}{extension}"
-    for extension in [".cu", ".cuh", ".cpp", ".hpp", ".template.cu"]
-    if os.path.exists(f"cuda/{base_filename}{extension}")
-)
-hip_filename = next(
-    f"hip/{base_filename}{extension}"
-    for extension in [".hip.cpp", ".hip.hpp", ".template.hip.cpp"]
-    if os.path.exists(f"hip/{base_filename}{extension}")
-)
-output_filename = f"common/cuda_hip/{base_filename}{'.cpp' if cuda_filename.endswith('.cu') else '.hpp'}"
-
-common_lines = list(open(common_filename))[3:]  # remove license header
-cuda_lines = list(open(cuda_filename))
-hip_lines = list(open(hip_filename))
-
-cuda_file_guard = f"GKO_{cuda_filename.upper().replace('/', '_').replace('.','_')}_"
-hip_file_guard = f"GKO_{hip_filename.upper().replace('/', '_').replace('.','_')}_"
-common_file_guard = f"GKO_{common_filename.upper().replace('/', '_').replace('.','_')}_"
-
-cuda_lines = [
-    line.replace(cuda_file_guard, common_file_guard)
-    .replace("namespace cuda", "namespace GKO_DEVICE_NAMESPACE")
-    .replace("CudaExecutor", "DefaultExecutor")
-    for line in cuda_lines
-]
-hip_lines = [
-    line.replace(hip_file_guard, common_file_guard)
-    .replace("namespace hip", "namespace GKO_DEVICE_NAMESPACE")
-    .replace("HipExecutor", "DefaultExecutor")
-    for line in hip_lines
-]
-
-for i in range(len(cuda_lines)):
-    if cuda_lines[i].startswith('#include "'):
-        cuda_lines[i] = (
-            cuda_lines[i]
-            .replace('#include "cuda/', '#include "common/cuda_hip/')
-            .replace(".cuh", ".hpp")
-            .replace("cublas", "blas")
-            .replace("cusparse", "sparselib")
-            .replace("curand", "randlib")
-        )
-    cuda_lines[i] = (
-        cuda_lines[i]
-        .replace("cuda_range", "device_range")
-        .replace("cuda::", "GKO_DEVICE_NAMESPACE::")
-    )
-for i in range(len(hip_lines)):
-    if hip_lines[i].startswith('#include "'):
-        hip_lines[i] = (
-            hip_lines[i]
-            .replace('#include "hip/', '#include "common/cuda_hip/')
-            .replace(".hip.hpp", ".hpp")
-            .replace("hipblas", "blas")
-            .replace("hipsparse", "sparselib")
-            .replace("hiprand", "randlib")
-        )
-    hip_lines[i] = (
-        hip_lines[i]
-        .replace("hip_range", "device_range")
-        .replace("hip::", "GKO_DEVICE_NAMESPACE::")
-    )
-
-cuda_location = next(
-    i
-    for i, line in enumerate(cuda_lines)
-    if line.startswith(f'#include "{common_filename}"')
-)
-hip_location = next(
-    i
-    for i, line in enumerate(hip_lines)
-    if line.startswith(f'#include "{common_filename}"')
-)
-cuda_replaced = (
-    cuda_lines[:cuda_location] + common_lines + cuda_lines[cuda_location + 1 :]
-)
-hip_replaced = hip_lines[:hip_location] + common_lines + hip_lines[hip_location + 1 :]
-
-cuda_replaced = (
-    subprocess.run(
-        args=[
-            "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format",
-            f"-assume-filename={output_filename}",
-            "-",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        input=bytes("".join(cuda_replaced), "utf-8"),
-    )
-    .stdout.decode()
-    .splitlines()
-)
-hip_replaced = (
-    subprocess.run(
-        args=[
-            "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format",
-            f"-assume-filename={output_filename}",
-            "-",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        input=bytes("".join(hip_replaced), "utf-8"),
-    )
-    .stdout.decode()
-    .splitlines()
-)
-
-if cuda_replaced == hip_replaced:
-    with open(output_filename, "w") as file:
-        file.write("\n".join(cuda_replaced))
-    os.remove(common_filename)
-    os.remove(cuda_filename)
-    os.remove(hip_filename)
-    with open("cuda_source_delete.sed", "a") as file:
-        file.write("/" + cuda_filename[5:].replace("/", "\\/") + "/d;")
-    with open("hip_source_delete.sed", "a") as file:
-        file.write("/" + hip_filename[4:].replace("/", "\\/") + "/d;")
-    with open("source_add.cmake", "a") as file:
-        file.write(f"{output_filename}\n")
-    sys.exit(0)
-else:
-    print(common_filename)
-    print(cuda_filename)
-    print(hip_filename)
-    print("\n".join(difflib.unified_diff(cuda_replaced, hip_replaced)))
-    sys.exit(1)

From 9e7a334cce2f6042a06c00f5b5a34bce22d65e26 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 31 May 2024 15:01:22 +0200
Subject: [PATCH 055/448] add debug logger for solvers

---
 core/CMakeLists.txt                      |   1 +
 core/log/solver_debug.cpp                | 126 +++++++++++++++++++++++
 core/test/log/CMakeLists.txt             |   1 +
 core/test/log/solver_debug.cpp           |  77 ++++++++++++++
 include/ginkgo/core/log/solver_debug.hpp |  80 ++++++++++++++
 include/ginkgo/ginkgo.hpp                |   1 +
 6 files changed, 286 insertions(+)
 create mode 100644 core/log/solver_debug.cpp
 create mode 100644 core/test/log/solver_debug.cpp
 create mode 100644 include/ginkgo/core/log/solver_debug.hpp

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 14ae6ce6592..56d35e8edf0 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -62,6 +62,7 @@ target_sources(${ginkgo_core}
     log/tau.cpp
     log/vtune.cpp
     log/record.cpp
+    log/solver_debug.cpp
     log/stream.cpp
     matrix/batch_csr.cpp
     matrix/batch_dense.cpp
diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
new file mode 100644
index 00000000000..be945233a61
--- /dev/null
+++ b/core/log/solver_debug.cpp
@@ -0,0 +1,126 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/log/solver_debug.hpp>
+
+
+#include <iomanip>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+#include <ginkgo/core/log/logger.hpp>
+#include <ginkgo/core/solver/solver_base.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+static void print_scalar(const LinOp* value, std::ostream& stream)
+{
+    using conv_to_double = ConvertibleTo<matrix::Dense<double>>;
+    using conv_to_complex = ConvertibleTo<matrix::Dense<std::complex<double>>>;
+    const auto host_exec = value->get_executor()->get_master();
+    if (value->get_size()[0] == 0) {
+        stream << "<empty>";
+    } else if (value->get_size()[0] != 1) {
+        stream << "<matrix>";
+    } else if (dynamic_cast<const conv_to_double*>(value)) {
+        auto host_value = matrix::Dense<double>::create(host_exec);
+        host_value->copy_from(value);
+        stream << host_value->at(0, 0);
+    } else if (dynamic_cast<const conv_to_complex*>(value)) {
+        auto host_value =
+            matrix::Dense<std::complex<double>>::create(host_exec);
+        host_value->copy_from(value);
+        stream << host_value->at(0, 0);
+    } else {
+        stream << "<unknown>";
+    }
+}
+
+
+void SolverDebug::on_linop_apply_started(const LinOp* solver, const LinOp* in,
+                                         const LinOp* out) const
+{
+    using solver_base = solver::detail::SolverBaseLinOp;
+    auto dynamic_type = name_demangling::get_dynamic_type(*solver);
+    auto& stream = *output_;
+    stream << dynamic_type << "::apply(" << in << ',' << out
+           << ") of dimensions " << solver->get_size() << " and "
+           << in->get_size()[1] << " rhs\n";
+    if (const auto base = dynamic_cast<const solver_base*>(solver)) {
+        const auto scalars = base->get_workspace_scalars();
+        const auto names = base->get_workspace_op_names();
+        stream << std::setw(column_width_) << "Iteration";
+        for (auto scalar : scalars) {
+            stream << std::setw(column_width_) << names[scalar];
+        }
+        stream << '\n';
+    } else {
+        stream << "This solver type is not supported by the SolverDebug logger";
+    }
+}
+
+
+void SolverDebug::on_iteration_complete(
+    const LinOp* solver, const LinOp* right_hand_side, const LinOp* solution,
+    const size_type& num_iterations, const LinOp* residual,
+    const LinOp* residual_norm, const LinOp* implicit_sq_residual_norm,
+    const array<stopping_status>* status, bool stopped) const
+{
+    using solver_base = solver::detail::SolverBaseLinOp;
+    auto& stream = *output_;
+    stream << std::setprecision(precision_);
+    if (const auto base = dynamic_cast<const solver_base*>(solver)) {
+        const auto scalars = base->get_workspace_scalars();
+        stream << std::setw(column_width_) << num_iterations;
+        for (auto scalar : scalars) {
+            stream << std::setw(column_width_);
+            print_scalar(base->get_workspace_op(scalar), stream);
+        }
+        stream << '\n';
+    }
+}
+
+
+void SolverDebug::on_iteration_complete(const LinOp* solver,
+                                        const size_type& num_iterations,
+                                        const LinOp* residual,
+                                        const LinOp* solution,
+                                        const LinOp* residual_norm) const
+{
+    on_iteration_complete(solver, nullptr, solution, num_iterations, residual,
+                          residual_norm, nullptr, nullptr, false);
+}
+
+
+void SolverDebug::on_iteration_complete(
+    const LinOp* solver, const size_type& num_iterations, const LinOp* residual,
+    const LinOp* solution, const LinOp* residual_norm,
+    const LinOp* implicit_sq_residual_norm) const
+{
+    on_iteration_complete(solver, nullptr, solution, num_iterations, residual,
+                          residual_norm, implicit_sq_residual_norm, nullptr,
+                          false);
+}
+
+
+SolverDebug::SolverDebug(std::ostream& stream, int precision, int column_width)
+    : output_{&stream}, precision_{precision}, column_width_{column_width}
+{}
+
+
+std::shared_ptr<SolverDebug> SolverDebug::create(std::ostream& output,
+                                                 int precision,
+                                                 int column_width)
+{
+    return std::shared_ptr<SolverDebug>{
+        new SolverDebug{output, precision, column_width}};
+}
+
+
+}  // namespace log
+}  // namespace gko
diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt
index 8efd7fafc46..1231b996f5a 100644
--- a/core/test/log/CMakeLists.txt
+++ b/core/test/log/CMakeLists.txt
@@ -6,4 +6,5 @@ endif()
 ginkgo_create_test(performance_hint)
 ginkgo_create_test(profiler_hook)
 ginkgo_create_test(record)
+ginkgo_create_test(solver_debug)
 ginkgo_create_test(stream)
diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp
new file mode 100644
index 00000000000..2b0ec771590
--- /dev/null
+++ b/core/test/log/solver_debug.cpp
@@ -0,0 +1,77 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/log/solver_debug.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/solver/cg.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+template <typename T>
+class SolverDebug : public ::testing::Test {
+public:
+    using Dense = gko::matrix::Dense<T>;
+    using Cg = gko::solver::Cg<T>;
+
+    SolverDebug() : ref{gko::ReferenceExecutor::create()}
+    {
+        mtx = gko::initialize<Dense>({T{1.0}}, ref);
+        in = gko::initialize<Dense>({T{2.0}}, ref);
+        out = mtx->clone();
+        solver =
+            Cg::build()
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
+                .on(ref)
+                ->generate(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<Dense> mtx;
+    std::shared_ptr<Dense> in;
+    std::unique_ptr<Dense> out;
+    std::unique_ptr<Cg> solver;
+};
+
+TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(SolverDebug, Works)
+{
+    using T = TypeParam;
+    std::stringstream ref_ss;
+    int default_column_width = 12;
+    auto dynamic_type = gko::name_demangling::get_dynamic_type(*this->solver);
+    ref_ss << dynamic_type << "::apply(" << this->in.get() << ','
+           << this->out.get() << ") of dimensions " << this->solver->get_size()
+           << " and " << this->in->get_size()[1] << " rhs\n";
+    ref_ss << std::setw(default_column_width) << "Iteration"
+           << std::setw(default_column_width) << "alpha"
+           << std::setw(default_column_width) << "beta"
+           << std::setw(default_column_width) << "prev_rho"
+           << std::setw(default_column_width) << "rho" << '\n';
+    ref_ss << std::setw(default_column_width) << 0
+           << std::setw(default_column_width) << T{0.0}
+           << std::setw(default_column_width) << T{0.0}
+           << std::setw(default_column_width) << T{1.0}
+           << std::setw(default_column_width) << T{1.0} << '\n'
+           << std::setw(default_column_width) << 1
+           << std::setw(default_column_width) << T{0.0}
+           << std::setw(default_column_width) << T{1.0}
+           << std::setw(default_column_width) << T{0.0}
+           << std::setw(default_column_width) << T{1.0} << '\n';
+    std::stringstream ss;
+    this->solver->add_logger(gko::log::SolverDebug::create(ss));
+
+    this->solver->apply(this->in, this->out);
+
+    ASSERT_EQ(ss.str(), ref_ss.str());
+}
diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp
new file mode 100644
index 00000000000..873a7a247cf
--- /dev/null
+++ b/include/ginkgo/core/log/solver_debug.hpp
@@ -0,0 +1,80 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
+#define GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
+
+
+#include <iosfwd>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/log/logger.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+/**
+ * This Logger prints the value of all scalar values stored internally by the
+ * solver after each iteration. If the solver is applied to multiple right-hand
+ * sides, only the first right-hand side gets printed.
+ */
+class SolverDebug : public Logger {
+public:
+    /* Internal solver events */
+    void on_linop_apply_started(const LinOp* A, const LinOp* b,
+                                const LinOp* x) const override;
+
+    void on_iteration_complete(
+        const LinOp* solver, const LinOp* right_hand_side,
+        const LinOp* solution, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm,
+        const array<stopping_status>* status, bool stopped) const override;
+
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override;
+
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(
+        const LinOp* solver, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* solution,
+        const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm) const override;
+
+    /**
+     * Creates a logger printing the value for all scalar values in the solver
+     * after each iteration.
+     *
+     * @param output  the stream to write the output to.
+     * @param precision  the number of digits of precision to print
+     * @param column_width  the number of characters an output column is wide
+     */
+    static std::shared_ptr<SolverDebug> create(std::ostream& output,
+                                               int precision = 6,
+                                               int column_width = 12);
+
+private:
+    SolverDebug(std::ostream& output, int precision, int column_width);
+
+    std::ostream* output_;
+    int precision_;
+    int column_width_;
+};
+
+
+}  // namespace log
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 503b0143e09..2e307792c85 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -87,6 +87,7 @@
 #include <ginkgo/core/log/performance_hint.hpp>
 #include <ginkgo/core/log/profiler_hook.hpp>
 #include <ginkgo/core/log/record.hpp>
+#include <ginkgo/core/log/solver_debug.hpp>
 #include <ginkgo/core/log/stream.hpp>
 
 #include <ginkgo/core/matrix/batch_csr.hpp>

From fef293b2b80c008c80efebdf218a38db02559c35 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 5 Jun 2024 10:09:14 +0200
Subject: [PATCH 056/448] add missing includes

---
 core/log/solver_debug.cpp                | 1 +
 include/ginkgo/core/log/solver_debug.hpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
index be945233a61..760f182bde1 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_debug.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/log/logger.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
 
diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp
index 873a7a247cf..9b589f29b88 100644
--- a/include/ginkgo/core/log/solver_debug.hpp
+++ b/include/ginkgo/core/log/solver_debug.hpp
@@ -7,6 +7,7 @@
 
 
 #include <iosfwd>
+#include <memory>
 
 
 #include <ginkgo/config.hpp>

From 2e222abfedf0c88d58aba2f96423b82c188102c0 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 7 Jun 2024 14:57:29 +0200
Subject: [PATCH 057/448] add file and csv output loggers

---
 core/log/solver_debug.cpp                | 336 ++++++++++++++++++-----
 core/test/log/solver_debug.cpp           | 114 +++++++-
 include/ginkgo/core/log/solver_debug.hpp |  66 ++---
 3 files changed, 401 insertions(+), 115 deletions(-)

diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
index 760f182bde1..69b6a3ad692 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_debug.cpp
@@ -5,121 +5,315 @@
 #include <ginkgo/core/log/solver_debug.hpp>
 
 
+#include <fstream>
 #include <iomanip>
+#include <iostream>
+#include <string>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
 
+#include "core/base/dispatch_helper.hpp"
+
+
 namespace gko {
 namespace log {
+namespace {
 
 
-static void print_scalar(const LinOp* value, std::ostream& stream)
+template <typename Functor>
+static bool dispatch_type(const LinOp* value, Functor fn)
 {
+    const auto host_exec = value->get_executor()->get_master();
     using conv_to_double = ConvertibleTo<matrix::Dense<double>>;
     using conv_to_complex = ConvertibleTo<matrix::Dense<std::complex<double>>>;
-    const auto host_exec = value->get_executor()->get_master();
-    if (value->get_size()[0] == 0) {
-        stream << "<empty>";
-    } else if (value->get_size()[0] != 1) {
-        stream << "<matrix>";
-    } else if (dynamic_cast<const conv_to_double*>(value)) {
-        auto host_value = matrix::Dense<double>::create(host_exec);
-        host_value->copy_from(value);
-        stream << host_value->at(0, 0);
+    if (dynamic_cast<const conv_to_double*>(value)) {
+        auto host_vec = matrix::Dense<double>::create(host_exec);
+        host_vec->copy_from(value);
+        fn(host_vec.get());
+        return true;
     } else if (dynamic_cast<const conv_to_complex*>(value)) {
-        auto host_value =
-            matrix::Dense<std::complex<double>>::create(host_exec);
-        host_value->copy_from(value);
-        stream << host_value->at(0, 0);
+        auto host_vec = matrix::Dense<std::complex<double>>::create(host_exec);
+        host_vec->copy_from(value);
+        fn(host_vec.get());
+        return true;
     } else {
-        stream << "<unknown>";
+        return false;
     }
 }
 
 
-void SolverDebug::on_linop_apply_started(const LinOp* solver, const LinOp* in,
-                                         const LinOp* out) const
-{
-    using solver_base = solver::detail::SolverBaseLinOp;
-    auto dynamic_type = name_demangling::get_dynamic_type(*solver);
-    auto& stream = *output_;
-    stream << dynamic_type << "::apply(" << in << ',' << out
-           << ") of dimensions " << solver->get_size() << " and "
-           << in->get_size()[1] << " rhs\n";
-    if (const auto base = dynamic_cast<const solver_base*>(solver)) {
-        const auto scalars = base->get_workspace_scalars();
-        const auto names = base->get_workspace_op_names();
-        stream << std::setw(column_width_) << "Iteration";
-        for (auto scalar : scalars) {
-            stream << std::setw(column_width_) << names[scalar];
-        }
-        stream << '\n';
-    } else {
-        stream << "This solver type is not supported by the SolverDebug logger";
-    }
-}
+class SolverDebugPrint : public SolverDebug {
+    friend class SolverDebug;
 
+public:
+    /* Internal solver events */
+    void on_linop_apply_started(const LinOp* solver, const LinOp* in,
+                                const LinOp* out) const override
+    {
+        printed_header_ = false;
+    }
 
-void SolverDebug::on_iteration_complete(
-    const LinOp* solver, const LinOp* right_hand_side, const LinOp* solution,
-    const size_type& num_iterations, const LinOp* residual,
-    const LinOp* residual_norm, const LinOp* implicit_sq_residual_norm,
-    const array<stopping_status>* status, bool stopped) const
-{
-    using solver_base = solver::detail::SolverBaseLinOp;
-    auto& stream = *output_;
-    stream << std::setprecision(precision_);
-    if (const auto base = dynamic_cast<const solver_base*>(solver)) {
+    void on_iteration_complete(
+        const LinOp* solver, const LinOp* right_hand_side,
+        const LinOp* solution, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm,
+        const array<stopping_status>* status, bool stopped) const override
+    {
+        using solver_base = solver::detail::SolverBaseLinOp;
+        auto dynamic_type = name_demangling::get_dynamic_type(*solver);
+        auto& stream = *output_;
+        auto base = gko::as<solver_base>(solver);
+        if (!printed_header_) {
+            stream << dynamic_type << "::apply(" << right_hand_side << ','
+                   << solution << ") of dimensions " << solver->get_size()
+                   << " and " << right_hand_side->get_size()[1] << " rhs\n";
+            const auto scalars = base->get_workspace_scalars();
+            const auto names = base->get_workspace_op_names();
+            stream << std::setw(column_width_) << "Iteration";
+            for (auto scalar : scalars) {
+                if (separator_) {
+                    stream << separator_;
+                }
+                stream << std::setw(column_width_) << names[scalar];
+            }
+            if (residual_norm) {
+                if (separator_) {
+                    stream << separator_;
+                }
+                stream << std::setw(column_width_) << "residual_norm";
+            }
+            if (implicit_sq_residual_norm) {
+                if (separator_) {
+                    stream << separator_;
+                }
+                stream << std::setw(column_width_)
+                       << "implicit_sq_residual_norm";
+            }
+            stream << '\n';
+            printed_header_ = true;
+        }
+        stream << std::setprecision(precision_);
         const auto scalars = base->get_workspace_scalars();
         stream << std::setw(column_width_) << num_iterations;
         for (auto scalar : scalars) {
-            stream << std::setw(column_width_);
             print_scalar(base->get_workspace_op(scalar), stream);
         }
+        if (residual_norm) {
+            print_scalar(residual_norm, stream);
+        }
+        if (implicit_sq_residual_norm) {
+            print_scalar(implicit_sq_residual_norm, stream);
+        }
         stream << '\n';
     }
-}
 
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override
+    {
+        on_iteration_complete(solver, nullptr, solution, num_iterations,
+                              residual, residual_norm, nullptr, nullptr, false);
+    }
 
-void SolverDebug::on_iteration_complete(const LinOp* solver,
-                                        const size_type& num_iterations,
-                                        const LinOp* residual,
-                                        const LinOp* solution,
-                                        const LinOp* residual_norm) const
-{
-    on_iteration_complete(solver, nullptr, solution, num_iterations, residual,
-                          residual_norm, nullptr, nullptr, false);
-}
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(
+        const LinOp* solver, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* solution,
+        const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm) const override
+    {
+        on_iteration_complete(solver, nullptr, solution, num_iterations,
+                              residual, residual_norm,
+                              implicit_sq_residual_norm, nullptr, false);
+    }
+
+private:
+    void print_scalar(const LinOp* value, std::ostream& stream) const
+    {
+        if (separator_) {
+            stream << separator_;
+        }
+        stream << std::setw(column_width_);
+        if (!value->get_size()) {
+            stream << "<empty>";
+        } else if (value->get_size()[0] != 1) {
+            stream << "<vector>";
+        } else {
+            if (!dispatch_type(
+                    value, [&](auto vector) { stream << vector->at(0, 0); })) {
+                stream << "<unknown>";
+            }
+        }
+    }
 
+    SolverDebugPrint(std::ostream& output, int precision, int column_width,
+                     char separator)
+        : output_{&output},
+          precision_{precision},
+          column_width_{column_width},
+          separator_{separator},
+          printed_header_(false)
+    {}
 
-void SolverDebug::on_iteration_complete(
-    const LinOp* solver, const size_type& num_iterations, const LinOp* residual,
-    const LinOp* solution, const LinOp* residual_norm,
-    const LinOp* implicit_sq_residual_norm) const
+    std::ostream* output_;
+    int precision_;
+    int column_width_;
+    char separator_;
+    mutable bool printed_header_;
+};
+
+
+class SolverDebugStore : public SolverDebug {
+    friend class SolverDebug;
+
+public:
+    /* Internal solver events */
+    void on_linop_apply_started(const LinOp* solver, const LinOp* in,
+                                const LinOp* out) const override
+    {
+        using solver_base = solver::detail::SolverBaseLinOp;
+        auto dynamic_type = name_demangling::get_dynamic_type(*solver);
+        auto base = gko::as<solver_base>(solver);
+        store_vector(base->get_system_matrix().get(), "system_matrix");
+        store_vector(in, "rhs");
+        store_vector(out, "initial_guess");
+    }
+
+    void on_iteration_complete(
+        const LinOp* solver, const LinOp* right_hand_side,
+        const LinOp* solution, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm,
+        const array<stopping_status>* status, bool stopped) const override
+    {
+        using solver_base = solver::detail::SolverBaseLinOp;
+        auto base = gko::as<solver_base>(solver);
+        const auto num_vectors = base->get_num_workspace_ops();
+        const auto names = base->get_workspace_op_names();
+        for (int i = 0; i < num_vectors; i++) {
+            store_vector(base->get_workspace_op(i), num_iterations,
+                         base->get_workspace_op_names()[i]);
+        }
+        store_vector(solution, num_iterations, "solution");
+        store_vector(residual, num_iterations, "residual");
+        store_vector(residual_norm, num_iterations, "residual_norm");
+        store_vector(implicit_sq_residual_norm, num_iterations,
+                     "implicit_sq_residual_norm");
+    }
+
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override
+    {
+        on_iteration_complete(solver, nullptr, solution, num_iterations,
+                              residual, residual_norm, nullptr, nullptr, false);
+    }
+
+    GKO_DEPRECATED(
+        "Please use the version with the additional stopping "
+        "information.")
+    void on_iteration_complete(
+        const LinOp* solver, const size_type& num_iterations,
+        const LinOp* residual, const LinOp* solution,
+        const LinOp* residual_norm,
+        const LinOp* implicit_sq_residual_norm) const override
+    {
+        on_iteration_complete(solver, nullptr, solution, num_iterations,
+                              residual, residual_norm,
+                              implicit_sq_residual_norm, nullptr, false);
+    }
+
+private:
+    void store_vector(const LinOp* value, const std::string& name) const
+    {
+        const auto filename =
+            output_file_prefix_ + "_" + name + (binary_ ? ".bin" : ".mtx");
+        if (!value) {
+            return;
+        }
+        // putting Dense first here causes gko::write to use dense output
+        run<gko::matrix::Dense<double>, gko::matrix::Dense<float>,
+            gko::matrix::Dense<std::complex<double>>,
+            gko::matrix::Dense<std::complex<float>>,
+            // fallback for other matrix types
+            gko::WritableToMatrixData<double, int32>,
+            gko::WritableToMatrixData<float, int32>,
+            gko::WritableToMatrixData<std::complex<double>, int32>,
+            gko::WritableToMatrixData<std::complex<float>, int32>,
+            gko::WritableToMatrixData<double, int64>,
+            gko::WritableToMatrixData<float, int64>,
+            gko::WritableToMatrixData<std::complex<double>, int64>,
+            gko::WritableToMatrixData<std::complex<float>, int64>>(
+            value, [&](auto vector) {
+                std::ofstream output{
+                    filename, binary_ ? (std::ios::out | std::ios::binary)
+                                      : std::ios::out};
+                if (binary_) {
+                    gko::write_binary(output, vector);
+                } else {
+                    gko::write(output, vector);
+                }
+            });
+    }
+
+    void store_vector(const LinOp* value, size_type iteration,
+                      const std::string& name) const
+    {
+        store_vector(value, std::to_string(iteration) + "_" + name);
+    }
+
+    SolverDebugStore(std::string output_file_prefix, bool binary)
+        : output_file_prefix_{std::move(output_file_prefix)}, binary_{binary}
+    {}
+
+    std::string output_file_prefix_;
+    bool binary_;
+};
+
+
+}  // namespace
+
+
+std::shared_ptr<SolverDebug> SolverDebug::create_scalar_table(
+    std::ostream& output, int precision, int column_width)
 {
-    on_iteration_complete(solver, nullptr, solution, num_iterations, residual,
-                          residual_norm, implicit_sq_residual_norm, nullptr,
-                          false);
+    return std::shared_ptr<SolverDebug>{
+        new SolverDebugPrint{output, precision, column_width, '\0'}};
 }
 
 
-SolverDebug::SolverDebug(std::ostream& stream, int precision, int column_width)
-    : output_{&stream}, precision_{precision}, column_width_{column_width}
-{}
+std::shared_ptr<SolverDebug> SolverDebug::create_scalar_csv(
+    std::ostream& output, int precision, char separator)
+{
+    return std::shared_ptr<SolverDebug>{
+        new SolverDebugPrint{output, precision, 0, separator}};
+}
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create(std::ostream& output,
-                                                 int precision,
-                                                 int column_width)
+std::shared_ptr<SolverDebug> SolverDebug::create_vector_storage(
+    std::string output_file_prefix, bool binary)
 {
     return std::shared_ptr<SolverDebug>{
-        new SolverDebug{output, precision, column_width}};
+        new SolverDebugStore{output_file_prefix, binary}};
 }
 
 
diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp
index 2b0ec771590..ec1f76c6fa7 100644
--- a/core/test/log/solver_debug.cpp
+++ b/core/test/log/solver_debug.cpp
@@ -14,6 +14,7 @@
 
 
 #include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
 
 
 template <typename T>
@@ -26,7 +27,8 @@ class SolverDebug : public ::testing::Test {
     {
         mtx = gko::initialize<Dense>({T{1.0}}, ref);
         in = gko::initialize<Dense>({T{2.0}}, ref);
-        out = mtx->clone();
+        out = gko::initialize<Dense>({T{4.0}}, ref);
+        zero = gko::initialize<Dense>({T{0.0}}, ref);
         solver =
             Cg::build()
                 .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
@@ -34,17 +36,39 @@ class SolverDebug : public ::testing::Test {
                 ->generate(mtx);
     }
 
+    template <typename Mtx>
+    void assert_file_equals(const std::string& filename, Mtx* ref_mtx)
+    {
+        auto cleanup = [filename] {
+            std::remove((filename + ".mtx").c_str());
+            std::remove((filename + ".bin").c_str());
+        };
+        if (!ref_mtx) {
+            cleanup();
+            return;
+        }
+        SCOPED_TRACE(filename);
+        std::ifstream stream_mtx{filename + ".mtx"};
+        std::ifstream stream_bin{filename + ".bin", std::ios::binary};
+        auto mtx = gko::read<Dense>(stream_mtx, ref);
+        auto mtx_bin = gko::read_binary<Dense>(stream_bin, ref);
+        cleanup();
+        GKO_ASSERT_MTX_NEAR(mtx, ref_mtx, 0.0);
+        GKO_ASSERT_MTX_NEAR(mtx_bin, ref_mtx, 0.0);
+    }
+
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<Dense> mtx;
     std::shared_ptr<Dense> in;
     std::unique_ptr<Dense> out;
+    std::unique_ptr<Dense> zero;
     std::unique_ptr<Cg> solver;
 };
 
 TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator);
 
 
-TYPED_TEST(SolverDebug, Works)
+TYPED_TEST(SolverDebug, TableWorks)
 {
     using T = TypeParam;
     std::stringstream ref_ss;
@@ -57,21 +81,97 @@ TYPED_TEST(SolverDebug, Works)
            << std::setw(default_column_width) << "alpha"
            << std::setw(default_column_width) << "beta"
            << std::setw(default_column_width) << "prev_rho"
-           << std::setw(default_column_width) << "rho" << '\n';
+           << std::setw(default_column_width) << "rho"
+           << std::setw(default_column_width) << "implicit_sq_residual_norm"
+           << '\n';
     ref_ss << std::setw(default_column_width) << 0
            << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{1.0}
-           << std::setw(default_column_width) << T{1.0} << '\n'
+           << std::setw(default_column_width) << T{4.0}
+           << std::setw(default_column_width) << T{4.0} << '\n'
            << std::setw(default_column_width) << 1
            << std::setw(default_column_width) << T{0.0}
-           << std::setw(default_column_width) << T{1.0}
+           << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{0.0}
-           << std::setw(default_column_width) << T{1.0} << '\n';
+           << std::setw(default_column_width) << T{4.0}
+           << std::setw(default_column_width) << T{0.0} << '\n';
+    std::stringstream ss;
+    this->solver->add_logger(gko::log::SolverDebug::create_scalar_table(ss));
+
+    this->solver->apply(this->in, this->out);
+
+    ASSERT_EQ(ss.str(), ref_ss.str());
+}
+
+
+TYPED_TEST(SolverDebug, CsvWorks)
+{
+    using T = TypeParam;
+    std::stringstream ref_ss;
+    auto dynamic_type = gko::name_demangling::get_dynamic_type(*this->solver);
+    ref_ss << dynamic_type << "::apply(" << this->in.get() << ','
+           << this->out.get() << ") of dimensions " << this->solver->get_size()
+           << " and " << this->in->get_size()[1] << " rhs\n";
+    ref_ss << "Iteration,alpha,beta,prev_rho,rho,implicit_sq_residual_norm"
+           << '\n';
+    ref_ss << 0 << ',' << T{0.0} << ',' << T{0.0} << ',' << T{1.0} << ','
+           << T{4.0} << ',' << T{4.0} << '\n'
+           << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ','
+           << T{4.0} << ',' << T{0.0} << '\n';
     std::stringstream ss;
-    this->solver->add_logger(gko::log::SolverDebug::create(ss));
+    this->solver->add_logger(gko::log::SolverDebug::create_scalar_csv(ss));
 
     this->solver->apply(this->in, this->out);
 
     ASSERT_EQ(ss.str(), ref_ss.str());
 }
+
+
+TYPED_TEST(SolverDebug, StorageWorks)
+{
+    using T = TypeParam;
+    using Dense = typename TestFixture::Dense;
+    auto orig_out = this->out->clone();
+    auto init_residual = gko::initialize<Dense>({T{-2.0}}, this->ref);
+    std::vector<std::pair<std::string, Dense*>> files{
+        {"solver_debug_test_0_alpha", this->zero.get()},
+        {"solver_debug_test_0_beta", nullptr},
+        {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()},
+        {"solver_debug_test_0_minus_one", nullptr},
+        {"solver_debug_test_0_one", nullptr},
+        {"solver_debug_test_0_p", nullptr},
+        {"solver_debug_test_0_prev_rho", nullptr},
+        {"solver_debug_test_0_q", nullptr},
+        {"solver_debug_test_0_r", nullptr},
+        {"solver_debug_test_0_residual", init_residual.get()},
+        {"solver_debug_test_0_rho", nullptr},
+        {"solver_debug_test_0_solution", orig_out.get()},
+        {"solver_debug_test_0_z", nullptr},
+        {"solver_debug_test_1_alpha", nullptr},
+        {"solver_debug_test_1_beta", nullptr},
+        {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()},
+        {"solver_debug_test_1_minus_one", nullptr},
+        {"solver_debug_test_1_one", nullptr},
+        {"solver_debug_test_1_p", nullptr},
+        {"solver_debug_test_1_prev_rho", nullptr},
+        {"solver_debug_test_1_q", nullptr},
+        {"solver_debug_test_1_r", nullptr},
+        {"solver_debug_test_1_residual", this->zero.get()},
+        {"solver_debug_test_1_rho", nullptr},
+        {"solver_debug_test_1_solution", this->in.get()},
+        {"solver_debug_test_1_z", nullptr},
+        {"solver_debug_test_initial_guess", orig_out.get()},
+        {"solver_debug_test_rhs", this->in.get()},
+        {"solver_debug_test_system_matrix", this->mtx.get()}};
+    this->solver->add_logger(gko::log::SolverDebug::create_vector_storage(
+        "solver_debug_test", false));
+    this->solver->add_logger(gko::log::SolverDebug::create_vector_storage(
+        "solver_debug_test", true));
+
+    this->solver->apply(this->in, this->out);
+
+    for (auto pair : files) {
+        this->assert_file_equals(pair.first, pair.second);
+    }
+}
diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp
index 9b589f29b88..85e38338da2 100644
--- a/include/ginkgo/core/log/solver_debug.hpp
+++ b/include/ginkgo/core/log/solver_debug.hpp
@@ -10,7 +10,6 @@
 #include <memory>
 
 
-#include <ginkgo/config.hpp>
 #include <ginkgo/core/log/logger.hpp>
 
 
@@ -25,52 +24,45 @@ namespace log {
  */
 class SolverDebug : public Logger {
 public:
-    /* Internal solver events */
-    void on_linop_apply_started(const LinOp* A, const LinOp* b,
-                                const LinOp* x) const override;
-
-    void on_iteration_complete(
-        const LinOp* solver, const LinOp* right_hand_side,
-        const LinOp* solution, const size_type& num_iterations,
-        const LinOp* residual, const LinOp* residual_norm,
-        const LinOp* implicit_sq_residual_norm,
-        const array<stopping_status>* status, bool stopped) const override;
-
-    GKO_DEPRECATED(
-        "Please use the version with the additional stopping "
-        "information.")
-    void on_iteration_complete(const LinOp* solver,
-                               const size_type& num_iterations,
-                               const LinOp* residual, const LinOp* solution,
-                               const LinOp* residual_norm) const override;
-
-    GKO_DEPRECATED(
-        "Please use the version with the additional stopping "
-        "information.")
-    void on_iteration_complete(
-        const LinOp* solver, const size_type& num_iterations,
-        const LinOp* residual, const LinOp* solution,
-        const LinOp* residual_norm,
-        const LinOp* implicit_sq_residual_norm) const override;
+    /**
+     * Creates a logger printing the value for all scalar values in the solver
+     * after each iteration in an ASCII table.
+     *
+     * @param output  the stream to write the output to.
+     * @param precision  the number of digits of precision to print
+     * @param column_width  the number of characters an output column is wide
+     */
+    static std::shared_ptr<SolverDebug> create_scalar_table(
+        std::ostream& output, int precision = 6, int column_width = 12);
+
 
     /**
      * Creates a logger printing the value for all scalar values in the solver
-     * after each iteration.
+     * after each iteration in a CSV table.
      *
      * @param output  the stream to write the output to.
      * @param precision  the number of digits of precision to print
      * @param column_width  the number of characters an output column is wide
      */
-    static std::shared_ptr<SolverDebug> create(std::ostream& output,
-                                               int precision = 6,
-                                               int column_width = 12);
+    static std::shared_ptr<SolverDebug> create_scalar_csv(std::ostream& output,
+                                                          int precision = 6,
+                                                          char separator = ',');
 
-private:
-    SolverDebug(std::ostream& output, int precision, int column_width);
 
-    std::ostream* output_;
-    int precision_;
-    int column_width_;
+    /**
+     * Creates a logger storing all vectors and scalar values in the solver
+     * after each iteration on disk.
+     *
+     * @param output  the path and file name prefix used to generate the output
+     *                file names.
+     * @param precision  the number of digits of precision to print when
+     *                   outputting matrices in text format
+     * @param binary  if true, write data in Ginkgo's own binary format
+     *                (lossless), if false write data in the MatrixMarket format
+     *                (potentially lossy)
+     */
+    static std::shared_ptr<SolverDebug> create_vector_storage(
+        std::string output_file_prefix = "solver_", bool binary = false);
 };
 
 

From 56d5ef6bb253e201b64baef9ff6daae406d3e7bb Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 3 Jul 2024 19:11:00 +0200
Subject: [PATCH 058/448] improve naming and documentation

---
 core/log/solver_debug.cpp                | 34 +++++++++++++++++-------
 core/test/log/solver_debug.cpp           | 22 ++++++++-------
 include/ginkgo/core/log/solver_debug.hpp | 22 ++++++++-------
 3 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
index 69b6a3ad692..e1acb233d03 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_debug.cpp
@@ -2,24 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/solver_debug.hpp>
-
-
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/log/logger.hpp>
+#include <ginkgo/core/log/solver_debug.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
-
 #include "core/base/dispatch_helper.hpp"
 
 
@@ -28,6 +24,15 @@ namespace log {
 namespace {
 
 
+bool is_dense(const LinOp* value)
+{
+    using conv_to_double = ConvertibleTo<matrix::Dense<double>>;
+    using conv_to_complex = ConvertibleTo<matrix::Dense<std::complex<double>>>;
+    return dynamic_cast<const conv_to_double*>(value) ||
+           dynamic_cast<const conv_to_complex*>(value);
+}
+
+
 template <typename Functor>
 static bool dispatch_type(const LinOp* value, Functor fn)
 {
@@ -154,8 +159,19 @@ class SolverDebugPrint : public SolverDebug {
         } else if (value->get_size()[0] != 1) {
             stream << "<vector>";
         } else {
-            if (!dispatch_type(
-                    value, [&](auto vector) { stream << vector->at(0, 0); })) {
+            if (is_dense(value)) {
+                auto host_exec = value->get_executor()->get_master();
+                run<ConvertibleTo<matrix::Dense<double>>,
+                    ConvertibleTo<matrix::Dense<std::complex<double>>>>(
+                    value, [&](auto vector) {
+                        using vector_type = typename detail::pointee<
+                            decltype(vector)>::result_type;
+                        auto host_vec = vector_type::create(host_exec);
+                        host_vec->copy_from(value);
+                        stream << host_vec->at(0, 0);
+                    });
+
+            } else {
                 stream << "<unknown>";
             }
         }
@@ -293,7 +309,7 @@ class SolverDebugStore : public SolverDebug {
 }  // namespace
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create_scalar_table(
+std::shared_ptr<SolverDebug> SolverDebug::create_scalar_table_writer(
     std::ostream& output, int precision, int column_width)
 {
     return std::shared_ptr<SolverDebug>{
@@ -301,7 +317,7 @@ std::shared_ptr<SolverDebug> SolverDebug::create_scalar_table(
 }
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create_scalar_csv(
+std::shared_ptr<SolverDebug> SolverDebug::create_scalar_csv_writer(
     std::ostream& output, int precision, char separator)
 {
     return std::shared_ptr<SolverDebug>{
diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp
index ec1f76c6fa7..216e14289da 100644
--- a/core/test/log/solver_debug.cpp
+++ b/core/test/log/solver_debug.cpp
@@ -2,17 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <ginkgo/core/log/solver_debug.hpp>
-
-
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/solver_debug.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
-
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 
@@ -39,17 +35,21 @@ class SolverDebug : public ::testing::Test {
     template <typename Mtx>
     void assert_file_equals(const std::string& filename, Mtx* ref_mtx)
     {
+        SCOPED_TRACE(filename);
         auto cleanup = [filename] {
             std::remove((filename + ".mtx").c_str());
             std::remove((filename + ".bin").c_str());
         };
+        std::ifstream stream_mtx{filename + ".mtx"};
+        std::ifstream stream_bin{filename + ".bin", std::ios::binary};
+        // check that the files exist
+        ASSERT_TRUE(stream_mtx.good());
+        ASSERT_TRUE(stream_bin.good());
         if (!ref_mtx) {
             cleanup();
             return;
         }
-        SCOPED_TRACE(filename);
-        std::ifstream stream_mtx{filename + ".mtx"};
-        std::ifstream stream_bin{filename + ".bin", std::ios::binary};
+        // check that the files have the correct contents
         auto mtx = gko::read<Dense>(stream_mtx, ref);
         auto mtx_bin = gko::read_binary<Dense>(stream_bin, ref);
         cleanup();
@@ -97,7 +97,8 @@ TYPED_TEST(SolverDebug, TableWorks)
            << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{0.0} << '\n';
     std::stringstream ss;
-    this->solver->add_logger(gko::log::SolverDebug::create_scalar_table(ss));
+    this->solver->add_logger(
+        gko::log::SolverDebug::create_scalar_table_writer(ss));
 
     this->solver->apply(this->in, this->out);
 
@@ -120,7 +121,8 @@ TYPED_TEST(SolverDebug, CsvWorks)
            << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ','
            << T{4.0} << ',' << T{0.0} << '\n';
     std::stringstream ss;
-    this->solver->add_logger(gko::log::SolverDebug::create_scalar_csv(ss));
+    this->solver->add_logger(
+        gko::log::SolverDebug::create_scalar_csv_writer(ss));
 
     this->solver->apply(this->in, this->out);
 
diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp
index 85e38338da2..98db712cc44 100644
--- a/include/ginkgo/core/log/solver_debug.hpp
+++ b/include/ginkgo/core/log/solver_debug.hpp
@@ -9,7 +9,6 @@
 #include <iosfwd>
 #include <memory>
 
-
 #include <ginkgo/core/log/logger.hpp>
 
 
@@ -18,40 +17,45 @@ namespace log {
 
 
 /**
- * This Logger prints the value of all scalar values stored internally by the
- * solver after each iteration. If the solver is applied to multiple right-hand
- * sides, only the first right-hand side gets printed.
+ * This Logger outputs the value of all scalar values (and potentially vectors)
+ * stored internally by the solver after each iteration. It needs to be attached
+ * to the solver being debugged.
  */
 class SolverDebug : public Logger {
 public:
     /**
      * Creates a logger printing the value for all scalar values in the solver
      * after each iteration in an ASCII table.
+     * If the solver is applied to multiple right-hand sides, only the first
+     * right-hand side gets printed.
      *
      * @param output  the stream to write the output to.
      * @param precision  the number of digits of precision to print
      * @param column_width  the number of characters an output column is wide
      */
-    static std::shared_ptr<SolverDebug> create_scalar_table(
+    static std::shared_ptr<SolverDebug> create_scalar_table_writer(
         std::ostream& output, int precision = 6, int column_width = 12);
 
 
     /**
      * Creates a logger printing the value for all scalar values in the solver
      * after each iteration in a CSV table.
+     * If the solver is applied to multiple right-hand sides, only the first
+     * right-hand side gets printed.
      *
      * @param output  the stream to write the output to.
      * @param precision  the number of digits of precision to print
-     * @param column_width  the number of characters an output column is wide
+     * @param separator  the character separating columns from each other
      */
-    static std::shared_ptr<SolverDebug> create_scalar_csv(std::ostream& output,
-                                                          int precision = 6,
-                                                          char separator = ',');
+    static std::shared_ptr<SolverDebug> create_scalar_csv_writer(
+        std::ostream& output, int precision = 6, char separator = ',');
 
 
     /**
      * Creates a logger storing all vectors and scalar values in the solver
      * after each iteration on disk.
+     * This logger can handle multiple right-hand sides, in contrast to
+     * create_scalar_table_writer or create_scalar_csv_writer.
      *
      * @param output  the path and file name prefix used to generate the output
      *                file names.

From 4a796981a22bebe758abc532b7b8403839591437 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 13:17:37 +0200
Subject: [PATCH 059/448] clean uninitialized values from comparison

---
 core/test/log/solver_debug.cpp | 35 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp
index 216e14289da..90108116374 100644
--- a/core/test/log/solver_debug.cpp
+++ b/core/test/log/solver_debug.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <regex>
+
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/executor.hpp>
@@ -78,20 +80,17 @@ TYPED_TEST(SolverDebug, TableWorks)
            << this->out.get() << ") of dimensions " << this->solver->get_size()
            << " and " << this->in->get_size()[1] << " rhs\n";
     ref_ss << std::setw(default_column_width) << "Iteration"
-           << std::setw(default_column_width) << "alpha"
            << std::setw(default_column_width) << "beta"
            << std::setw(default_column_width) << "prev_rho"
            << std::setw(default_column_width) << "rho"
            << std::setw(default_column_width) << "implicit_sq_residual_norm"
            << '\n';
     ref_ss << std::setw(default_column_width) << 0
-           << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{1.0}
            << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{4.0} << '\n'
            << std::setw(default_column_width) << 1
-           << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{4.0}
@@ -102,7 +101,12 @@ TYPED_TEST(SolverDebug, TableWorks)
 
     this->solver->apply(this->in, this->out);
 
-    ASSERT_EQ(ss.str(), ref_ss.str());
+    // the first value of beta is uninitialized, so we need to remove it
+    std::regex first_beta("\n           0 *[()0-9.e,+-]*");
+    auto clean_str = std::regex_replace(ss.str(), first_beta, "\n           0");
+    auto clean_ref =
+        std::regex_replace(ref_ss.str(), first_beta, "\n           0");
+    ASSERT_EQ(clean_str, clean_ref);
 }
 
 
@@ -114,19 +118,22 @@ TYPED_TEST(SolverDebug, CsvWorks)
     ref_ss << dynamic_type << "::apply(" << this->in.get() << ','
            << this->out.get() << ") of dimensions " << this->solver->get_size()
            << " and " << this->in->get_size()[1] << " rhs\n";
-    ref_ss << "Iteration,alpha,beta,prev_rho,rho,implicit_sq_residual_norm"
-           << '\n';
-    ref_ss << 0 << ',' << T{0.0} << ',' << T{0.0} << ',' << T{1.0} << ','
-           << T{4.0} << ',' << T{4.0} << '\n'
-           << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ','
-           << T{4.0} << ',' << T{0.0} << '\n';
+    ref_ss << "Iteration;beta;prev_rho;rho;implicit_sq_residual_norm" << '\n';
+    ref_ss << 0 << ';' << T{0.0} << ';' << T{1.0} << ';' << T{4.0} << ';'
+           << T{4.0} << '\n'
+           << 1 << ';' << T{4.0} << ';' << T{0.0} << ';' << T{4.0} << ';'
+           << T{0.0} << '\n';
     std::stringstream ss;
     this->solver->add_logger(
-        gko::log::SolverDebug::create_scalar_csv_writer(ss));
+        gko::log::SolverDebug::create_scalar_csv_writer(ss, 6, ';'));
 
     this->solver->apply(this->in, this->out);
 
-    ASSERT_EQ(ss.str(), ref_ss.str());
+    // the first value of beta is uninitialized, so we need to remove it
+    std::regex first_beta("\n0;[^;]*");
+    auto clean_str = std::regex_replace(ss.str(), first_beta, "\n0;");
+    auto clean_ref = std::regex_replace(ref_ss.str(), first_beta, "\n0;");
+    ASSERT_EQ(clean_str, clean_ref);
 }
 
 
@@ -137,7 +144,6 @@ TYPED_TEST(SolverDebug, StorageWorks)
     auto orig_out = this->out->clone();
     auto init_residual = gko::initialize<Dense>({T{-2.0}}, this->ref);
     std::vector<std::pair<std::string, Dense*>> files{
-        {"solver_debug_test_0_alpha", this->zero.get()},
         {"solver_debug_test_0_beta", nullptr},
         {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()},
         {"solver_debug_test_0_minus_one", nullptr},
@@ -150,8 +156,7 @@ TYPED_TEST(SolverDebug, StorageWorks)
         {"solver_debug_test_0_rho", nullptr},
         {"solver_debug_test_0_solution", orig_out.get()},
         {"solver_debug_test_0_z", nullptr},
-        {"solver_debug_test_1_alpha", nullptr},
-        {"solver_debug_test_1_beta", nullptr},
+        {"solver_debug_test_1_beta", orig_out.get()},
         {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()},
         {"solver_debug_test_1_minus_one", nullptr},
         {"solver_debug_test_1_one", nullptr},

From 369bfe63363240a476347529becfc3f5aa01d9bd Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 13:19:41 +0200
Subject: [PATCH 060/448] remove unused scalars

---
 core/solver/bicg.cpp                |  9 ++++-----
 core/solver/cg.cpp                  |  8 +++-----
 core/solver/fcg.cpp                 |  9 ++++-----
 include/ginkgo/core/solver/bicg.hpp | 12 +++++-------
 include/ginkgo/core/solver/cg.hpp   | 12 +++++-------
 include/ginkgo/core/solver/fcg.hpp  | 14 ++++++--------
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
index c379cb8df08..0b39b3664cc 100644
--- a/core/solver/bicg.cpp
+++ b/core/solver/bicg.cpp
@@ -124,7 +124,6 @@ void Bicg<ValueType>::apply_dense_impl(const matrix::Dense<ValueType>* dense_b,
     GKO_SOLVER_VECTOR(p2, dense_b);
     GKO_SOLVER_VECTOR(q2, dense_b);
 
-    GKO_SOLVER_SCALAR(alpha, dense_b);
     GKO_SOLVER_SCALAR(beta, dense_b);
     GKO_SOLVER_SCALAR(prev_rho, dense_b);
     GKO_SOLVER_SCALAR(rho, dense_b);
@@ -255,7 +254,7 @@ int workspace_traits<Bicg<ValueType>>::num_arrays(const Solver&)
 template <typename ValueType>
 int workspace_traits<Bicg<ValueType>>::num_vectors(const Solver&)
 {
-    return 14;
+    return 13;
 }
 
 
@@ -264,8 +263,8 @@ std::vector<std::string> workspace_traits<Bicg<ValueType>>::op_names(
     const Solver&)
 {
     return {
-        "r",  "z",     "p",    "q",        "r2",  "z2",  "p2",
-        "q2", "alpha", "beta", "prev_rho", "rho", "one", "minus_one",
+        "r",  "z",    "p",        "q",   "r2",  "z2",        "p2",
+        "q2", "beta", "prev_rho", "rho", "one", "minus_one",
     };
 }
 
@@ -281,7 +280,7 @@ std::vector<std::string> workspace_traits<Bicg<ValueType>>::array_names(
 template <typename ValueType>
 std::vector<int> workspace_traits<Bicg<ValueType>>::scalars(const Solver&)
 {
-    return {alpha, beta, prev_rho, rho};
+    return {beta, prev_rho, rho};
 }
 
 
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index 20487b4cd0d..c512dc4313b 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -102,7 +102,6 @@ void Cg<ValueType>::apply_dense_impl(const VectorType* dense_b,
     GKO_SOLVER_VECTOR(p, dense_b);
     GKO_SOLVER_VECTOR(q, dense_b);
 
-    GKO_SOLVER_SCALAR(alpha, dense_b);
     GKO_SOLVER_SCALAR(beta, dense_b);
     GKO_SOLVER_SCALAR(prev_rho, dense_b);
     GKO_SOLVER_SCALAR(rho, dense_b);
@@ -206,7 +205,7 @@ int workspace_traits<Cg<ValueType>>::num_arrays(const Solver&)
 template <typename ValueType>
 int workspace_traits<Cg<ValueType>>::num_vectors(const Solver&)
 {
-    return 10;
+    return 9;
 }
 
 
@@ -215,8 +214,7 @@ std::vector<std::string> workspace_traits<Cg<ValueType>>::op_names(
     const Solver&)
 {
     return {
-        "r",    "z",        "p",   "q",   "alpha",
-        "beta", "prev_rho", "rho", "one", "minus_one",
+        "r", "z", "p", "q", "beta", "prev_rho", "rho", "one", "minus_one",
     };
 }
 
@@ -232,7 +230,7 @@ std::vector<std::string> workspace_traits<Cg<ValueType>>::array_names(
 template <typename ValueType>
 std::vector<int> workspace_traits<Cg<ValueType>>::scalars(const Solver&)
 {
-    return {alpha, beta, prev_rho, rho};
+    return {beta, prev_rho, rho};
 }
 
 
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index c4f79854c0a..6c65f63ccae 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -102,7 +102,6 @@ void Fcg<ValueType>::apply_dense_impl(const VectorType* dense_b,
     GKO_SOLVER_VECTOR(q, dense_b);
     GKO_SOLVER_VECTOR(t, dense_b);
 
-    GKO_SOLVER_SCALAR(alpha, dense_b);
     GKO_SOLVER_SCALAR(beta, dense_b);
     GKO_SOLVER_SCALAR(prev_rho, dense_b);
     GKO_SOLVER_SCALAR(rho, dense_b);
@@ -209,7 +208,7 @@ int workspace_traits<Fcg<ValueType>>::num_arrays(const Solver&)
 template <typename ValueType>
 int workspace_traits<Fcg<ValueType>>::num_vectors(const Solver&)
 {
-    return 12;
+    return 11;
 }
 
 
@@ -218,8 +217,8 @@ std::vector<std::string> workspace_traits<Fcg<ValueType>>::op_names(
     const Solver&)
 {
     return {
-        "r",    "z",        "p",   "q",     "t",   "alpha",
-        "beta", "prev_rho", "rho", "rho_t", "one", "minus_one",
+        "r",        "z",   "p",     "q",   "t",         "beta",
+        "prev_rho", "rho", "rho_t", "one", "minus_one",
     };
 }
 
@@ -235,7 +234,7 @@ std::vector<std::string> workspace_traits<Fcg<ValueType>>::array_names(
 template <typename ValueType>
 std::vector<int> workspace_traits<Fcg<ValueType>>::scalars(const Solver&)
 {
-    return {alpha, beta, prev_rho, rho, rho_t};
+    return {beta, prev_rho, rho, rho_t};
 }
 
 
diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp
index 9f1ef54cc34..2a43c1ca3f8 100644
--- a/include/ginkgo/core/solver/bicg.hpp
+++ b/include/ginkgo/core/solver/bicg.hpp
@@ -155,18 +155,16 @@ struct workspace_traits<Bicg<ValueType>> {
     constexpr static int p2 = 6;
     // "transposed" q vector
     constexpr static int q2 = 7;
-    // alpha scalar
-    constexpr static int alpha = 8;
     // beta scalar
-    constexpr static int beta = 9;
+    constexpr static int beta = 8;
     // previous rho scalar
-    constexpr static int prev_rho = 10;
+    constexpr static int prev_rho = 9;
     // current rho scalar
-    constexpr static int rho = 11;
+    constexpr static int rho = 10;
     // constant 1.0 scalar
-    constexpr static int one = 12;
+    constexpr static int one = 11;
     // constant -1.0 scalar
-    constexpr static int minus_one = 13;
+    constexpr static int minus_one = 12;
 
     // stopping status array
     constexpr static int stop = 0;
diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp
index 9d850ecbe6d..984d5d1f104 100644
--- a/include/ginkgo/core/solver/cg.hpp
+++ b/include/ginkgo/core/solver/cg.hpp
@@ -141,18 +141,16 @@ struct workspace_traits<Cg<ValueType>> {
     constexpr static int p = 2;
     // q vector
     constexpr static int q = 3;
-    // alpha scalar
-    constexpr static int alpha = 4;
     // beta scalar
-    constexpr static int beta = 5;
+    constexpr static int beta = 4;
     // previous rho scalar
-    constexpr static int prev_rho = 6;
+    constexpr static int prev_rho = 5;
     // current rho scalar
-    constexpr static int rho = 7;
+    constexpr static int rho = 6;
     // constant 1.0 scalar
-    constexpr static int one = 8;
+    constexpr static int one = 7;
     // constant -1.0 scalar
-    constexpr static int minus_one = 9;
+    constexpr static int minus_one = 8;
 
     // stopping status array
     constexpr static int stop = 0;
diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp
index 4577dd1b1d4..dfaf252b557 100644
--- a/include/ginkgo/core/solver/fcg.hpp
+++ b/include/ginkgo/core/solver/fcg.hpp
@@ -148,20 +148,18 @@ struct workspace_traits<Fcg<ValueType>> {
     constexpr static int q = 3;
     // t vector
     constexpr static int t = 4;
-    // alpha scalar
-    constexpr static int alpha = 5;
     // beta scalar
-    constexpr static int beta = 6;
+    constexpr static int beta = 5;
     // previous rho scalar
-    constexpr static int prev_rho = 7;
+    constexpr static int prev_rho = 6;
     // current rho scalar
-    constexpr static int rho = 8;
+    constexpr static int rho = 7;
     // current rho_t scalar
-    constexpr static int rho_t = 9;
+    constexpr static int rho_t = 8;
     // constant 1.0 scalar
-    constexpr static int one = 10;
+    constexpr static int one = 9;
     // constant -1.0 scalar
-    constexpr static int minus_one = 11;
+    constexpr static int minus_one = 10;
 
     // stopping status array
     constexpr static int stop = 0;

From 889756225cd3b0e1a6b22bd432b510747d2ddf91 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 13:36:54 +0200
Subject: [PATCH 061/448] clean up and simplify code

---
 core/log/solver_debug.cpp | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
index e1acb233d03..66febeb9ea4 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_debug.cpp
@@ -33,28 +33,6 @@ bool is_dense(const LinOp* value)
 }
 
 
-template <typename Functor>
-static bool dispatch_type(const LinOp* value, Functor fn)
-{
-    const auto host_exec = value->get_executor()->get_master();
-    using conv_to_double = ConvertibleTo<matrix::Dense<double>>;
-    using conv_to_complex = ConvertibleTo<matrix::Dense<std::complex<double>>>;
-    if (dynamic_cast<const conv_to_double*>(value)) {
-        auto host_vec = matrix::Dense<double>::create(host_exec);
-        host_vec->copy_from(value);
-        fn(host_vec.get());
-        return true;
-    } else if (dynamic_cast<const conv_to_complex*>(value)) {
-        auto host_vec = matrix::Dense<std::complex<double>>::create(host_exec);
-        host_vec->copy_from(value);
-        fn(host_vec.get());
-        return true;
-    } else {
-        return false;
-    }
-}
-
-
 class SolverDebugPrint : public SolverDebug {
     friend class SolverDebug;
 
@@ -167,7 +145,7 @@ class SolverDebugPrint : public SolverDebug {
                         using vector_type = typename detail::pointee<
                             decltype(vector)>::result_type;
                         auto host_vec = vector_type::create(host_exec);
-                        host_vec->copy_from(value);
+                        vector->convert_to(host_vec);
                         stream << host_vec->at(0, 0);
                     });
 

From 9826e1b3a69598d76fe57afb78ab3c52550ad72b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 14:14:12 +0200
Subject: [PATCH 062/448] simplify condition

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/log/solver_debug.cpp | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp
index 66febeb9ea4..b93a7987326 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_debug.cpp
@@ -136,22 +136,20 @@ class SolverDebugPrint : public SolverDebug {
             stream << "<empty>";
         } else if (value->get_size()[0] != 1) {
             stream << "<vector>";
+        } else if (is_dense(value)) {
+            auto host_exec = value->get_executor()->get_master();
+            run<ConvertibleTo<matrix::Dense<double>>,
+                ConvertibleTo<matrix::Dense<std::complex<double>>>>(
+                value, [&](auto vector) {
+                    using vector_type =
+                        typename detail::pointee<decltype(vector)>::result_type;
+                    auto host_vec = vector_type::create(host_exec);
+                    vector->convert_to(host_vec);
+                    stream << host_vec->at(0, 0);
+                });
+
         } else {
-            if (is_dense(value)) {
-                auto host_exec = value->get_executor()->get_master();
-                run<ConvertibleTo<matrix::Dense<double>>,
-                    ConvertibleTo<matrix::Dense<std::complex<double>>>>(
-                    value, [&](auto vector) {
-                        using vector_type = typename detail::pointee<
-                            decltype(vector)>::result_type;
-                        auto host_vec = vector_type::create(host_exec);
-                        vector->convert_to(host_vec);
-                        stream << host_vec->at(0, 0);
-                    });
-
-            } else {
-                stream << "<unknown>";
-            }
+            stream << "<unknown>";
         }
     }
 

From 3669147c7ff194b65e26e64b53e924cef4010e73 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 13:11:36 +0200
Subject: [PATCH 063/448] Rename solver_debug to solver_progress

---
 core/CMakeLists.txt                           |  2 +-
 .../{solver_debug.cpp => solver_progress.cpp} | 35 ++++----
 core/test/log/CMakeLists.txt                  |  2 +-
 .../{solver_debug.cpp => solver_progress.cpp} | 80 +++++++++----------
 .../{solver_debug.hpp => solver_progress.hpp} | 16 ++--
 include/ginkgo/ginkgo.hpp                     |  2 +-
 6 files changed, 69 insertions(+), 68 deletions(-)
 rename core/log/{solver_debug.cpp => solver_progress.cpp} (91%)
 rename core/test/log/{solver_debug.cpp => solver_progress.cpp} (69%)
 rename include/ginkgo/core/log/{solver_debug.hpp => solver_progress.hpp} (84%)

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 56d35e8edf0..df8f748b4d3 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -62,7 +62,7 @@ target_sources(${ginkgo_core}
     log/tau.cpp
     log/vtune.cpp
     log/record.cpp
-    log/solver_debug.cpp
+    log/solver_progress.cpp
     log/stream.cpp
     matrix/batch_csr.cpp
     matrix/batch_dense.cpp
diff --git a/core/log/solver_debug.cpp b/core/log/solver_progress.cpp
similarity index 91%
rename from core/log/solver_debug.cpp
rename to core/log/solver_progress.cpp
index b93a7987326..effa0279bba 100644
--- a/core/log/solver_debug.cpp
+++ b/core/log/solver_progress.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "ginkgo/core/log/solver_progress.hpp"
+
 #include <fstream>
 #include <iomanip>
 #include <iostream>
@@ -12,7 +14,6 @@
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/log/logger.hpp>
-#include <ginkgo/core/log/solver_debug.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/solver_base.hpp>
 
@@ -33,8 +34,8 @@ bool is_dense(const LinOp* value)
 }
 
 
-class SolverDebugPrint : public SolverDebug {
-    friend class SolverDebug;
+class SolverProgressPrint : public SolverProgress {
+    friend class SolverProgress;
 
 public:
     /* Internal solver events */
@@ -153,8 +154,8 @@ class SolverDebugPrint : public SolverDebug {
         }
     }
 
-    SolverDebugPrint(std::ostream& output, int precision, int column_width,
-                     char separator)
+    SolverProgressPrint(std::ostream& output, int precision, int column_width,
+                        char separator)
         : output_{&output},
           precision_{precision},
           column_width_{column_width},
@@ -170,8 +171,8 @@ class SolverDebugPrint : public SolverDebug {
 };
 
 
-class SolverDebugStore : public SolverDebug {
-    friend class SolverDebug;
+class SolverProgressStore : public SolverProgress {
+    friend class SolverProgress;
 
 public:
     /* Internal solver events */
@@ -273,7 +274,7 @@ class SolverDebugStore : public SolverDebug {
         store_vector(value, std::to_string(iteration) + "_" + name);
     }
 
-    SolverDebugStore(std::string output_file_prefix, bool binary)
+    SolverProgressStore(std::string output_file_prefix, bool binary)
         : output_file_prefix_{std::move(output_file_prefix)}, binary_{binary}
     {}
 
@@ -285,27 +286,27 @@ class SolverDebugStore : public SolverDebug {
 }  // namespace
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create_scalar_table_writer(
+std::shared_ptr<SolverProgress> SolverProgress::create_scalar_table_writer(
     std::ostream& output, int precision, int column_width)
 {
-    return std::shared_ptr<SolverDebug>{
-        new SolverDebugPrint{output, precision, column_width, '\0'}};
+    return std::shared_ptr<SolverProgress>{
+        new SolverProgressPrint{output, precision, column_width, '\0'}};
 }
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create_scalar_csv_writer(
+std::shared_ptr<SolverProgress> SolverProgress::create_scalar_csv_writer(
     std::ostream& output, int precision, char separator)
 {
-    return std::shared_ptr<SolverDebug>{
-        new SolverDebugPrint{output, precision, 0, separator}};
+    return std::shared_ptr<SolverProgress>{
+        new SolverProgressPrint{output, precision, 0, separator}};
 }
 
 
-std::shared_ptr<SolverDebug> SolverDebug::create_vector_storage(
+std::shared_ptr<SolverProgress> SolverProgress::create_vector_storage(
     std::string output_file_prefix, bool binary)
 {
-    return std::shared_ptr<SolverDebug>{
-        new SolverDebugStore{output_file_prefix, binary}};
+    return std::shared_ptr<SolverProgress>{
+        new SolverProgressStore{output_file_prefix, binary}};
 }
 
 
diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt
index 1231b996f5a..6e8c89ef671 100644
--- a/core/test/log/CMakeLists.txt
+++ b/core/test/log/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 ginkgo_create_test(performance_hint)
 ginkgo_create_test(profiler_hook)
 ginkgo_create_test(record)
-ginkgo_create_test(solver_debug)
+ginkgo_create_test(solver_progress)
 ginkgo_create_test(stream)
diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_progress.cpp
similarity index 69%
rename from core/test/log/solver_debug.cpp
rename to core/test/log/solver_progress.cpp
index 90108116374..f2433779864 100644
--- a/core/test/log/solver_debug.cpp
+++ b/core/test/log/solver_progress.cpp
@@ -7,7 +7,7 @@
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/log/solver_debug.hpp>
+#include <ginkgo/core/log/solver_progress.hpp>
 #include <ginkgo/core/solver/cg.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 
@@ -16,12 +16,12 @@
 
 
 template <typename T>
-class SolverDebug : public ::testing::Test {
+class SolverProgress : public ::testing::Test {
 public:
     using Dense = gko::matrix::Dense<T>;
     using Cg = gko::solver::Cg<T>;
 
-    SolverDebug() : ref{gko::ReferenceExecutor::create()}
+    SolverProgress() : ref{gko::ReferenceExecutor::create()}
     {
         mtx = gko::initialize<Dense>({T{1.0}}, ref);
         in = gko::initialize<Dense>({T{2.0}}, ref);
@@ -67,10 +67,10 @@ class SolverDebug : public ::testing::Test {
     std::unique_ptr<Cg> solver;
 };
 
-TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypes, TypenameNameGenerator);
 
 
-TYPED_TEST(SolverDebug, TableWorks)
+TYPED_TEST(SolverProgress, TableWorks)
 {
     using T = TypeParam;
     std::stringstream ref_ss;
@@ -97,7 +97,7 @@ TYPED_TEST(SolverDebug, TableWorks)
            << std::setw(default_column_width) << T{0.0} << '\n';
     std::stringstream ss;
     this->solver->add_logger(
-        gko::log::SolverDebug::create_scalar_table_writer(ss));
+        gko::log::SolverProgress::create_scalar_table_writer(ss));
 
     this->solver->apply(this->in, this->out);
 
@@ -110,7 +110,7 @@ TYPED_TEST(SolverDebug, TableWorks)
 }
 
 
-TYPED_TEST(SolverDebug, CsvWorks)
+TYPED_TEST(SolverProgress, CsvWorks)
 {
     using T = TypeParam;
     std::stringstream ref_ss;
@@ -125,7 +125,7 @@ TYPED_TEST(SolverDebug, CsvWorks)
            << T{0.0} << '\n';
     std::stringstream ss;
     this->solver->add_logger(
-        gko::log::SolverDebug::create_scalar_csv_writer(ss, 6, ';'));
+        gko::log::SolverProgress::create_scalar_csv_writer(ss, 6, ';'));
 
     this->solver->apply(this->in, this->out);
 
@@ -137,44 +137,44 @@ TYPED_TEST(SolverDebug, CsvWorks)
 }
 
 
-TYPED_TEST(SolverDebug, StorageWorks)
+TYPED_TEST(SolverProgress, StorageWorks)
 {
     using T = TypeParam;
     using Dense = typename TestFixture::Dense;
     auto orig_out = this->out->clone();
     auto init_residual = gko::initialize<Dense>({T{-2.0}}, this->ref);
     std::vector<std::pair<std::string, Dense*>> files{
-        {"solver_debug_test_0_beta", nullptr},
-        {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()},
-        {"solver_debug_test_0_minus_one", nullptr},
-        {"solver_debug_test_0_one", nullptr},
-        {"solver_debug_test_0_p", nullptr},
-        {"solver_debug_test_0_prev_rho", nullptr},
-        {"solver_debug_test_0_q", nullptr},
-        {"solver_debug_test_0_r", nullptr},
-        {"solver_debug_test_0_residual", init_residual.get()},
-        {"solver_debug_test_0_rho", nullptr},
-        {"solver_debug_test_0_solution", orig_out.get()},
-        {"solver_debug_test_0_z", nullptr},
-        {"solver_debug_test_1_beta", orig_out.get()},
-        {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()},
-        {"solver_debug_test_1_minus_one", nullptr},
-        {"solver_debug_test_1_one", nullptr},
-        {"solver_debug_test_1_p", nullptr},
-        {"solver_debug_test_1_prev_rho", nullptr},
-        {"solver_debug_test_1_q", nullptr},
-        {"solver_debug_test_1_r", nullptr},
-        {"solver_debug_test_1_residual", this->zero.get()},
-        {"solver_debug_test_1_rho", nullptr},
-        {"solver_debug_test_1_solution", this->in.get()},
-        {"solver_debug_test_1_z", nullptr},
-        {"solver_debug_test_initial_guess", orig_out.get()},
-        {"solver_debug_test_rhs", this->in.get()},
-        {"solver_debug_test_system_matrix", this->mtx.get()}};
-    this->solver->add_logger(gko::log::SolverDebug::create_vector_storage(
-        "solver_debug_test", false));
-    this->solver->add_logger(gko::log::SolverDebug::create_vector_storage(
-        "solver_debug_test", true));
+        {"solver_progress_test_0_beta", nullptr},
+        {"solver_progress_test_0_implicit_sq_residual_norm", orig_out.get()},
+        {"solver_progress_test_0_minus_one", nullptr},
+        {"solver_progress_test_0_one", nullptr},
+        {"solver_progress_test_0_p", nullptr},
+        {"solver_progress_test_0_prev_rho", nullptr},
+        {"solver_progress_test_0_q", nullptr},
+        {"solver_progress_test_0_r", nullptr},
+        {"solver_progress_test_0_residual", init_residual.get()},
+        {"solver_progress_test_0_rho", nullptr},
+        {"solver_progress_test_0_solution", orig_out.get()},
+        {"solver_progress_test_0_z", nullptr},
+        {"solver_progress_test_1_beta", orig_out.get()},
+        {"solver_progress_test_1_implicit_sq_residual_norm", this->zero.get()},
+        {"solver_progress_test_1_minus_one", nullptr},
+        {"solver_progress_test_1_one", nullptr},
+        {"solver_progress_test_1_p", nullptr},
+        {"solver_progress_test_1_prev_rho", nullptr},
+        {"solver_progress_test_1_q", nullptr},
+        {"solver_progress_test_1_r", nullptr},
+        {"solver_progress_test_1_residual", this->zero.get()},
+        {"solver_progress_test_1_rho", nullptr},
+        {"solver_progress_test_1_solution", this->in.get()},
+        {"solver_progress_test_1_z", nullptr},
+        {"solver_progress_test_initial_guess", orig_out.get()},
+        {"solver_progress_test_rhs", this->in.get()},
+        {"solver_progress_test_system_matrix", this->mtx.get()}};
+    this->solver->add_logger(gko::log::SolverProgress::create_vector_storage(
+        "solver_progress_test", false));
+    this->solver->add_logger(gko::log::SolverProgress::create_vector_storage(
+        "solver_progress_test", true));
 
     this->solver->apply(this->in, this->out);
 
diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_progress.hpp
similarity index 84%
rename from include/ginkgo/core/log/solver_debug.hpp
rename to include/ginkgo/core/log/solver_progress.hpp
index 98db712cc44..71e08fc96c9 100644
--- a/include/ginkgo/core/log/solver_debug.hpp
+++ b/include/ginkgo/core/log/solver_progress.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
-#define GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
+#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_
+#define GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_
 
 
 #include <iosfwd>
@@ -19,9 +19,9 @@ namespace log {
 /**
  * This Logger outputs the value of all scalar values (and potentially vectors)
  * stored internally by the solver after each iteration. It needs to be attached
- * to the solver being debugged.
+ * to the solver being inspected.
  */
-class SolverDebug : public Logger {
+class SolverProgress : public Logger {
 public:
     /**
      * Creates a logger printing the value for all scalar values in the solver
@@ -33,7 +33,7 @@ class SolverDebug : public Logger {
      * @param precision  the number of digits of precision to print
      * @param column_width  the number of characters an output column is wide
      */
-    static std::shared_ptr<SolverDebug> create_scalar_table_writer(
+    static std::shared_ptr<SolverProgress> create_scalar_table_writer(
         std::ostream& output, int precision = 6, int column_width = 12);
 
 
@@ -47,7 +47,7 @@ class SolverDebug : public Logger {
      * @param precision  the number of digits of precision to print
      * @param separator  the character separating columns from each other
      */
-    static std::shared_ptr<SolverDebug> create_scalar_csv_writer(
+    static std::shared_ptr<SolverProgress> create_scalar_csv_writer(
         std::ostream& output, int precision = 6, char separator = ',');
 
 
@@ -65,7 +65,7 @@ class SolverDebug : public Logger {
      *                (lossless), if false write data in the MatrixMarket format
      *                (potentially lossy)
      */
-    static std::shared_ptr<SolverDebug> create_vector_storage(
+    static std::shared_ptr<SolverProgress> create_vector_storage(
         std::string output_file_prefix = "solver_", bool binary = false);
 };
 
@@ -74,4 +74,4 @@ class SolverDebug : public Logger {
 }  // namespace gko
 
 
-#endif  // GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_
+#endif  // GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 2e307792c85..0fab93dcefe 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -87,7 +87,7 @@
 #include <ginkgo/core/log/performance_hint.hpp>
 #include <ginkgo/core/log/profiler_hook.hpp>
 #include <ginkgo/core/log/record.hpp>
-#include <ginkgo/core/log/solver_debug.hpp>
+#include <ginkgo/core/log/solver_progress.hpp>
 #include <ginkgo/core/log/stream.hpp>
 
 #include <ginkgo/core/matrix/batch_csr.hpp>

From c3fb39810d5cbcf98390429a23f7102879ef63c8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 4 Jul 2024 11:06:29 +0200
Subject: [PATCH 064/448] add failing test showing the transposed trs uses a
 different algorithm

---
 reference/test/solver/lower_trs_kernels.cpp | 69 ++++++++++++++++++++-
 reference/test/solver/upper_trs_kernels.cpp | 69 ++++++++++++++++++++-
 2 files changed, 134 insertions(+), 4 deletions(-)

diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index da2e55700f5..3680f19681f 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -54,6 +54,10 @@ class LowerTrs : public ::testing::Test {
                                     {365.0, 97.0, -654.0, 8.0, 91.0}},
                                    exec)),
           lower_trs_factory(Solver::build().on(exec)),
+          lower_trs_syncfree_factory(
+              Solver::build()
+                  .with_algorithm(gko::solver::trisolve_algorithm::syncfree)
+                  .on(exec)),
           lower_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)),
           lower_trs_factory_unit(
               Solver::build().with_unit_diagonal(true).on(exec))
@@ -66,6 +70,7 @@ class LowerTrs : public ::testing::Test {
     std::shared_ptr<Mtx> mtx_big_lower;
     std::shared_ptr<Mtx> mtx_big_general;
     std::unique_ptr<typename Solver::Factory> lower_trs_factory;
+    std::unique_ptr<typename Solver::Factory> lower_trs_syncfree_factory;
     std::unique_ptr<typename Solver::Factory> lower_trs_factory_mrhs;
     std::unique_ptr<typename Solver::Factory> lower_trs_factory_unit;
 };
@@ -348,13 +353,21 @@ TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
     auto solver = this->lower_trs_factory->generate(this->mtx);
+    auto transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->transpose());
 
-    solver->transpose()->apply(b, x);
+    transposed_solver->apply(b, x);
 
     GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
+    // Ensure syncfree (covered by a separate test) is not the default option
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::sparselib);
+    ASSERT_EQ(transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
 }
 
 
@@ -362,13 +375,65 @@ TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
     auto solver = this->lower_trs_factory->generate(this->mtx);
+    auto conj_transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->conj_transpose());
 
-    solver->conj_transpose()->apply(b, x);
+    conj_transposed_solver->apply(b, x);
 
     GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
+    // Ensure syncfree (covered by a separate test) is not the default option
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::sparselib);
+    ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
+}
+
+
+TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystemWithSyncFree)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_syncfree_factory->generate(this->mtx);
+    auto transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->transpose());
+
+    transposed_solver->apply(b, x);
+
+    GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
+    // Ensure that this test uses syncfree
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::syncfree);
+    ASSERT_EQ(transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
+}
+
+
+TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystemWithSyncFree)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->lower_trs_syncfree_factory->generate(this->mtx);
+    auto conj_transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->conj_transpose());
+
+    conj_transposed_solver->apply(b, x);
+
+    GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r<value_type>::value);
+    // Ensure that this test uses syncfree
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::syncfree);
+    ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
 }
 
 
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index dc964e6b83d..a60f3b46079 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -54,6 +54,10 @@ class UpperTrs : public ::testing::Test {
                                     {0.0, 2.0, 0.0, 4.0, 124.0}},
                                    exec)),
           upper_trs_factory(Solver::build().on(exec)),
+          upper_trs_syncfree_factory(
+              Solver::build()
+                  .with_algorithm(gko::solver::trisolve_algorithm::syncfree)
+                  .on(exec)),
           upper_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)),
           upper_trs_factory_unit(
               Solver::build().with_unit_diagonal(true).on(exec))
@@ -66,6 +70,7 @@ class UpperTrs : public ::testing::Test {
     std::shared_ptr<Mtx> mtx_big_upper;
     std::shared_ptr<Mtx> mtx_big_general;
     std::unique_ptr<typename Solver::Factory> upper_trs_factory;
+    std::unique_ptr<typename Solver::Factory> upper_trs_syncfree_factory;
     std::unique_ptr<typename Solver::Factory> upper_trs_factory_mrhs;
     std::unique_ptr<typename Solver::Factory> upper_trs_factory_unit;
 };
@@ -349,13 +354,21 @@ TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
     auto solver = this->upper_trs_factory->generate(this->mtx);
+    auto transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->transpose());
 
-    solver->transpose()->apply(b, x);
+    transposed_solver->apply(b, x);
 
     GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
+    // Ensure that the other test with syncfree is not the default option
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::sparselib);
+    ASSERT_EQ(transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
 }
 
 
@@ -363,13 +376,65 @@ TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
     auto solver = this->upper_trs_factory->generate(this->mtx);
+    auto conj_transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->conj_transpose());
 
-    solver->conj_transpose()->apply(b, x);
+    conj_transposed_solver->apply(b, x);
 
     GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
+    // Ensure that the other test with syncfree is not the default option
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::sparselib);
+    ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
+}
+
+
+TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystemWithSyncFree)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_syncfree_factory->generate(this->mtx);
+    auto transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->transpose());
+
+    transposed_solver->apply(b, x);
+
+    GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
+    // Ensure that this test uses syncfree
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::syncfree);
+    ASSERT_EQ(transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
+}
+
+
+TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystemWithSyncFree)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
+    auto solver = this->upper_trs_syncfree_factory->generate(this->mtx);
+    auto conj_transposed_solver =
+        gko::as<typename Solver::transposed_type>(solver->conj_transpose());
+
+    conj_transposed_solver->apply(b, x);
+
+    GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r<value_type>::value);
+    // Ensure that this test uses syncfree
+    ASSERT_EQ(solver->get_parameters().algorithm,
+              gko::solver::trisolve_algorithm::syncfree);
+    ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm,
+              solver->get_parameters().algorithm);
 }
 
 

From b6a7f61ddcf1bbef98bfb8ddff7039a7ca893ac9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 4 Jul 2024 11:19:28 +0200
Subject: [PATCH 065/448] fix: transposed trs uses the same alg

---
 core/solver/lower_trs.cpp | 2 ++
 core/solver/upper_trs.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp
index e8230625ab3..3048c877dbd 100644
--- a/core/solver/lower_trs.cpp
+++ b/core/solver/lower_trs.cpp
@@ -99,6 +99,7 @@ std::unique_ptr<LinOp> LowerTrs<ValueType, IndexType>::transpose() const
 {
     return transposed_type::build()
         .with_num_rhs(this->parameters_.num_rhs)
+        .with_algorithm(this->parameters_.algorithm)
         .on(this->get_executor())
         ->generate(share(this->get_system_matrix()->transpose()));
 }
@@ -109,6 +110,7 @@ std::unique_ptr<LinOp> LowerTrs<ValueType, IndexType>::conj_transpose() const
 {
     return transposed_type::build()
         .with_num_rhs(this->parameters_.num_rhs)
+        .with_algorithm(this->parameters_.algorithm)
         .on(this->get_executor())
         ->generate(share(this->get_system_matrix()->conj_transpose()));
 }
diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp
index be6fcc71275..c759c119647 100644
--- a/core/solver/upper_trs.cpp
+++ b/core/solver/upper_trs.cpp
@@ -99,6 +99,7 @@ std::unique_ptr<LinOp> UpperTrs<ValueType, IndexType>::transpose() const
 {
     return transposed_type::build()
         .with_num_rhs(this->parameters_.num_rhs)
+        .with_algorithm(this->parameters_.algorithm)
         .on(this->get_executor())
         ->generate(share(this->get_system_matrix()->transpose()));
 }
@@ -109,6 +110,7 @@ std::unique_ptr<LinOp> UpperTrs<ValueType, IndexType>::conj_transpose() const
 {
     return transposed_type::build()
         .with_num_rhs(this->parameters_.num_rhs)
+        .with_algorithm(this->parameters_.algorithm)
         .on(this->get_executor())
         ->generate(share(this->get_system_matrix()->conj_transpose()));
 }

From f0a975bba8fe1369e211ddff6fa7d28015b173a6 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 4 Jul 2024 13:18:04 +0200
Subject: [PATCH 066/448] add the test for inconsistent behavior on diag zero

---
 .../test/preconditioner/jacobi_kernels.dp.cpp | 23 +++++++++++++++++++
 reference/test/preconditioner/jacobi.cpp      | 20 ++++++++++++++++
 test/preconditioner/jacobi_kernels.cpp        | 23 +++++++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
index 6dcfe460c71..ebc92fcb4d3 100644
--- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
@@ -905,4 +905,27 @@ TEST_F(
 }
 
 
+TEST_F(Jacobi, ScalarJacobiHandleZero)
+{
+    auto mtx = gko::share(
+        gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref));
+    auto b = gko::initialize<Vec>({1, 2, 3}, ref);
+    auto x = Vec::create(ref, gko::dim<2>(3, 1));
+    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
+    auto d_b = b->clone(dpcpp);
+    auto d_x = x->clone(dpcpp);
+    auto d_mtx = gko::share(mtx->clone(dpcpp));
+    // Must generate from scratch because the clone copies the inverted
+    // information.
+    auto d_jacobi =
+        Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_mtx);
+
+    // Jacobi uses 1 as the result when diagonal value is zero.
+    jacobi->apply(b, x);
+    d_jacobi->apply(d_b, d_x);
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 0.0);
+}
+
+
 }  // namespace
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 801250a9826..4dd9b5bcb31 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -494,4 +494,24 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision)
 }
 
 
+TYPED_TEST(Jacobi, ScalarJacobiHandleZero)
+{
+    using value_type = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
+    using Bj = typename TestFixture::Bj;
+    auto mtx = gko::share(
+        gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, this->exec));
+    auto b = gko::initialize<Vec>({1, 2, 3}, this->exec);
+    auto x = Vec::create(this->exec, gko::dim<2>(3, 1));
+    auto jacobi = this->scalar_j_factory->generate(mtx);
+
+    // Jacobi uses 1 as the result when diagonal value is zero.
+    jacobi->apply(b, x);
+
+    ASSERT_EQ(x->at(0, 0), value_type{1.0});
+    ASSERT_EQ(x->at(1, 0), value_type{1.0});
+    ASSERT_EQ(x->at(2, 0), value_type{3.0});
+}
+
+
 }  // namespace
diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp
index 23347d8d896..bc50519f7d7 100644
--- a/test/preconditioner/jacobi_kernels.cpp
+++ b/test/preconditioner/jacobi_kernels.cpp
@@ -887,3 +887,26 @@ TEST_F(
 
     GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6);
 }
+
+
+TEST_F(Jacobi, ScalarJacobiHandleZero)
+{
+    auto mtx = gko::share(
+        gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref));
+    auto b = gko::initialize<Vec>({1, 2, 3}, ref);
+    auto x = Vec::create(ref, gko::dim<2>(3, 1));
+    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
+    auto d_b = b->clone(exec);
+    auto d_x = x->clone(exec);
+    auto d_mtx = gko::share(mtx->clone(exec));
+    // Must generate from scratch because the clone copies the inverted
+    // information.
+    auto d_jacobi =
+        Bj::build().with_max_block_size(1u).on(exec)->generate(d_mtx);
+
+    // Jacobi uses 1 as the result when diagonal value is zero.
+    jacobi->apply(b, x);
+    d_jacobi->apply(d_b, d_x);
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 0.0);
+}

From ae21bd8c2d5c1d73fe5372b09e0d03d3d159cb70 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 4 Jul 2024 16:32:42 +0200
Subject: [PATCH 067/448] fix the backend version on the diag zero

---
 common/unified/preconditioner/jacobi_kernels.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp
index b8c19c24f79..dce00fd1366 100644
--- a/common/unified/preconditioner/jacobi_kernels.cpp
+++ b/common/unified/preconditioner/jacobi_kernels.cpp
@@ -42,7 +42,9 @@ void invert_diagonal(std::shared_ptr<const DefaultExecutor> exec,
     run_kernel(
         exec,
         [] GKO_KERNEL(auto elem, auto diag, auto inv_diag) {
-            inv_diag[elem] = safe_divide(one(diag[elem]), diag[elem]);
+            // if the diagonal is zero, we use 1 as the inverted result.
+            inv_diag[elem] = is_zero(diag[elem]) ? one(diag[elem])
+                                                 : one(diag[elem]) / diag[elem];
         },
         diag.get_size(), diag, inv_diag);
 }

From 4740e180a0261912a5cc682a4a50d9f591ff577d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sat, 6 Jul 2024 09:17:01 +0200
Subject: [PATCH 068/448] AAA rules in test

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 dpcpp/test/preconditioner/jacobi_kernels.dp.cpp | 4 ++--
 reference/test/preconditioner/jacobi.cpp        | 2 +-
 test/preconditioner/jacobi_kernels.cpp          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
index ebc92fcb4d3..b8950ed2d2a 100644
--- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
@@ -911,10 +911,11 @@ TEST_F(Jacobi, ScalarJacobiHandleZero)
         gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref));
     auto b = gko::initialize<Vec>({1, 2, 3}, ref);
     auto x = Vec::create(ref, gko::dim<2>(3, 1));
-    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
     auto d_b = b->clone(dpcpp);
     auto d_x = x->clone(dpcpp);
     auto d_mtx = gko::share(mtx->clone(dpcpp));
+
+    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
     // Must generate from scratch because the clone copies the inverted
     // information.
     auto d_jacobi =
@@ -923,7 +924,6 @@ TEST_F(Jacobi, ScalarJacobiHandleZero)
     // Jacobi uses 1 as the result when diagonal value is zero.
     jacobi->apply(b, x);
     d_jacobi->apply(d_b, d_x);
-
     GKO_ASSERT_MTX_NEAR(d_x, x, 0.0);
 }
 
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 4dd9b5bcb31..79c276579ad 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -503,11 +503,11 @@ TYPED_TEST(Jacobi, ScalarJacobiHandleZero)
         gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, this->exec));
     auto b = gko::initialize<Vec>({1, 2, 3}, this->exec);
     auto x = Vec::create(this->exec, gko::dim<2>(3, 1));
+
     auto jacobi = this->scalar_j_factory->generate(mtx);
 
     // Jacobi uses 1 as the result when diagonal value is zero.
     jacobi->apply(b, x);
-
     ASSERT_EQ(x->at(0, 0), value_type{1.0});
     ASSERT_EQ(x->at(1, 0), value_type{1.0});
     ASSERT_EQ(x->at(2, 0), value_type{3.0});
diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp
index bc50519f7d7..8bfd8ace57d 100644
--- a/test/preconditioner/jacobi_kernels.cpp
+++ b/test/preconditioner/jacobi_kernels.cpp
@@ -895,10 +895,11 @@ TEST_F(Jacobi, ScalarJacobiHandleZero)
         gko::initialize<Vec>({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref));
     auto b = gko::initialize<Vec>({1, 2, 3}, ref);
     auto x = Vec::create(ref, gko::dim<2>(3, 1));
-    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
     auto d_b = b->clone(exec);
     auto d_x = x->clone(exec);
     auto d_mtx = gko::share(mtx->clone(exec));
+
+    auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx);
     // Must generate from scratch because the clone copies the inverted
     // information.
     auto d_jacobi =
@@ -907,6 +908,5 @@ TEST_F(Jacobi, ScalarJacobiHandleZero)
     // Jacobi uses 1 as the result when diagonal value is zero.
     jacobi->apply(b, x);
     d_jacobi->apply(d_b, d_x);
-
     GKO_ASSERT_MTX_NEAR(d_x, x, 0.0);
 }

From c4e02a16588e90965518a21ff1dfbbc8ce43bf72 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 15:28:21 +0200
Subject: [PATCH 069/448] require C++17 support

---
 README.md                                     | 10 +++++-----
 accessor/utils.hpp                            | 12 +++++++-----
 cmake/build_helpers.cmake                     |  6 +++---
 cmake/create_test.cmake                       |  4 ++--
 cmake/hip.cmake                               |  2 +-
 common/cuda_hip/components/reduction.hpp      |  2 +-
 common/cuda_hip/reorder/rcm_kernels.cpp       |  1 -
 core/base/extended_float.hpp                  |  1 -
 core/solver/cb_gmres_accessor.hpp             |  1 -
 core/test/accessor/CMakeLists.txt             |  2 +-
 core/test/base/deferred_factory.cpp           |  6 +++---
 cuda/base/pointer_mode_guard.hpp              |  9 ++++++---
 dpcpp/reorder/rcm_kernels.dp.cpp              |  1 -
 examples/custom-matrix-format/CMakeLists.txt  |  2 +-
 hip/base/pointer_mode_guard.hip.hpp           |  9 ++++++---
 include/ginkgo/core/base/abstract_factory.hpp |  3 ++-
 include/ginkgo/core/base/combination.hpp      |  3 ++-
 include/ginkgo/core/base/composition.hpp      |  3 ++-
 include/ginkgo/core/base/math.hpp             | 17 ++++++++---------
 include/ginkgo/core/base/utils_helper.hpp     |  5 ++---
 include/ginkgo/core/distributed/matrix.hpp    |  2 +-
 include/ginkgo/core/log/logger.hpp            |  2 +-
 include/ginkgo/core/preconditioner/ic.hpp     |  1 -
 include/ginkgo/core/preconditioner/ilu.hpp    |  1 -
 include/ginkgo/core/solver/solver_traits.hpp  | 12 +++++++-----
 test/test_install/CMakeLists.txt              |  4 ++--
 26 files changed, 63 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 0328ac43415..d5e22bd0b35 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 
 Ginkgo is a high-performance numerical linear algebra library for many-core systems, with a
 focus on solution of sparse linear systems. It is implemented using modern C++
-(you will need an at least C++14 compliant compiler to build it), with GPU kernels
+(you will need at least a C++17 compliant compiler to build it), with GPU kernels
 implemented for NVIDIA, AMD and Intel GPUs.
 
 ---
@@ -39,7 +39,7 @@ implemented for NVIDIA, AMD and Intel GPUs.
 For Ginkgo core library:
 
 *   _cmake 3.16+_
-*   C++14 compliant compiler, one of:
+*   C++17 compliant compiler, one of:
     *   _gcc 5.5+_
     *   _clang 3.9+_
     *   _Intel compiler 2019+_
@@ -50,7 +50,7 @@ For Ginkgo core library:
 The Ginkgo CUDA module has the following __additional__ requirements:
 
 *   _cmake 3.18+_ (If CUDA was installed through the NVIDIA HPC Toolkit, we require _cmake 3.22+_)
-*   _CUDA 10.1+_ or _NVHPC Package 22.7+_
+*   _CUDA 11.0+_ or _NVHPC Package 22.7+_
 *   Any host compiler restrictions your version of CUDA may impose also apply
     here. For the newest CUDA version, this information can be found in the
     [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
@@ -88,13 +88,13 @@ following:
 ### Windows
 
 *   _cmake 3.16+_
-*   C++14 compliant 64-bit compiler:
+*   C++17 compliant 64-bit compiler:
     *   _MinGW : gcc 5.5+_
     *   _Microsoft Visual Studio : VS 2019+_
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-*   _CUDA 10.1+_
+*   _CUDA 11.0+_
 *   _Microsoft Visual Studio_
 *   Any host compiler restrictions your version of CUDA may impose also apply
     here. For the newest CUDA version, this information can be found in the
diff --git a/accessor/utils.hpp b/accessor/utils.hpp
index 5dcd3e89fcd..14d1e26492a 100644
--- a/accessor/utils.hpp
+++ b/accessor/utils.hpp
@@ -5,9 +5,11 @@
 #ifndef GKO_ACCESSOR_UTILS_HPP_
 #define GKO_ACCESSOR_UTILS_HPP_
 
+
 #include <cassert>
 #include <cinttypes>
 #include <complex>
+#include <type_traits>
 
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
@@ -144,7 +146,7 @@ namespace detail {
  * @internal
  * Tests if a member function `Ref::to_arithmetic_type` exists
  */
-template <typename Ref, typename Dummy = xstd::void_t<>>
+template <typename Ref, typename Dummy = std::void_t<>>
 struct has_to_arithmetic_type : std::false_type {
     static_assert(std::is_same<Dummy, void>::value,
                   "Do not modify the Dummy value!");
@@ -153,7 +155,7 @@ struct has_to_arithmetic_type : std::false_type {
 
 template <typename Ref>
 struct has_to_arithmetic_type<
-    Ref, xstd::void_t<decltype(std::declval<Ref>().to_arithmetic_type())>>
+    Ref, std::void_t<decltype(std::declval<Ref>().to_arithmetic_type())>>
     : std::true_type {
     using type = decltype(std::declval<Ref>().to_arithmetic_type());
 };
@@ -163,14 +165,14 @@ struct has_to_arithmetic_type<
  * @internal
  * Tests if the type `Ref::arithmetic_type` exists
  */
-template <typename Ref, typename Dummy = xstd::void_t<>>
+template <typename Ref, typename Dummy = std::void_t<>>
 struct has_arithmetic_type : std::false_type {
     static_assert(std::is_same<Dummy, void>::value,
                   "Do not modify the Dummy value!");
 };
 
 template <typename Ref>
-struct has_arithmetic_type<Ref, xstd::void_t<typename Ref::arithmetic_type>>
+struct has_arithmetic_type<Ref, std::void_t<typename Ref::arithmetic_type>>
     : std::true_type {};
 
 
@@ -236,7 +238,7 @@ struct has_implicit_cast {
 
 template <typename OutType, typename InType>
 struct has_implicit_cast<OutType, InType,
-                         xstd::void_t<decltype(test_for_implicit_cast<OutType>(
+                         std::void_t<decltype(test_for_implicit_cast<OutType>(
                              std::declval<InType>()))>> {
     static constexpr bool value = true;
 };
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index e8691b77587..f0337839a55 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -18,11 +18,11 @@ function(ginkgo_default_includes name)
 endfunction()
 
 function(ginkgo_compile_features name)
-    target_compile_features("${name}" PUBLIC cxx_std_14)
+    target_compile_features("${name}" PUBLIC cxx_std_17)
     # we set these properties regardless of the enabled backends,
     # because unknown properties are ignored
-    set_target_properties("${name}" PROPERTIES HIP_STANDARD 14)
-    set_target_properties("${name}" PROPERTIES CUDA_STANDARD 14)
+    set_target_properties("${name}" PROPERTIES HIP_STANDARD 17)
+    set_target_properties("${name}" PROPERTIES CUDA_STANDARD 17)
     if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH)
         set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*")
     endif()
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 9f7079f60a3..68f5708e829 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -37,8 +37,8 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix)
     target_compile_features(${test_target_name} PUBLIC cxx_std_14)
     # we set these properties regardless of the enabled backends,
     # because unknown properties are ignored
-    set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 14)
-    set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 14)
+    set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 17)
+    set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 17)
     target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES})
     target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES})
 endfunction()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 327375bfe76..c94117242eb 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -128,4 +128,4 @@ find_package(ROCTX)
 if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -munsafe-fp-atomics -Wno-unused-command-line-argument")
 endif()
-set(CMAKE_HIP_STANDARD 14)
+set(CMAKE_HIP_STANDARD 17)
diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp
index 1968a6d30b6..7c66befa6bd 100644
--- a/common/cuda_hip/components/reduction.hpp
+++ b/common/cuda_hip/components/reduction.hpp
@@ -141,7 +141,7 @@ __device__ void reduce(const Group& __restrict__ group,
  */
 template <
     typename Group, typename ValueType, typename Operator,
-    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
+    typename = std::enable_if_t<group::is_synchronizable_group<Group>::value>>
 __device__ void multireduce(const Group& __restrict__ group,
                             ValueType* __restrict__ data, size_type stride,
                             size_type num, Operator reduce_op = Operator{})
diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp
index 3206fb28c8b..72729db30f1 100644
--- a/common/cuda_hip/reorder/rcm_kernels.cpp
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -16,7 +16,6 @@
 #include <thrust/transform.h>
 
 #include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp
index 2dc60afd329..c14b5d1bd39 100644
--- a/core/base/extended_float.hpp
+++ b/core/base/extended_float.hpp
@@ -9,7 +9,6 @@
 #include <limits>
 #include <type_traits>
 
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp
index 64a7c9a46e5..a5d95793d15 100644
--- a/core/solver/cb_gmres_accessor.hpp
+++ b/core/solver/cb_gmres_accessor.hpp
@@ -16,7 +16,6 @@
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 #include "accessor/range.hpp"
diff --git a/core/test/accessor/CMakeLists.txt b/core/test/accessor/CMakeLists.txt
index 4fd0ff158d0..07da99cc308 100644
--- a/core/test/accessor/CMakeLists.txt
+++ b/core/test/accessor/CMakeLists.txt
@@ -6,7 +6,7 @@ function(create_accessor_test test_name)
         ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
     string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
     add_executable("${TEST_TARGET_NAME}" "${test_name}.cpp")
-    target_compile_features("${TEST_TARGET_NAME}" PUBLIC cxx_std_14)
+    target_compile_features("${TEST_TARGET_NAME}" PUBLIC cxx_std_17)
     target_include_directories("${TEST_TARGET_NAME}"
         PRIVATE
         "${Ginkgo_SOURCE_DIR}"
diff --git a/core/test/base/deferred_factory.cpp b/core/test/base/deferred_factory.cpp
index a1c02103cf8..4b140bfcbc6 100644
--- a/core/test/base/deferred_factory.cpp
+++ b/core/test/base/deferred_factory.cpp
@@ -80,12 +80,12 @@ struct test_impl : std::false_type {};
 
 // specialization for constructor
 template <typename T, typename... Args>
-struct test_impl<gko::xstd::void_t<decltype(T(std::declval<Args>()...))>, T,
-                 Args...> : std::true_type {};
+struct test_impl<std::void_t<decltype(T(std::declval<Args>()...))>, T, Args...>
+    : std::true_type {};
 
 // specialization for DF2 with_factory_list
 template <typename... Args>
-struct test_impl<gko::xstd::void_t<decltype(
+struct test_impl<std::void_t<decltype(
                      DF2::param{}.with_factory_list(std::declval<Args>()...))>,
                  DummyFlag, Args...> : std::true_type {};
 
diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp
index 39af6100c46..56f46fedf40 100644
--- a/cuda/base/pointer_mode_guard.hpp
+++ b/cuda/base/pointer_mode_guard.hpp
@@ -13,7 +13,6 @@
 #include <cusparse.h>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 
 
 namespace gko {
@@ -35,6 +34,7 @@ class pointer_mode_guard {
     pointer_mode_guard(cublasHandle_t& handle)
     {
         l_handle = &handle;
+        uncaught_exceptions_ = std::uncaught_exceptions();
         GKO_ASSERT_NO_CUBLAS_ERRORS(
             cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
     }
@@ -50,7 +50,7 @@ class pointer_mode_guard {
     ~pointer_mode_guard() noexcept(false)
     {
         /* Ignore the error during stack unwinding for this call */
-        if (xstd::uncaught_exception()) {
+        if (std::uncaught_exceptions() > uncaught_exceptions_) {
             cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE);
         } else {
             GKO_ASSERT_NO_CUBLAS_ERRORS(
@@ -59,6 +59,7 @@ class pointer_mode_guard {
     }
 
 private:
+    int uncaught_exceptions_;
     cublasHandle_t* l_handle;
 };
 
@@ -82,6 +83,7 @@ class pointer_mode_guard {
     pointer_mode_guard(cusparseHandle_t handle)
     {
         l_handle = handle;
+        uncaught_exceptions_ = std::uncaught_exceptions();
         GKO_ASSERT_NO_CUSPARSE_ERRORS(
             cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
     }
@@ -97,7 +99,7 @@ class pointer_mode_guard {
     ~pointer_mode_guard() noexcept(false)
     {
         /* Ignore the error during stack unwinding for this call */
-        if (xstd::uncaught_exception()) {
+        if (std::uncaught_exceptions() > uncaught_exceptions_) {
             cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE);
         } else {
             GKO_ASSERT_NO_CUSPARSE_ERRORS(
@@ -106,6 +108,7 @@ class pointer_mode_guard {
     }
 
 private:
+    int uncaught_exceptions_;
     cusparseHandle_t l_handle;
 };
 
diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp
index 350b4c90a6d..2985f1a0dc7 100644
--- a/dpcpp/reorder/rcm_kernels.dp.cpp
+++ b/dpcpp/reorder/rcm_kernels.dp.cpp
@@ -7,7 +7,6 @@
 #include <CL/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt
index ad6b9c2950f..a9ad895a996 100644
--- a/examples/custom-matrix-format/CMakeLists.txt
+++ b/examples/custom-matrix-format/CMakeLists.txt
@@ -12,7 +12,7 @@ if(NOT (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP))
         "This example needs Ginkgo built with CUDA and OpenMP support")
 endif()
 
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu)
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index d14c8468c0b..ea03758e087 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -20,7 +20,6 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 
 #include "common/cuda_hip/base/runtime.hpp"
 
@@ -44,6 +43,7 @@ class pointer_mode_guard {
     pointer_mode_guard(hipblasContext* handle)
     {
         l_handle = handle;
+        uncaught_exceptions_ = std::uncaught_exceptions();
         GKO_ASSERT_NO_HIPBLAS_ERRORS(
             hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(handle),
                                   HIPBLAS_POINTER_MODE_HOST));
@@ -60,7 +60,7 @@ class pointer_mode_guard {
     ~pointer_mode_guard() noexcept(false)
     {
         /* Ignore the error during stack unwinding for this call */
-        if (xstd::uncaught_exception()) {
+        if (std::uncaught_exceptions() > uncaught_exceptions_) {
             hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(l_handle),
                                   HIPBLAS_POINTER_MODE_DEVICE);
         } else {
@@ -71,6 +71,7 @@ class pointer_mode_guard {
     }
 
 private:
+    int uncaught_exceptions_;
     hipblasContext* l_handle;
 };
 
@@ -94,6 +95,7 @@ class pointer_mode_guard {
     pointer_mode_guard(hipsparseContext* handle)
     {
         l_handle = handle;
+        uncaught_exceptions_ = std::uncaught_exceptions();
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(
             hipsparseSetPointerMode(reinterpret_cast<hipsparseHandle_t>(handle),
                                     HIPSPARSE_POINTER_MODE_HOST));
@@ -110,7 +112,7 @@ class pointer_mode_guard {
     ~pointer_mode_guard() noexcept(false)
     {
         /* Ignore the error during stack unwinding for this call */
-        if (xstd::uncaught_exception()) {
+        if (std::uncaught_exceptions() > uncaught_exceptions_) {
             hipsparseSetPointerMode(
                 reinterpret_cast<hipsparseHandle_t>(l_handle),
                 HIPSPARSE_POINTER_MODE_DEVICE);
@@ -122,6 +124,7 @@ class pointer_mode_guard {
     }
 
 private:
+    int uncaught_exceptions_;
     hipsparseContext* l_handle;
 };
 
diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp
index 5c799ab58f1..cbd18cf42d6 100644
--- a/include/ginkgo/core/base/abstract_factory.hpp
+++ b/include/ginkgo/core/base/abstract_factory.hpp
@@ -6,6 +6,7 @@
 #define GKO_PUBLIC_CORE_BASE_ABSTRACT_FACTORY_HPP_
 
 
+#include <type_traits>
 #include <unordered_map>
 
 #include <ginkgo/core/base/polymorphic_object.hpp>
@@ -563,7 +564,7 @@ private:                                                                       \
                                                                                \
 public:                                                                        \
     template <typename... Args,                                                \
-              typename = std::enable_if_t<::gko::xstd::conjunction<            \
+              typename = std::enable_if_t<::std::conjunction<                  \
                   std::is_convertible<Args, ::gko::deferred_factory_parameter< \
                                                 _name##_type>>...>::value>>    \
     auto with_##_name(Args&&... factories)                                     \
diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp
index f3cdea82dcb..a9fb4d565ae 100644
--- a/include/ginkgo/core/base/combination.hpp
+++ b/include/ginkgo/core/base/combination.hpp
@@ -6,6 +6,7 @@
 #define GKO_PUBLIC_CORE_BASE_COMBINATION_HPP_
 
 
+#include <type_traits>
 #include <vector>
 
 #include <ginkgo/core/base/lin_op.hpp>
@@ -136,7 +137,7 @@ class Combination : public EnableLinOp<Combination<ValueType>>,
      */
     template <
         typename CoefficientIterator, typename OperatorIterator,
-        typename = xstd::void_t<
+        typename = std::void_t<
             typename std::iterator_traits<
                 CoefficientIterator>::iterator_category,
             typename std::iterator_traits<OperatorIterator>::iterator_category>>
diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp
index e151e121b56..9c16f8720aa 100644
--- a/include/ginkgo/core/base/composition.hpp
+++ b/include/ginkgo/core/base/composition.hpp
@@ -6,6 +6,7 @@
 #define GKO_PUBLIC_CORE_BASE_COMPOSITION_HPP_
 
 
+#include <type_traits>
 #include <vector>
 
 #include <ginkgo/core/base/executor.hpp>
@@ -125,7 +126,7 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
      * @param end  iterator pointing behind the last operator
      */
     template <typename Iterator,
-              typename = xstd::void_t<
+              typename = std::void_t<
                   typename std::iterator_traits<Iterator>::iterator_category>>
     explicit Composition(Iterator begin, Iterator end)
         : EnableLinOp<Composition>([&] {
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 42eff5a5d40..f7b3b35c3f6 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -877,7 +877,7 @@ namespace detail {
  * @note
  * This basically mirrors the accessor functionality
  */
-template <typename Ref, typename Dummy = xstd::void_t<>>
+template <typename Ref, typename Dummy = std::void_t<>>
 struct has_to_arithmetic_type : std::false_type {
     static_assert(std::is_same<Dummy, void>::value,
                   "Do not modify the Dummy value!");
@@ -886,7 +886,7 @@ struct has_to_arithmetic_type : std::false_type {
 
 template <typename Ref>
 struct has_to_arithmetic_type<
-    Ref, xstd::void_t<decltype(std::declval<Ref>().to_arithmetic_type())>>
+    Ref, std::void_t<decltype(std::declval<Ref>().to_arithmetic_type())>>
     : std::true_type {
     using type = decltype(std::declval<Ref>().to_arithmetic_type());
 };
@@ -896,14 +896,14 @@ struct has_to_arithmetic_type<
  * @internal
  * Tests if the type `Ref::arithmetic_type` exists
  */
-template <typename Ref, typename Dummy = xstd::void_t<>>
+template <typename Ref, typename Dummy = std::void_t<>>
 struct has_arithmetic_type : std::false_type {
     static_assert(std::is_same<Dummy, void>::value,
                   "Do not modify the Dummy value!");
 };
 
 template <typename Ref>
-struct has_arithmetic_type<Ref, xstd::void_t<typename Ref::arithmetic_type>>
+struct has_arithmetic_type<Ref, std::void_t<typename Ref::arithmetic_type>>
     : std::true_type {};
 
 
@@ -1070,17 +1070,16 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x)
  * @return x >= zero<T>() ? x : -x;
  */
 template <typename T>
-GKO_INLINE
-    GKO_ATTRIBUTES constexpr xstd::enable_if_t<!is_complex_s<T>::value, T>
-    abs(const T& x)
+GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<!is_complex_s<T>::value, T>
+abs(const T& x)
 {
     return x >= zero<T>() ? x : -x;
 }
 
 
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr xstd::enable_if_t<is_complex_s<T>::value,
-                                                      remove_complex<T>>
+GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<is_complex_s<T>::value,
+                                                     remove_complex<T>>
 abs(const T& x)
 {
     return sqrt(squared_norm(x));
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index faa74974703..3ea5c9d878d 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -12,7 +12,6 @@
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
@@ -99,7 +98,7 @@ template <typename T, typename = void>
 struct is_clonable_impl : std::false_type {};
 
 template <typename T>
-struct is_clonable_impl<T, xstd::void_t<decltype(std::declval<T>().clone())>>
+struct is_clonable_impl<T, std::void_t<decltype(std::declval<T>().clone())>>
     : std::true_type {};
 
 template <typename T>
@@ -114,7 +113,7 @@ struct is_clonable_to_impl : std::false_type {};
 
 template <typename T>
 struct is_clonable_to_impl<
-    T, xstd::void_t<decltype(std::declval<T>().clone(
+    T, std::void_t<decltype(std::declval<T>().clone(
            std::declval<std::shared_ptr<const Executor>>()))>>
     : std::true_type {};
 
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 4689c3d3381..9e3d45443b1 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -55,7 +55,7 @@ struct is_matrix_type_builder : std::false_type {};
 template <typename Builder, typename ValueType, typename IndexType>
 struct is_matrix_type_builder<
     Builder, ValueType, IndexType,
-    gko::xstd::void_t<
+    std::void_t<
         decltype(std::declval<Builder>().template create<ValueType, IndexType>(
             std::declval<std::shared_ptr<const Executor>>()))>>
     : std::true_type {};
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index 907bc418906..dd9d30249e9 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -796,7 +796,7 @@ class EnableLogging : public PolymorphicBase {
     template <size_type Event, typename ConcreteLoggableT>
     struct propagate_log_helper<
         Event, ConcreteLoggableT,
-        xstd::void_t<
+        std::void_t<
             decltype(std::declval<ConcreteLoggableT>().get_executor())>> {
         template <typename... Args>
         static void propagate_log(const ConcreteLoggableT* loggable,
diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp
index f78e00eea09..aea43af3cf1 100644
--- a/include/ginkgo/core/preconditioner/ic.hpp
+++ b/include/ginkgo/core/preconditioner/ic.hpp
@@ -15,7 +15,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index 869681fc547..1f4be3e3046 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -15,7 +15,6 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
diff --git a/include/ginkgo/core/solver/solver_traits.hpp b/include/ginkgo/core/solver/solver_traits.hpp
index b5c437716b3..d5306f56b08 100644
--- a/include/ginkgo/core/solver/solver_traits.hpp
+++ b/include/ginkgo/core/solver/solver_traits.hpp
@@ -6,7 +6,9 @@
 #define GKO_PUBLIC_CORE_SOLVER_SOLVER_TRAITS_HPP_
 
 
-#include <ginkgo/core/base/std_extensions.hpp>
+#include <type_traits>
+
+
 #include <ginkgo/core/stop/criterion.hpp>
 
 
@@ -33,12 +35,12 @@ struct has_with_criteria : std::false_type {};
  *
  * @internal  The second template parameter (which uses SFINAE) must match
  *            the default value of the general case in order to be accepted
- *            as a specialization, which is why `xstd::void_t` is used.
+ *            as a specialization, which is why `std::void_t` is used.
  */
 template <typename SolverType>
-struct has_with_criteria<
-    SolverType, xstd::void_t<decltype(SolverType::build().with_criteria(
-                    std::shared_ptr<const stop::CriterionFactory>()))>>
+struct has_with_criteria<SolverType,
+                         std::void_t<decltype(SolverType::build().with_criteria(
+                             std::shared_ptr<const stop::CriterionFactory>()))>>
     : std::true_type {};
 
 
diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt
index ee19b8d030e..60fad7cf339 100644
--- a/test/test_install/CMakeLists.txt
+++ b/test/test_install/CMakeLists.txt
@@ -38,7 +38,7 @@ if(GINKGO_BUILD_CUDA)
     enable_language(CUDA)
     configure_file(test_install.cpp test_install.cu COPYONLY)
     add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu)
-    set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 14)
+    set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 17)
     target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1)
     target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE})
     target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo)
@@ -49,7 +49,7 @@ if(GINKGO_BUILD_HIP)
     configure_file(test_install.cpp test_install.hip.cpp COPYONLY)
     set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP)
     add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp)
-    set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 14)
+    set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 17)
 
     target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo)
     target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1)

From e65798d116701a5815851dda20b990e41b74d75c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 15:31:40 +0200
Subject: [PATCH 070/448] address TODOs

---
 core/log/profiler_hook.hpp                | 8 +++-----
 include/ginkgo/core/log/profiler_hook.hpp | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp
index 3f4baf80db1..e6104bba932 100644
--- a/core/log/profiler_hook.hpp
+++ b/core/log/profiler_hook.hpp
@@ -135,14 +135,12 @@ class profiling_scope_guard {
     profiling_scope_guard(const char* name)
     {
         auto functions = log::create_vtune_fns();
-        guard_ = std::make_unique<log::profiling_scope_guard>(
-            name, log::profile_event_category::internal,
-            std::move(functions.first), std::move(functions.second));
+        guard_.emplace(name, log::profile_event_category::internal,
+                       std::move(functions.first), std::move(functions.second));
     }
 
 private:
-    // TODO17: use std::optional
-    std::unique_ptr<log::profiling_scope_guard> guard_;
+    std::optional<log::profiling_scope_guard> guard_;
 };
 
 
diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp
index ce5e8831f1c..5db0e1275f5 100644
--- a/include/ginkgo/core/log/profiler_hook.hpp
+++ b/include/ginkgo/core/log/profiler_hook.hpp
@@ -419,7 +419,6 @@ class profiling_scope_guard {
 
     profiling_scope_guard(const profiling_scope_guard&) = delete;
 
-    // TODO17: unnecessary with guaranteed RVO
     /** Move-constructs from another scope guard, other will be left empty. */
     profiling_scope_guard(profiling_scope_guard&& other);
 

From aa15b066b12a2c99511a1538fcc735db0f788dee Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 16:14:40 +0200
Subject: [PATCH 071/448] add missing include

---
 core/log/profiler_hook.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp
index e6104bba932..31d1d1b5a83 100644
--- a/core/log/profiler_hook.hpp
+++ b/core/log/profiler_hook.hpp
@@ -9,6 +9,9 @@
 #include <ginkgo/core/log/profiler_hook.hpp>
 
 
+#include <optional>
+
+
 namespace gko {
 namespace log {
 

From a5b2a07050cc5305524aa10c001e9fc58a2fe41a Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 16:27:34 +0200
Subject: [PATCH 072/448] set C++ version where necessary, remove checks where
 unnecessary

Ginkgo has the PUBLIC compile feature cxx_std_17,
so we don't need to set it in tests.
pkg-config doesn't propagate C++ standards, so test_pkgconfig sets it explicitly.
---
 benchmark/CMakeLists.txt             | 1 -
 cmake/create_test.cmake              | 3 ---
 dpcpp/CMakeLists.txt                 | 1 -
 test/test_exportbuild/CMakeLists.txt | 1 -
 test/test_install/CMakeLists.txt     | 1 -
 test/test_pkgconfig/CMakeLists.txt   | 2 +-
 test/test_subdir/CMakeLists.txt      | 1 -
 7 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 306655d2315..de6e74d464c 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -37,7 +37,6 @@ function(ginkgo_benchmark_onemkl_linops type def)
     add_library(onemkl_linops_${type} utils/dpcpp_linops.dp.cpp)
     # make the dependency public to catch issues
     target_compile_definitions(onemkl_linops_${type} PUBLIC ${def})
-    target_compile_features(onemkl_linops_${type} PRIVATE cxx_std_17)
     target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP)
 endfunction()
 
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 68f5708e829..9ab0c40de20 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -34,7 +34,6 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix)
             target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix})
         endif()
     endif()
-    target_compile_features(${test_target_name} PUBLIC cxx_std_14)
     # we set these properties regardless of the enabled backends,
     # because unknown properties are ignored
     set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 17)
@@ -139,7 +138,6 @@ endfunction(ginkgo_create_test)
 function(ginkgo_create_dpcpp_test test_name)
     ginkgo_build_test_name(${test_name} test_target_name)
     add_executable(${test_target_name} ${test_name}.dp.cpp)
-    target_compile_features(${test_target_name} PUBLIC cxx_std_17)
     target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS})
     gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp)
     target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel)
@@ -270,7 +268,6 @@ function(ginkgo_create_common_device_test test_name)
     ginkgo_build_test_name(${test_name} test_target_name)
     if(GINKGO_BUILD_SYCL)
         ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
-        target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17)
         target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS})
         # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property.
         configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY)
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index ee373243842..851ef9a3dc6 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -104,7 +104,6 @@ gko_add_sycl_to_target(TARGET ginkgo_dpcpp)
 # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating
 # find_package(MKL) everywhere when linking ginkgo (see the MKL example
 # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html)
-target_compile_features(ginkgo_dpcpp PUBLIC cxx_std_17)
 target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all)
 # When building ginkgo as a static library, we need to use dpcpp and per_kernel
 # link option when the program uses a dpcpp related function.
diff --git a/test/test_exportbuild/CMakeLists.txt b/test/test_exportbuild/CMakeLists.txt
index c65f964d9a0..cb8f0b1225c 100644
--- a/test/test_exportbuild/CMakeLists.txt
+++ b/test/test_exportbuild/CMakeLists.txt
@@ -7,5 +7,4 @@ find_package(Ginkgo REQUIRED
 # Here, we use test install without any data. We instantiate the
 # interface only.
 add_executable(test_exportbuild ../test_install/test_install.cpp)
-target_compile_features(test_exportbuild PUBLIC cxx_std_14)
 target_link_libraries(test_exportbuild PRIVATE Ginkgo::ginkgo)
diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt
index 60fad7cf339..285c21f271b 100644
--- a/test/test_install/CMakeLists.txt
+++ b/test/test_install/CMakeLists.txt
@@ -26,7 +26,6 @@ if (GINKGO_BUILD_REFERENCE)
     set(HAS_REFERENCE 1)
 endif()
 add_executable(test_install test_install.cpp)
-target_compile_features(test_install PUBLIC cxx_std_14)
 target_compile_definitions(test_install PRIVATE HAS_REFERENCE=${HAS_REFERENCE})
 target_link_libraries(test_install PRIVATE Ginkgo::ginkgo)
 if(GINKGO_BUILD_MPI)
diff --git a/test/test_pkgconfig/CMakeLists.txt b/test/test_pkgconfig/CMakeLists.txt
index e904f997f26..12b9fc4dc26 100644
--- a/test/test_pkgconfig/CMakeLists.txt
+++ b/test/test_pkgconfig/CMakeLists.txt
@@ -8,7 +8,7 @@ pkg_check_modules(GINKGO REQUIRED IMPORTED_TARGET ginkgo)
 # Here, we use test install without any data. We instantiate the
 # interface only.
 add_executable(test_pkgconfig ../test_install/test_install.cpp)
-target_compile_features(test_pkgconfig PUBLIC cxx_std_14)
+target_compile_features(test_pkgconfig PUBLIC cxx_std_17)
 # CMake PkgConfig only puts the -l, -L, and -framework into link_libraries and others into link_options
 # When linking the target, the linking option will be before the compiled object to lead the linking error
 set_property(TARGET PkgConfig::GINKGO PROPERTY INTERFACE_LINK_LIBRARIES "${GINKGO_LDFLAGS}")
diff --git a/test/test_subdir/CMakeLists.txt b/test/test_subdir/CMakeLists.txt
index dcf846f4adc..00ae3bc07e2 100644
--- a/test/test_subdir/CMakeLists.txt
+++ b/test/test_subdir/CMakeLists.txt
@@ -5,5 +5,4 @@ file(CREATE_LINK "${CMAKE_CURRENT_SOURCE_DIR}/../.." "${CMAKE_CURRENT_BINARY_DIR
 add_subdirectory("${CMAKE_CURRENT_BINARY_DIR}/ginkgo")
 
 add_executable(test_subdir ../test_install/test_install.cpp)
-target_compile_features(test_subdir PUBLIC cxx_std_14)
 target_link_libraries(test_subdir PRIVATE Ginkgo::ginkgo)

From 8932e1287e818a864998800a8f07b0728fd0de42 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 17:23:14 +0200
Subject: [PATCH 073/448] remove unsupported compilers

---
 .gitlab-ci.yml    | 160 ++++------------------------------------------
 .gitlab/image.yml |  27 --------
 2 files changed, 14 insertions(+), 173 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1866f16406a..88748c95b79 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,138 +93,6 @@ trigger_pipeline:
 
 # Build jobs
 # Job with example runs.
-# cuda 10.1 and friends
-# Build CUDA NVIDIA without omp
-# Make sure that our jobs run when HWLOC is
-# forcibly switched off
-build/cuda101/nompi/clang/cuda_wo_omp/release/shared:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
-  variables:
-    CXX_COMPILER: "clang++"
-    BUILD_CUDA: "ON"
-    BUILD_HWLOC: "OFF"
-    BUILD_TYPE: "Release"
-    CUDA_ARCH: 35
-
-# Job with example runs.
-# Also explicitly test PAPI SDE
-build/cuda101/openmpi/gcc/all/debug/shared:
-  extends:
-    - .build_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_MPI: "ON"
-    MPI_AS_ROOT: "ON"
-    BUILD_TYPE: "Debug"
-    BUILD_PAPI_SDE: "ON"
-    CUDA_ARCH: 35
-
-build/cuda101/nompi/clang/all/release/static:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
-  variables:
-    CXX_COMPILER: "clang++"
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OFF"
-    CUDA_ARCH: 35
-
-# clang-cuda with cuda 10.1 and friends
-#build/clang-cuda101/openmpi/clang/cuda/release/shared:
-#  extends:
-#    - .build_and_test_template
-#    - .default_variables
-#    - .quick_test_condition
-#    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
-#  variables:
-#    CXX_COMPILER: "clang++"
-#    CUDA_COMPILER: "clang++"
-#    BUILD_OMP: "ON"
-#    BUILD_CUDA: "ON"
-#    BUILD_MPI: "ON"
-#    MPI_AS_ROOT: "ON"
-#    BUILD_HIP: "OFF"
-#    BUILD_TYPE: "Release"
-
-
-#build/clang-cuda101/nompi/clang/cuda/debug/static:
-#  extends:
-#    - .build_and_test_template
-#    - .default_variables
-#    - .full_test_condition
-#    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
-#  variables:
-#    CXX_COMPILER: "clang++"
-#    CUDA_COMPILER: "clang++"
-#    BUILD_OMP: "ON"
-#    BUILD_CUDA: "ON"
-#    BUILD_TYPE: "Debug"
-#    FAST_TESTS: "ON"
-#    BUILD_SHARED_LIBS: "OFF"
-
-
-# cuda 10.2 and friends
-
-# works when there is no hwloc and tpl hwloc is also switched off.
-build/cuda102/nompi/gcc/all/debug/shared:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Debug"
-    FAST_TESTS: "ON"
-    BUILD_HWLOC: "OFF"
-    CUDA_ARCH: 35
-
-# Use TPL hwloc when no system hwloc is available
-build/cuda102/nompi/clang/all/release/static:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019
-  variables:
-    CXX_COMPILER: "clang++"
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OFF"
-    CUDA_ARCH: 35
-    BUILD_HWLOC: "OFF"
-
-build/cuda102/nompi/intel/cuda/debug/static:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019
-  variables:
-    CXX_COMPILER: "icpc"
-    CXX_FLAGS: ""
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Debug"
-    FAST_TESTS: "ON"
-    BUILD_SHARED_LIBS: "OFF"
-    CUDA_ARCH: 35
-    BUILD_HWLOC: "OFF"
-
 # cuda 11.0 and friends on HoreKa with tests
 build/cuda110/mvapich2/gcc/cuda/debug/shared:
   extends:
@@ -521,13 +389,13 @@ build/nocuda/openmpi/clang/omp/glibcxx-debug-release/shared:
     # The tests are prohibitively slow in Debug
     BUILD_TYPE: "Release"
 
-# nocuda with the oldest supported compiler
+# nocuda with old compiler
 build/nocuda/nompi/gcc/omp/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019
+    - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     BUILD_OMP: "ON"
     BUILD_TYPE: "Release"
@@ -538,7 +406,7 @@ build/nocuda-nomixed/nompi/clang/omp/release/static:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019
+    - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
@@ -668,7 +536,7 @@ warnings:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -682,7 +550,7 @@ no-circular-deps:
     - .build_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -709,7 +577,7 @@ clang-tidy:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -722,7 +590,7 @@ iwyu:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -737,7 +605,7 @@ sonarqube_cov_:
     - .default_variables
     - .quick_test_short_lived_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   tags:
     - private_ci
     - controller
@@ -773,7 +641,7 @@ sonarqube_cov:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   tags:
     - private_ci
     - controller
@@ -836,7 +704,7 @@ threadsanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   script:
     - LD_PRELOAD=/usr/local/lib/libomp.so
       CC=clang CXX=clang++
@@ -851,7 +719,7 @@ leaksanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   script:
     - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN
       -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer
@@ -862,7 +730,7 @@ addresssanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   script:
     - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN
       -DCTEST_MEMORYCHECK_TYPE=AddressSanitizer
@@ -873,7 +741,7 @@ undefinedsanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   script:
     # the Gold linker is required because of a linker flag issues given by UBsan
     # in the Ubuntu setup we are using.
@@ -886,7 +754,7 @@ cudamemcheck:
     - .before_script_template
     - .default_variables
     - .deploy_condition
-  image: ginkgohub/cuda:101-openmpi-gnu8-llvm13-intel2019
+  image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
   tags:
     - private_ci
     - nvidia-gpu
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index da548066a86..60521044d7f 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -17,33 +17,6 @@
     - cpu
     - amdci
 
-.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019:
-  image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2019
-  tags:
-    - private_ci
-    - cpu
-    - controller
-
-.use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
-  image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
-  tags:
-    - private_ci
-    - controller
-    - cpu
-
-.use_gko-cuda101-openmpi-gnu8-llvm13-intel2019:
-  image: ginkgohub/cuda:101-openmpi-gnu8-llvm13-intel2019
-  tags:
-    - private_ci
-    - nvidia-gpu
-
-.use_gko-cuda102-nompi-gnu8-llvm8-intel2019:
-  image: ginkgohub/cuda:102-nompi-gnu8-llvm8-intel2019
-  tags:
-    - private_ci
-    - controller
-    - cpu
-
 .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020:
   image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
   tags:

From 63253cb37b7a6ef5aebe997c76505c1807513cfd Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 4 Jul 2024 14:09:50 +0200
Subject: [PATCH 074/448] use compile features for specifying HIP/CUDA standard
 version

---
 cmake/build_helpers.cmake                    | 10 ++++++----
 examples/custom-matrix-format/CMakeLists.txt |  3 ---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index f0337839a55..0985f089382 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -19,10 +19,12 @@ endfunction()
 
 function(ginkgo_compile_features name)
     target_compile_features("${name}" PUBLIC cxx_std_17)
-    # we set these properties regardless of the enabled backends,
-    # because unknown properties are ignored
-    set_target_properties("${name}" PROPERTIES HIP_STANDARD 17)
-    set_target_properties("${name}" PROPERTIES CUDA_STANDARD 17)
+    if (GINKGO_BUILD_CUDA)
+        target_compile_features("${name}" PUBLIC cuda_std_17)
+    endif()
+    if (GINKGO_BUILD_HIP)
+        target_compile_features("${name}" PUBLIC hip_std_17)
+    endif()
     if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH)
         set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*")
     endif()
diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt
index a9ad895a996..9a1280ff9f5 100644
--- a/examples/custom-matrix-format/CMakeLists.txt
+++ b/examples/custom-matrix-format/CMakeLists.txt
@@ -12,9 +12,6 @@ if(NOT (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP))
         "This example needs Ginkgo built with CUDA and OpenMP support")
 endif()
 
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-
 add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu)
 target_link_libraries(custom-matrix-format Ginkgo::ginkgo OpenMP::OpenMP_CXX)
 

From 99748647acdb1e3ba893bf1e5ba627a736441f90 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 11:47:01 +0200
Subject: [PATCH 075/448] update version requirements

---
 README.md | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index d5e22bd0b35..74fd6a0f57e 100644
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ For Ginkgo core library:
 
 *   _cmake 3.16+_
 *   C++17 compliant compiler, one of:
-    *   _gcc 5.5+_
-    *   _clang 3.9+_
+    *   _gcc 7+_
+    *   _clang 5+_
     *   _Intel compiler 2019+_
-    *   _Apple Clang 14.0_ is tested. Earlier versions might also work.
+    *   _Apple Clang 15.0_ is tested. Earlier versions might also work.
     *   _Cray Compiler 14.0.1+_
     *   _NVHPC Compiler 22.7+_
 
@@ -59,9 +59,7 @@ The Ginkgo CUDA module has the following __additional__ requirements:
 The Ginkgo HIP module has the following __additional__ requirements:
 
 * _ROCm 4.5+_
-*    the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
-    * _AMD_ backend (using the `clang` compiler)
-    * _10.1 <= CUDA < 11_ backend
+* the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with the ROCm backend
 * if the hipFFT package is available, it is used to implement the FFT LinOps.
 * _cmake 3.21+_
 
@@ -69,7 +67,6 @@ The Ginkgo DPC++(SYCL) module has the following __additional__ requirements:
 
 * _oneAPI 2023.1+_
 * Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER`
-* `c++17` is used to compile Ginkgo
 * The following oneAPI packages should be available:
     * oneMKL
     * oneDPL
@@ -81,7 +78,7 @@ The Ginkgo MPI module has the following __additional__ requirements:
 In addition, if you want to contribute code to Ginkgo, you will also need the
 following:
 
-*   _clang-format 8.0.0+_ (ships as part of _clang_)
+*   _clang-format 14_ (downloaded automatically by `pre-commit`)
 *   _clang-tidy_ (optional, when setting the flag `-DGINKGO_WITH_CLANG_TIDY=ON`)
 *   _iwyu_ (Include What You Use, optional, when setting the flag `-DGINKGO_WITH_IWYU=ON`)
 
@@ -89,7 +86,7 @@ following:
 
 *   _cmake 3.16+_
 *   C++17 compliant 64-bit compiler:
-    *   _MinGW : gcc 5.5+_
+    *   _MinGW : gcc 7+_
     *   _Microsoft Visual Studio : VS 2019+_
 
 The Ginkgo CUDA module has the following __additional__ requirements:

From 5bed30de4666b1414214823e49acdd7a929fc304 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 11:51:16 +0200
Subject: [PATCH 076/448] remove older std_extensions

---
 cuda/base/pointer_mode_guard.hpp            |  2 +-
 include/ginkgo/core/base/std_extensions.hpp | 39 ++++-----------------
 2 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp
index 56f46fedf40..6340b98eb6f 100644
--- a/cuda/base/pointer_mode_guard.hpp
+++ b/cuda/base/pointer_mode_guard.hpp
@@ -50,7 +50,7 @@ class pointer_mode_guard {
     ~pointer_mode_guard() noexcept(false)
     {
         /* Ignore the error during stack unwinding for this call */
-        if (std::uncaught_exception() > uncaught_exceptions_) {
+        if (std::uncaught_exceptions() > uncaught_exceptions_) {
             cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE);
         } else {
             GKO_ASSERT_NO_CUBLAS_ERRORS(
diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp
index 85857873f24..842ad86a23f 100644
--- a/include/ginkgo/core/base/std_extensions.hpp
+++ b/include/ginkgo/core/base/std_extensions.hpp
@@ -11,6 +11,8 @@
 #include <memory>
 #include <type_traits>
 
+#include "ginkgo/core/base/types.hpp"
+
 
 // This header provides implementations of useful utilities introduced into the
 // C++ standard after C++14 (e.g. C++17 and C++20).
@@ -25,33 +27,12 @@ namespace gko {
  * @ingroup xstd
  */
 namespace xstd {
-namespace detail {
-
-
-template <typename... Ts>
-struct make_void {
-    using type = void;
-};
-
-
-}  // namespace detail
-
-
-// Added in C++17
 template <typename... Ts>
-using void_t = typename detail::make_void<Ts...>::type;
+using void_t = std::void_t<Ts...>;
 
 
-// Disable deprecation warnings when using standard > C++14
-inline bool uncaught_exception() noexcept
-{
-// MSVC uses _MSVC_LANG as __cplusplus
-#if (defined(_MSVC_LANG) && _MSVC_LANG > 201402L) || __cplusplus > 201402L
-    return std::uncaught_exceptions() > 0;
-#else
-    return std::uncaught_exception();
-#endif
-}
+GKO_DEPRECATED("use std::uncaught_exceptions")
+inline bool uncaught_exception() noexcept { return std::uncaught_exception(); }
 
 
 // Kept for backward compatibility.
@@ -101,14 +82,8 @@ constexpr bool less_equal(const T&& lhs, const T&& rhs)
 }
 
 
-// available in <type_traits> with C++17
-template <class...>
-struct conjunction : std::true_type {};
-template <class B1>
-struct conjunction<B1> : B1 {};
-template <class B1, class... Bn>
-struct conjunction<B1, Bn...>
-    : std::conditional_t<bool(B1::value), conjunction<Bn...>, B1> {};
+template <class... Ts>
+using conjunction = std::conjunction<Ts...>;
 
 
 }  // namespace xstd

From de37ab9e8cdb0912ec7171ccbff6b458c6576b0d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 11:53:13 +0200
Subject: [PATCH 077/448] remove unsupported Intel + CUDA build

---
 .gitlab-ci.yml | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 88748c95b79..687b517bf78 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -159,40 +159,6 @@ test/cuda110/nompi/clang/cuda/release/static:
   needs: [ "build/cuda110/nompi/clang/cuda/release/static" ]
 
 
-build/cuda110/nompi/intel/cuda/debug/static:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    CXX_COMPILER: "icpc"
-    CXX_FLAGS: ""
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Debug"
-    FAST_TESTS: "ON"
-    BUILD_SHARED_LIBS: "OFF"
-    CUDA_ARCH: 80
-    USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}"
-    KEEP_CONTAINER: "ON"
-    USE_SLURM: 0
-
-test/cuda110/nompi/intel/cuda/debug/static:
-  extends:
-    - .horeka_test_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}"
-    SLURM_PARTITION: "accelerated"
-    SLURM_GRES: "gpu:4"
-    SLURM_TIME: "02:00:00"
-  dependencies: null
-  needs: [ "build/cuda110/nompi/intel/cuda/debug/static" ]
-
-
 # cuda 11.4 and friends
 build/cuda114/nompi/gcc/cuda/debug/shared:
   extends:

From 43f1dcb964d2fd172d75b52904cae6482bf8db72 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 5 Jul 2024 12:22:59 +0200
Subject: [PATCH 078/448] review updates and formatting

---
 .gitlab-ci.yml                               | 2 +-
 core/log/profiler_hook.cpp                   | 7 -------
 core/log/profiler_hook.hpp                   | 5 ++---
 include/ginkgo/core/base/std_extensions.hpp  | 5 ++++-
 include/ginkgo/core/log/profiler_hook.hpp    | 3 +--
 include/ginkgo/core/solver/solver_traits.hpp | 1 -
 6 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 687b517bf78..8fd46cac12f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -720,7 +720,7 @@ cudamemcheck:
     - .before_script_template
     - .default_variables
     - .deploy_condition
-  image: use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+  image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
   tags:
     - private_ci
     - nvidia-gpu
diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp
index 7cb4f807919..e3ed0ad8299 100644
--- a/core/log/profiler_hook.cpp
+++ b/core/log/profiler_hook.cpp
@@ -431,13 +431,6 @@ profiling_scope_guard::~profiling_scope_guard()
     }
 }
 
-profiling_scope_guard::profiling_scope_guard(profiling_scope_guard&& other)
-    : empty_{std::exchange(other.empty_, true)},
-      name_{std::exchange(other.name_, nullptr)},
-      category_{other.category_},
-      end_{std::move(other.end_)}
-{}
-
 
 }  // namespace log
 }  // namespace gko
diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp
index 31d1d1b5a83..c4e31c76ef3 100644
--- a/core/log/profiler_hook.hpp
+++ b/core/log/profiler_hook.hpp
@@ -6,11 +6,10 @@
 #define GKO_CORE_LOG_PROFILER_HOOK_HPP_
 
 
-#include <ginkgo/core/log/profiler_hook.hpp>
-
-
 #include <optional>
 
+#include <ginkgo/core/log/profiler_hook.hpp>
+
 
 namespace gko {
 namespace log {
diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp
index 842ad86a23f..893b2b0d865 100644
--- a/include/ginkgo/core/base/std_extensions.hpp
+++ b/include/ginkgo/core/base/std_extensions.hpp
@@ -32,7 +32,10 @@ using void_t = std::void_t<Ts...>;
 
 
 GKO_DEPRECATED("use std::uncaught_exceptions")
-inline bool uncaught_exception() noexcept { return std::uncaught_exception(); }
+inline bool uncaught_exception() noexcept
+{
+    return std::uncaught_exceptions() > 0;
+}
 
 
 // Kept for backward compatibility.
diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp
index 5db0e1275f5..c5dc9dcbab6 100644
--- a/include/ginkgo/core/log/profiler_hook.hpp
+++ b/include/ginkgo/core/log/profiler_hook.hpp
@@ -419,8 +419,7 @@ class profiling_scope_guard {
 
     profiling_scope_guard(const profiling_scope_guard&) = delete;
 
-    /** Move-constructs from another scope guard, other will be left empty. */
-    profiling_scope_guard(profiling_scope_guard&& other);
+    profiling_scope_guard(profiling_scope_guard&& other) = delete;
 
     profiling_scope_guard& operator=(const profiling_scope_guard&) = delete;
 
diff --git a/include/ginkgo/core/solver/solver_traits.hpp b/include/ginkgo/core/solver/solver_traits.hpp
index d5306f56b08..6209cad3e90 100644
--- a/include/ginkgo/core/solver/solver_traits.hpp
+++ b/include/ginkgo/core/solver/solver_traits.hpp
@@ -8,7 +8,6 @@
 
 #include <type_traits>
 
-
 #include <ginkgo/core/stop/criterion.hpp>
 
 

From d7066224c541356e19c703c9f0e07d361814702c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Jul 2024 15:17:07 +0200
Subject: [PATCH 079/448] add build-only job for SM 3.5

---
 .gitlab-ci.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8fd46cac12f..055a7988a0c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -175,6 +175,25 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
     CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
+  
+
+build/cuda114/nompi/clang/cuda/release/shared:
+  extends:
+    - .build_template
+    - .default_variables
+    - .quick_test_condition
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
+  variables:
+    CXX_COMPILER: "clang++"
+    CUDA_ARCH: 35
+    BUILD_OMP: "ON"
+    BUILD_CUDA: "ON"
+    BUILD_TYPE: "Release"
+    FAST_TESTS: "ON"
+    # fix gtest issue https://github.com/google/googletest/issues/3514
+    CXX_FLAGS: "-Wno-error=maybe-uninitialized"
+    # disable spurious unused argument warning
+    EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
 
 
 # nvhpc and friends

From f148900eab1c0c48ec20522b3b31a4cd16748f69 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Jul 2024 16:48:47 +0200
Subject: [PATCH 080/448] remove warning flags

---
 .gitlab-ci.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 055a7988a0c..03e4b5ad4d0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -190,8 +190,6 @@ build/cuda114/nompi/clang/cuda/release/shared:
     BUILD_CUDA: "ON"
     BUILD_TYPE: "Release"
     FAST_TESTS: "ON"
-    # fix gtest issue https://github.com/google/googletest/issues/3514
-    CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
 

From b46a653a528965b1064142bb808b3383bde4b66b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Jul 2024 21:28:18 +0200
Subject: [PATCH 081/448] move to a compatible nvcc/clang combination

nvcc with clang++ host compiler seems incompatible with libstdc++-10
---
 .gitlab-ci.yml | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 03e4b5ad4d0..2f8e3a892a5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -157,39 +157,39 @@ test/cuda110/nompi/clang/cuda/release/static:
     SLURM_TIME: "01:30:00"
   dependencies: null
   needs: [ "build/cuda110/nompi/clang/cuda/release/static" ]
+  
 
-
-# cuda 11.4 and friends
-build/cuda114/nompi/gcc/cuda/debug/shared:
+build/cuda110/nompi/clang/cuda/release/shared:
   extends:
-    - .build_and_test_template
+    - .build_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko_cuda114-openmpi-gnu10-llvm12
+    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
   variables:
+    CXX_COMPILER: "clang++"
+    CUDA_ARCH: 52
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: "Debug"
+    BUILD_TYPE: "Release"
     FAST_TESTS: "ON"
-    # fix gtest issue https://github.com/google/googletest/issues/3514
-    CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
-  
 
-build/cuda114/nompi/clang/cuda/release/shared:
+
+# cuda 11.4 and friends
+build/cuda114/nompi/gcc/cuda/debug/shared:
   extends:
-    - .build_template
+    - .build_and_test_template
     - .default_variables
     - .quick_test_condition
     - .use_gko_cuda114-openmpi-gnu10-llvm12
   variables:
-    CXX_COMPILER: "clang++"
-    CUDA_ARCH: 35
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: "Release"
+    BUILD_TYPE: "Debug"
     FAST_TESTS: "ON"
+    # fix gtest issue https://github.com/google/googletest/issues/3514
+    CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
 

From 1c039d3694a72db3f549c23e2a5801134bed06fc Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 15:28:59 +0200
Subject: [PATCH 082/448] add device support to zip_iterator

---
 core/base/iterator_factory.hpp                | 363 +++++++++++++++---
 core/test/base/iterator_factory.cpp           |  19 +-
 omp/distributed/partition_helpers_kernels.cpp |   7 +-
 omp/matrix/csr_kernels.cpp                    |   5 +-
 omp/matrix/fbcsr_kernels.cpp                  |   5 +-
 omp/multigrid/pgm_kernels.cpp                 |   3 +-
 .../distributed/partition_helpers_kernels.cpp |  13 +-
 reference/matrix/csr_kernels.cpp              |   5 +-
 reference/matrix/fbcsr_kernels.cpp            |   5 +-
 reference/multigrid/pgm_kernels.cpp           |   3 +-
 test/base/CMakeLists.txt                      |   1 +
 test/base/iterator_factory.cpp                |  69 ++++
 12 files changed, 410 insertions(+), 88 deletions(-)
 create mode 100644 test/base/iterator_factory.cpp

diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index 3d224836b1a..938d705b04d 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -24,6 +24,234 @@ template <typename... Iterators>
 class zip_iterator;
 
 
+template <typename... Iterators>
+class zip_iterator_reference;
+
+
+template <typename T, typename... Ts>
+class device_tuple;
+
+
+}  // namespace detail
+}  // namespace gko
+
+
+// structured binding specializations for device_tuple, zip_iterator_reference
+namespace std {
+
+
+template <typename... Ts>
+struct tuple_size<gko::detail::device_tuple<Ts...>>
+    : integral_constant<size_t, sizeof...(Ts)> {};
+
+
+template <std::size_t I, typename... Ts>
+struct tuple_element<I, gko::detail::device_tuple<Ts...>> {
+    using type = typename tuple_element<I, tuple<Ts...>>::type;
+};
+
+
+template <typename... Iterators>
+struct tuple_size<gko::detail::zip_iterator_reference<Iterators...>>
+    : integral_constant<size_t, sizeof...(Iterators)> {};
+
+
+template <std::size_t I, typename... Iterators>
+struct tuple_element<I, gko::detail::zip_iterator_reference<Iterators...>> {
+    using type = typename iterator_traits<
+        typename tuple_element<I, tuple<Iterators...>>::type>::reference;
+};
+
+
+}  // namespace std
+
+
+namespace gko {
+
+
+/** std::get reimplementation for device_tuple. */
+template <std::size_t index, typename... Ts>
+constexpr typename std::tuple_element<index, detail::device_tuple<Ts...>>::type&
+get(detail::device_tuple<Ts...>& tuple);
+
+
+/** std::get reimplementation for const device_tuple. */
+template <std::size_t index, typename... Ts>
+constexpr const typename std::tuple_element<index,
+                                            detail::device_tuple<Ts...>>::type&
+get(const detail::device_tuple<Ts...>& tuple);
+
+
+namespace detail {
+
+
+/** simplified constexpr std::tuple reimplementation for use in device code. */
+template <typename T, typename... Ts>
+class device_tuple {
+public:
+    /** Constructs a device tuple from its elements. */
+    constexpr explicit device_tuple(T value, Ts... others)
+        : value_{value}, other_{others...}
+    {}
+
+    device_tuple() = default;
+
+    /**
+     * Copy-assigns a tuple.
+     * This is necessary to make tuples of references work, which normally cause
+     * the implicit copy-assignment operator to be deleted.
+     */
+    constexpr device_tuple& operator=(const device_tuple& other)
+    {
+        value_ = other.value_;
+        other_ = other.other_;
+        return *this;
+    }
+
+    /** @return the index-th element in the tuple. */
+    template <std::size_t index>
+    constexpr typename std::tuple_element<index, device_tuple>::type& get()
+    {
+        if constexpr (index == 0) {
+            return value_;
+        } else {
+            return other_.template get<index - 1>();
+        }
+    }
+
+    /** @return the index-th element in the const tuple. */
+    template <std::size_t index>
+    constexpr const typename std::tuple_element<index, device_tuple>::type&
+    get() const
+    {
+        if constexpr (index == 0) {
+            return value_;
+        } else {
+            return other_.template get<index - 1>();
+        }
+    }
+
+    // comparison operators
+    constexpr friend bool operator<(const device_tuple& lhs,
+                                    const device_tuple& rhs)
+    {
+        return lhs.value_ < rhs.value_ ||
+               (lhs.value_ == rhs.value_ && lhs.other_ < rhs.other_);
+    }
+
+    constexpr friend bool operator>(const device_tuple& lhs,
+                                    const device_tuple& rhs)
+    {
+        return rhs < lhs;
+    }
+
+    constexpr friend bool operator>=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs < rhs);
+    }
+
+    constexpr friend bool operator<=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs > rhs);
+    }
+
+    constexpr friend bool operator==(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return lhs.value_ == rhs.value_ && lhs.other_ == rhs.other_;
+    }
+
+    constexpr friend bool operator!=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+private:
+    T value_;
+    device_tuple<Ts...> other_;
+};
+
+
+template <typename T>
+class device_tuple<T> {
+public:
+    /** Constructs a device tuple from its elements. */
+    constexpr explicit device_tuple(T value) : value_{value} {}
+
+    device_tuple() = default;
+
+    /**
+     * Copy-assigns a tuple.
+     * This is necessary to make tuples of references work, which normally cause
+     * the implicit copy-assignment operator to be deleted.
+     */
+    constexpr device_tuple& operator=(const device_tuple& other)
+    {
+        value_ = other.value_;
+        return *this;
+    }
+
+    /** @return the index-th element in the tuple. */
+    template <std::size_t index>
+    constexpr T& get()
+    {
+        static_assert(index == 0, "invalid index");
+        return value_;
+    }
+
+    /** @return the index-th element in the const tuple. */
+    template <std::size_t index>
+    constexpr const T& get() const
+    {
+        static_assert(index == 0, "invalid index");
+        return value_;
+    }
+
+    // comparison operators
+    constexpr friend bool operator<(const device_tuple& lhs,
+                                    const device_tuple& rhs)
+    {
+        return lhs.value_ < rhs.value_;
+    }
+
+    constexpr friend bool operator>(const device_tuple& lhs,
+                                    const device_tuple& rhs)
+    {
+        return rhs < lhs;
+    }
+
+    constexpr friend bool operator>=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs < rhs);
+    }
+
+    constexpr friend bool operator<=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs > rhs);
+    }
+
+    constexpr friend bool operator==(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return lhs.value_ == rhs.value_;
+    }
+
+    constexpr friend bool operator!=(const device_tuple& lhs,
+                                     const device_tuple& rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+private:
+    T value_;
+};
+
+
 /**
  * A reference-like type pointing to a tuple of elements originating from a
  * tuple of iterators. A few caveats related to its use:
@@ -45,45 +273,51 @@ class zip_iterator;
  */
 template <typename... Iterators>
 class zip_iterator_reference
-    : public std::tuple<
+    : public device_tuple<
           typename std::iterator_traits<Iterators>::reference...> {
     using ref_tuple_type =
-        std::tuple<typename std::iterator_traits<Iterators>::reference...>;
+        device_tuple<typename std::iterator_traits<Iterators>::reference...>;
     using value_type =
-        std::tuple<typename std::iterator_traits<Iterators>::value_type...>;
+        device_tuple<typename std::iterator_traits<Iterators>::value_type...>;
     using index_sequence = std::index_sequence_for<Iterators...>;
 
     friend class zip_iterator<Iterators...>;
 
     template <std::size_t... idxs>
-    value_type cast_impl(std::index_sequence<idxs...>) const
+    constexpr value_type cast_impl(std::index_sequence<idxs...>) const
     {
         // gcc 5 throws error as using uninitialized array
         // std::tuple<int, char> t = { 1, '2' }; is not allowed.
         // converting to 'std::tuple<...>' from initializer list would use
         // explicit constructor
-        return value_type(std::get<idxs>(*this)...);
+        return value_type(get<idxs>(*this)...);
     }
 
     template <std::size_t... idxs>
-    void assign_impl(std::index_sequence<idxs...>, const value_type& other)
+    constexpr void assign_impl(std::index_sequence<idxs...>,
+                               const value_type& other)
     {
         (void)std::initializer_list<int>{
-            (std::get<idxs>(*this) = std::get<idxs>(other), 0)...};
+            (get<idxs>(*this) = get<idxs>(other), 0)...};
     }
 
-    zip_iterator_reference(Iterators... it) : ref_tuple_type{*it...} {}
+    constexpr explicit zip_iterator_reference(Iterators... it)
+        : ref_tuple_type{*it...}
+    {}
 
 public:
-    operator value_type() const { return cast_impl(index_sequence{}); }
+    constexpr operator value_type() const
+    {
+        return cast_impl(index_sequence{});
+    }
 
-    zip_iterator_reference& operator=(const value_type& other)
+    constexpr zip_iterator_reference& operator=(const value_type& other)
     {
         assign_impl(index_sequence{}, other);
         return *this;
     }
 
-    value_type copy() const { return *this; }
+    constexpr value_type copy() const { return *this; }
 };
 
 
@@ -123,153 +357,156 @@ class zip_iterator {
 public:
     using difference_type = std::ptrdiff_t;
     using value_type =
-        std::tuple<typename std::iterator_traits<Iterators>::value_type...>;
+        device_tuple<typename std::iterator_traits<Iterators>::value_type...>;
     using pointer = value_type*;
     using reference = zip_iterator_reference<Iterators...>;
     using iterator_category = std::random_access_iterator_tag;
     using index_sequence = std::index_sequence_for<Iterators...>;
 
-    explicit zip_iterator() = default;
+    constexpr zip_iterator() = default;
 
-    explicit zip_iterator(Iterators... its) : iterators_{its...} {}
+    constexpr explicit zip_iterator(Iterators... its) : iterators_{its...} {}
 
-    zip_iterator& operator+=(difference_type i)
+    constexpr zip_iterator& operator+=(difference_type i)
     {
         forall([i](auto& it) { it += i; });
         return *this;
     }
 
-    zip_iterator& operator-=(difference_type i)
+    constexpr zip_iterator& operator-=(difference_type i)
     {
         forall([i](auto& it) { it -= i; });
         return *this;
     }
 
-    zip_iterator& operator++()
+    constexpr zip_iterator& operator++()
     {
         forall([](auto& it) { it++; });
         return *this;
     }
 
-    zip_iterator operator++(int)
+    constexpr zip_iterator operator++(int)
     {
         auto tmp = *this;
         ++(*this);
         return tmp;
     }
 
-    zip_iterator& operator--()
+    constexpr zip_iterator& operator--()
     {
         forall([](auto& it) { it--; });
         return *this;
     }
 
-    zip_iterator operator--(int)
+    constexpr zip_iterator operator--(int)
     {
         auto tmp = *this;
         --(*this);
         return tmp;
     }
 
-    zip_iterator operator+(difference_type i) const
+    constexpr zip_iterator operator+(difference_type i) const
     {
         auto tmp = *this;
         tmp += i;
         return tmp;
     }
 
-    friend zip_iterator operator+(difference_type i, const zip_iterator& iter)
+    constexpr friend zip_iterator operator+(difference_type i,
+                                            const zip_iterator& iter)
     {
         return iter + i;
     }
 
-    zip_iterator operator-(difference_type i) const
+    constexpr zip_iterator operator-(difference_type i) const
     {
         auto tmp = *this;
         tmp -= i;
         return tmp;
     }
 
-    difference_type operator-(const zip_iterator& other) const
+    constexpr difference_type operator-(const zip_iterator& other) const
     {
         return forall_check_consistent(
             other, [](const auto& a, const auto& b) { return a - b; });
     }
 
-    reference operator*() const
+    constexpr reference operator*() const
     {
         return deref_impl(std::index_sequence_for<Iterators...>{});
     }
 
-    reference operator[](difference_type i) const { return *(*this + i); }
+    constexpr reference operator[](difference_type i) const
+    {
+        return *(*this + i);
+    }
 
-    bool operator==(const zip_iterator& other) const
+    constexpr bool operator==(const zip_iterator& other) const
     {
         return forall_check_consistent(
             other, [](const auto& a, const auto& b) { return a == b; });
     }
 
-    bool operator!=(const zip_iterator& other) const
+    constexpr bool operator!=(const zip_iterator& other) const
     {
         return !(*this == other);
     }
 
-    bool operator<(const zip_iterator& other) const
+    constexpr bool operator<(const zip_iterator& other) const
     {
         return forall_check_consistent(
             other, [](const auto& a, const auto& b) { return a < b; });
     }
 
-    bool operator<=(const zip_iterator& other) const
+    constexpr bool operator<=(const zip_iterator& other) const
     {
         return forall_check_consistent(
             other, [](const auto& a, const auto& b) { return a <= b; });
     }
 
-    bool operator>(const zip_iterator& other) const
+    constexpr bool operator>(const zip_iterator& other) const
     {
         return !(*this <= other);
     }
 
-    bool operator>=(const zip_iterator& other) const
+    constexpr bool operator>=(const zip_iterator& other) const
     {
         return !(*this < other);
     }
 
 private:
     template <std::size_t... idxs>
-    reference deref_impl(std::index_sequence<idxs...>) const
+    constexpr reference deref_impl(std::index_sequence<idxs...>) const
     {
-        return reference{std::get<idxs>(iterators_)...};
+        return reference{get<idxs>(iterators_)...};
     }
 
     template <typename Functor>
-    void forall(Functor fn)
+    constexpr void forall(Functor fn)
     {
         forall_impl(fn, index_sequence{});
     }
 
     template <typename Functor, std::size_t... idxs>
-    void forall_impl(Functor fn, std::index_sequence<idxs...>)
+    constexpr void forall_impl(Functor fn, std::index_sequence<idxs...>)
     {
-        (void)std::initializer_list<int>{
-            (fn(std::get<idxs>(iterators_)), 0)...};
+        (void)std::initializer_list<int>{(fn(get<idxs>(iterators_)), 0)...};
     }
 
     template <typename Functor, std::size_t... idxs>
-    void forall_impl(const zip_iterator& other, Functor fn,
-                     std::index_sequence<idxs...>) const
+    constexpr void forall_impl(const zip_iterator& other, Functor fn,
+                               std::index_sequence<idxs...>) const
     {
         (void)std::initializer_list<int>{
-            (fn(std::get<idxs>(iterators_), std::get<idxs>(other.iterators_)),
-             0)...};
+            (fn(get<idxs>(iterators_), get<idxs>(other.iterators_)), 0)...};
     }
 
     template <typename Functor>
-    auto forall_check_consistent(const zip_iterator& other, Functor fn) const
+    constexpr auto forall_check_consistent(const zip_iterator& other,
+                                           Functor fn) const
     {
-        auto it = std::get<0>(iterators_);
-        auto other_it = std::get<0>(other.iterators_);
+        auto it = get<0>(iterators_);
+        auto other_it = get<0>(other.iterators_);
         auto result = fn(it, other_it);
         forall_impl(
             other, [&](auto a, auto b) { assert(it - other_it == a - b); },
@@ -277,12 +514,13 @@ class zip_iterator {
         return result;
     }
 
-    std::tuple<Iterators...> iterators_;
+    device_tuple<Iterators...> iterators_;
 };
 
 
 template <typename... Iterators>
-zip_iterator<std::decay_t<Iterators>...> make_zip_iterator(Iterators&&... it)
+constexpr zip_iterator<std::decay_t<Iterators>...> make_zip_iterator(
+    Iterators&&... it)
 {
     return zip_iterator<std::decay_t<Iterators>...>{
         std::forward<Iterators>(it)...};
@@ -305,8 +543,8 @@ zip_iterator<std::decay_t<Iterators>...> make_zip_iterator(Iterators&&... it)
  * @tparam Iterators  the iterator types inside the corresponding zip_iterator
  */
 template <typename... Iterators>
-void swap(zip_iterator_reference<Iterators...> a,
-          zip_iterator_reference<Iterators...> b)
+constexpr void swap(zip_iterator_reference<Iterators...> a,
+                    zip_iterator_reference<Iterators...> b)
 {
     auto tmp = a.copy();
     a = b;
@@ -318,8 +556,8 @@ void swap(zip_iterator_reference<Iterators...> a,
  * @copydoc swap(zip_iterator_reference, zip_iterator_reference)
  */
 template <typename... Iterators>
-void swap(typename zip_iterator<Iterators...>::value_type& a,
-          zip_iterator_reference<Iterators...> b)
+constexpr void swap(typename zip_iterator<Iterators...>::value_type& a,
+                    zip_iterator_reference<Iterators...> b)
 {
     auto tmp = a;
     a = b;
@@ -331,8 +569,8 @@ void swap(typename zip_iterator<Iterators...>::value_type& a,
  * @copydoc swap(zip_iterator_reference, zip_iterator_reference)
  */
 template <typename... Iterators>
-void swap(zip_iterator_reference<Iterators...> a,
-          typename zip_iterator<Iterators...>::value_type& b)
+constexpr void swap(zip_iterator_reference<Iterators...> a,
+                    typename zip_iterator<Iterators...>::value_type& b)
 {
     auto tmp = a.copy();
     a = b;
@@ -468,6 +706,25 @@ permute_iterator<IteratorType, PermutationFn> make_permute_iterator(
 
 
 }  // namespace detail
+
+
+template <std::size_t index, typename... Ts>
+constexpr typename std::tuple_element<index, detail::device_tuple<Ts...>>::type&
+get(detail::device_tuple<Ts...>& tuple)
+{
+    return tuple.template get<index>();
+}
+
+
+template <std::size_t index, typename... Ts>
+constexpr const typename std::tuple_element<index,
+                                            detail::device_tuple<Ts...>>::type&
+get(const detail::device_tuple<Ts...>& tuple)
+{
+    return tuple.template get<index>();
+}
+
+
 }  // namespace gko
 
 
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index 42ddff343c0..c4dc30bf219 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -156,6 +156,7 @@ TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller2)
 
 TYPED_TEST(ZipIterator, IncreasingIterator)
 {
+    using gko::get;
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
     std::vector<index_type> vec1{this->reversed_index};
@@ -182,8 +183,8 @@ TYPED_TEST(ZipIterator, IncreasingIterator)
     ASSERT_TRUE(increment_pre_2 == increment_post_2);
     ASSERT_TRUE(begin == increment_post_test++);
     ASSERT_TRUE(begin + 1 == ++increment_pre_test);
-    ASSERT_TRUE(std::get<0>(*plus_2) == vec1[2]);
-    ASSERT_TRUE(std::get<1>(*plus_2) == vec2[2]);
+    ASSERT_TRUE(get<0>(*plus_2) == vec1[2]);
+    ASSERT_TRUE(get<1>(*plus_2) == vec2[2]);
     // check other comparison operators and difference
     std::vector<gko::detail::zip_iterator<index_type*, value_type*>> its{
         begin,
@@ -257,6 +258,7 @@ TYPED_TEST(ZipIterator, IncompatibleIteratorDeathTest)
 
 TYPED_TEST(ZipIterator, DecreasingIterator)
 {
+    using gko::get;
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
     std::vector<index_type> vec1{this->reversed_index};
@@ -280,13 +282,14 @@ TYPED_TEST(ZipIterator, DecreasingIterator)
     ASSERT_TRUE(decrement_pre_2 == decrement_post_2);
     ASSERT_TRUE(iter == decrement_post_test--);
     ASSERT_TRUE(iter - 1 == --decrement_pre_test);
-    ASSERT_TRUE(std::get<0>(*minus_2) == vec1[3]);
-    ASSERT_TRUE(std::get<1>(*minus_2) == vec2[3]);
+    ASSERT_TRUE(get<0>(*minus_2) == vec1[3]);
+    ASSERT_TRUE(get<1>(*minus_2) == vec2[3]);
 }
 
 
 TYPED_TEST(ZipIterator, CorrectDereferencing)
 {
+    using gko::get;
     using index_type_it = typename TestFixture::index_type;
     using value_type_it = typename TestFixture::value_type;
     std::vector<index_type_it> vec1{this->reversed_index};
@@ -299,10 +302,10 @@ TYPED_TEST(ZipIterator, CorrectDereferencing)
     auto to_test_ref = *(begin + element_to_test);
     value_type to_test_pair = to_test_ref;  // Testing implicit conversion
 
-    ASSERT_TRUE(std::get<0>(to_test_pair) == vec1[element_to_test]);
-    ASSERT_TRUE(std::get<0>(to_test_pair) == std::get<0>(to_test_ref));
-    ASSERT_TRUE(std::get<1>(to_test_pair) == vec2[element_to_test]);
-    ASSERT_TRUE(std::get<1>(to_test_pair) == std::get<1>(to_test_ref));
+    ASSERT_TRUE(get<0>(to_test_pair) == vec1[element_to_test]);
+    ASSERT_TRUE(get<0>(to_test_pair) == get<0>(to_test_ref));
+    ASSERT_TRUE(get<1>(to_test_pair) == vec2[element_to_test]);
+    ASSERT_TRUE(get<1>(to_test_pair) == get<1>(to_test_ref));
 }
 
 
diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp
index ceae3e17679..a3dfa8fdef4 100644
--- a/omp/distributed/partition_helpers_kernels.cpp
+++ b/omp/distributed/partition_helpers_kernels.cpp
@@ -27,10 +27,9 @@ void sort_by_range_start(
         range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; });
     auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d);
     // TODO: use TBB or parallel std with c++17
-    std::stable_sort(sort_it, sort_it + num_parts,
-                     [](const auto& a, const auto& b) {
-                         return std::get<0>(a) < std::get<0>(b);
-                     });
+    std::stable_sort(
+        sort_it, sort_it + num_parts,
+        [](const auto& a, const auto& b) { return get<0>(a) < get<0>(b); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 09d1465896b..8e47caef520 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -1155,9 +1155,8 @@ void sort_by_column_index(std::shared_ptr<const OmpExecutor> exec,
         auto row_nnz = row_ptrs[i + 1] - start_row_idx;
         auto it = detail::make_zip_iterator(col_idxs + start_row_idx,
                                             values + start_row_idx);
-        std::sort(it, it + row_nnz, [](auto t1, auto t2) {
-            return std::get<0>(t1) < std::get<0>(t2);
-        });
+        std::sort(it, it + row_nnz,
+                  [](auto t1, auto t2) { return get<0>(t1) < get<0>(t2); });
     }
 }
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index db60d85db79..a6342034a56 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -398,9 +398,8 @@ void sort_by_column_index_impl(
         std::vector<IndexType> col_permute(nbnz_brow);
         std::iota(col_permute.begin(), col_permute.end(), 0);
         auto it = detail::make_zip_iterator(brow_col_idxs, col_permute.data());
-        std::sort(it, it + nbnz_brow, [](auto a, auto b) {
-            return std::get<0>(a) < std::get<0>(b);
-        });
+        std::sort(it, it + nbnz_brow,
+                  [](auto a, auto b) { return get<0>(a) < get<0>(b); });
 
         std::vector<ValueType> oldvalues(nbnz_brow * bs2);
         std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin());
diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp
index 9d2aa047cc4..4c824a0140b 100644
--- a/omp/multigrid/pgm_kernels.cpp
+++ b/omp/multigrid/pgm_kernels.cpp
@@ -43,8 +43,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
 {
     auto it = detail::make_zip_iterator(row_idxs, col_idxs, vals);
     std::stable_sort(it, it + nnz, [](auto a, auto b) {
-        return std::tie(std::get<0>(a), std::get<1>(a)) <
-               std::tie(std::get<0>(b), std::get<1>(b));
+        return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b));
     });
 }
 
diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp
index b57daab2eaa..0307974f278 100644
--- a/reference/distributed/partition_helpers_kernels.cpp
+++ b/reference/distributed/partition_helpers_kernels.cpp
@@ -26,10 +26,9 @@ void sort_by_range_start(
     auto end_it = detail::make_permute_iterator(
         range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; });
     auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d);
-    std::stable_sort(sort_it, sort_it + num_parts,
-                     [](const auto& a, const auto& b) {
-                         return std::get<0>(a) < std::get<0>(b);
-                     });
+    std::stable_sort(
+        sort_it, sort_it + num_parts,
+        [](const auto& a, const auto& b) { return get<0>(a) < get<0>(b); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
@@ -51,9 +50,9 @@ void check_consecutive_ranges(std::shared_ptr<const DefaultExecutor> exec,
     auto range_it = detail::make_zip_iterator(start_it, end_it);
 
     if (num_parts) {
-        result = std::all_of(
-            range_it, range_it + num_parts - 1,
-            [](const auto& r) { return std::get<0>(r) == std::get<1>(r); });
+        result =
+            std::all_of(range_it, range_it + num_parts - 1,
+                        [](const auto& r) { return get<0>(r) == get<1>(r); });
     } else {
         result = true;
     }
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index f7e2fab4411..be97da442a1 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -1128,9 +1128,8 @@ void sort_by_column_index(std::shared_ptr<const ReferenceExecutor> exec,
         auto row_nnz = row_ptrs[i + 1] - start_row_idx;
         auto it = detail::make_zip_iterator(col_idxs + start_row_idx,
                                             values + start_row_idx);
-        std::sort(it, it + row_nnz, [](auto t1, auto t2) {
-            return std::get<0>(t1) < std::get<0>(t2);
-        });
+        std::sort(it, it + row_nnz,
+                  [](auto t1, auto t2) { return get<0>(t1) < get<0>(t2); });
     }
 }
 
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 9e60e380d9c..cdedc36ddc0 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -418,9 +418,8 @@ void sort_by_column_index_impl(
         std::vector<IndexType> col_permute(nbnz_brow);
         std::iota(col_permute.begin(), col_permute.end(), 0);
         auto it = detail::make_zip_iterator(brow_col_idxs, col_permute.data());
-        std::sort(it, it + nbnz_brow, [](auto a, auto b) {
-            return std::get<0>(a) < std::get<0>(b);
-        });
+        std::sort(it, it + nbnz_brow,
+                  [](auto a, auto b) { return get<0>(a) < get<0>(b); });
 
         std::vector<ValueType> oldvalues(nbnz_brow * bs2);
         std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin());
diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp
index 2a6e3252a9f..bff2a776c6b 100644
--- a/reference/multigrid/pgm_kernels.cpp
+++ b/reference/multigrid/pgm_kernels.cpp
@@ -270,8 +270,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
 {
     auto it = detail::make_zip_iterator(row_idxs, col_idxs, vals);
     std::stable_sort(it, it + nnz, [](auto a, auto b) {
-        return std::tie(std::get<0>(a), std::get<1>(a)) <
-               std::tie(std::get<0>(b), std::get<1>(b));
+        return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b));
     });
 }
 
diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt
index d54996f212a..5f31c25db19 100644
--- a/test/base/CMakeLists.txt
+++ b/test/base/CMakeLists.txt
@@ -1,6 +1,7 @@
 ginkgo_create_common_test(batch_multi_vector_kernels)
 ginkgo_create_common_and_reference_test(device_matrix_data_kernels)
 ginkgo_create_common_device_test(index_range)
+ginkgo_create_common_device_test(iterator_factory)
 ginkgo_create_common_device_test(kernel_launch_generic)
 ginkgo_create_common_and_reference_test(executor)
 ginkgo_create_common_and_reference_test(timer)
diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp
new file mode 100644
index 00000000000..5dc97646960
--- /dev/null
+++ b/test/base/iterator_factory.cpp
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/base/iterator_factory.hpp"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/array.hpp>
+
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/test/utils.hpp"
+#include "test/utils/executor.hpp"
+
+
+class IteratorFactory : public CommonTestFixture {
+public:
+    IteratorFactory()
+        : key_array{exec, {6, 2, 3, 8, 1, 0, 2}},
+          value_array{exec, {9, 5, 7, 2, 4, 7, 2}},
+          expected_key_array{ref, {7, 1, 2, 2, 3, 6, 8}},
+          expected_value_array{ref, {7, 4, 2, 5, 7, 9, 2}}
+    {}
+
+    gko::array<int> key_array;
+    gko::array<int> value_array;
+    gko::array<int> expected_key_array;
+    gko::array<int> expected_value_array;
+};
+
+
+// nvcc doesn't like device lambdas declared in complex classes, move it out
+void run_zip_iterator(std::shared_ptr<gko::EXEC_TYPE> exec,
+                      gko::array<int>& key_array, gko::array<int>& value_array)
+{
+    gko::kernels::EXEC_NAMESPACE::run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto keys, auto values, auto size) {
+            auto begin = gko::detail::make_zip_iterator(keys, values);
+            auto end = begin + size;
+            using std::swap;
+            for (auto it = begin; it != end; ++it) {
+                auto min_it = it;
+                for (auto it2 = it; it2 != end; ++it2) {
+                    if (*it2 < *min_it) {
+                        min_it = it2;
+                    }
+                }
+                swap(*it, *min_it);
+            }
+            // check structured bindings
+            auto [key, value] = *begin;
+            static_assert(std::is_same<typeof(key), int>::value,
+                          "incorrect type");
+            gko::get<0>(*begin) = value;
+        },
+        1, key_array, value_array, static_cast<int>(key_array.get_size()));
+}
+
+
+TEST_F(IteratorFactory, KernelRunsZipIterator)
+{
+    run_zip_iterator(exec, key_array, value_array);
+
+    GKO_ASSERT_ARRAY_EQ(key_array, expected_key_array);
+    GKO_ASSERT_ARRAY_EQ(value_array, expected_value_array);
+}

From b617d9e10d93fcfaf849d9b936ca03e3c11efeb2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 10 Jul 2024 13:32:44 +0200
Subject: [PATCH 083/448] fix issues after rebase

---
 omp/distributed/index_map_kernels.cpp | 13 ++++++-------
 test/base/iterator_factory.cpp        |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/omp/distributed/index_map_kernels.cpp b/omp/distributed/index_map_kernels.cpp
index b01dab9cb33..7374f7b978b 100644
--- a/omp/distributed/index_map_kernels.cpp
+++ b/omp/distributed/index_map_kernels.cpp
@@ -58,16 +58,15 @@ void build_mapping(
     auto sort_it = detail::make_zip_iterator(
         full_remote_part_ids.begin(), recv_connections_ptr, range_ids.begin());
     std::sort(sort_it, sort_it + input_size, [](const auto& a, const auto& b) {
-        return std::tie(std::get<0>(a), std::get<1>(a)) <
-               std::tie(std::get<0>(b), std::get<1>(b));
+        return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b));
     });
 
     // get only unique connections
-    auto unique_end = std::unique(
-        sort_it, sort_it + input_size, [](const auto& a, const auto& b) {
-            return std::tie(std::get<0>(a), std::get<1>(a)) ==
-                   std::tie(std::get<0>(b), std::get<1>(b));
-        });
+    auto unique_end = std::unique(sort_it, sort_it + input_size,
+                                  [](const auto& a, const auto& b) {
+                                      return std::tie(get<0>(a), get<1>(a)) ==
+                                             std::tie(get<0>(b), get<1>(b));
+                                  });
     auto unique_size = std::distance(sort_it, unique_end);
 
     remote_global_idxs.resize_and_reset(unique_size);
diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp
index 5dc97646960..be51a2df32c 100644
--- a/test/base/iterator_factory.cpp
+++ b/test/base/iterator_factory.cpp
@@ -12,7 +12,7 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class IteratorFactory : public CommonTestFixture {
@@ -35,7 +35,7 @@ class IteratorFactory : public CommonTestFixture {
 void run_zip_iterator(std::shared_ptr<gko::EXEC_TYPE> exec,
                       gko::array<int>& key_array, gko::array<int>& value_array)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto keys, auto values, auto size) {
             auto begin = gko::detail::make_zip_iterator(keys, values);

From ee8bb10aff292b56e9abe1203c50c0e8e96a81a1 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 11 Jul 2024 13:31:16 +0200
Subject: [PATCH 084/448] fixes for MSVC and nvc++

- MSVC finds the get(...) member function, so we need to call the free function explicitly
- the structured bindings refer to a reference
---
 core/base/iterator_factory.hpp | 8 +++++---
 test/base/iterator_factory.cpp | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index 938d705b04d..de5af49e24f 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -13,6 +13,8 @@
 #include <tuple>
 #include <utility>
 
+#include <ginkgo/core/base/types.hpp>
+
 #include "core/base/copy_assignable.hpp"
 
 
@@ -290,7 +292,7 @@ class zip_iterator_reference
         // std::tuple<int, char> t = { 1, '2' }; is not allowed.
         // converting to 'std::tuple<...>' from initializer list would use
         // explicit constructor
-        return value_type(get<idxs>(*this)...);
+        return value_type(gko::get<idxs>(*this)...);
     }
 
     template <std::size_t... idxs>
@@ -298,7 +300,7 @@ class zip_iterator_reference
                                const value_type& other)
     {
         (void)std::initializer_list<int>{
-            (get<idxs>(*this) = get<idxs>(other), 0)...};
+            (gko::get<idxs>(*this) = gko::get<idxs>(other), 0)...};
     }
 
     constexpr explicit zip_iterator_reference(Iterators... it)
@@ -509,7 +511,7 @@ class zip_iterator {
         auto other_it = get<0>(other.iterators_);
         auto result = fn(it, other_it);
         forall_impl(
-            other, [&](auto a, auto b) { assert(it - other_it == a - b); },
+            other, [&](auto a, auto b) { GKO_ASSERT(it - other_it == a - b); },
             index_sequence{});
         return result;
     }
diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp
index be51a2df32c..5826118fd81 100644
--- a/test/base/iterator_factory.cpp
+++ b/test/base/iterator_factory.cpp
@@ -52,7 +52,8 @@ void run_zip_iterator(std::shared_ptr<gko::EXEC_TYPE> exec,
             }
             // check structured bindings
             auto [key, value] = *begin;
-            static_assert(std::is_same<typeof(key), int>::value,
+            static_assert(std::is_same<std::remove_reference_t<decltype(key)>,
+                                       int>::value,
                           "incorrect type");
             gko::get<0>(*begin) = value;
         },

From e96bb188ed669b278e5eb9c9f9813114a197db50 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Jul 2024 13:52:47 +0200
Subject: [PATCH 085/448] unify IC and ILU

---
 common/cuda_hip/CMakeLists.txt                |  2 +
 .../cuda_hip/factorization/ic_kernels.cpp     | 12 ++--
 .../cuda_hip/factorization/ilu_kernels.cpp    | 12 ++--
 cuda/CMakeLists.txt                           |  2 -
 hip/CMakeLists.txt                            |  2 -
 hip/factorization/ic_kernels.hip.cpp          | 63 ------------------
 hip/factorization/ilu_kernels.hip.cpp         | 64 -------------------
 7 files changed, 10 insertions(+), 147 deletions(-)
 rename cuda/factorization/ic_kernels.cu => common/cuda_hip/factorization/ic_kernels.cpp (91%)
 rename cuda/factorization/ilu_kernels.cu => common/cuda_hip/factorization/ilu_kernels.cpp (91%)
 delete mode 100644 hip/factorization/ic_kernels.hip.cpp
 delete mode 100644 hip/factorization/ilu_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index c18755ab164..4ae7c462b27 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -9,6 +9,8 @@ set(CUDA_HIP_SOURCES
     distributed/vector_kernels.cpp
     factorization/cholesky_kernels.cpp
     factorization/factorization_kernels.cpp
+    factorization/ic_kernels.cpp
+    factorization/ilu_kernels.cpp
     factorization/lu_kernels.cpp
     factorization/par_ic_kernels.cpp
     factorization/par_ilu_kernels.cpp
diff --git a/cuda/factorization/ic_kernels.cu b/common/cuda_hip/factorization/ic_kernels.cpp
similarity index 91%
rename from cuda/factorization/ic_kernels.cu
rename to common/cuda_hip/factorization/ic_kernels.cpp
index 3a4b4a55411..62963c479bd 100644
--- a/cuda/factorization/ic_kernels.cu
+++ b/common/cuda_hip/factorization/ic_kernels.cpp
@@ -6,17 +6,13 @@
 
 #include <ginkgo/core/base/array.hpp>
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
-/**
- * @brief The ic factorization namespace.
- *
- * @ingroup factor
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace ic_factorization {
 
 
@@ -50,7 +46,7 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
-#if (CUDA_VERSION >= 11040)
+#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
@@ -62,6 +58,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
 
 
 }  // namespace ic_factorization
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/factorization/ilu_kernels.cu b/common/cuda_hip/factorization/ilu_kernels.cpp
similarity index 91%
rename from cuda/factorization/ilu_kernels.cu
rename to common/cuda_hip/factorization/ilu_kernels.cpp
index 6096e89ef4b..b3f959bba02 100644
--- a/cuda/factorization/ilu_kernels.cu
+++ b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -6,17 +6,13 @@
 
 #include <ginkgo/core/base/array.hpp>
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
-/**
- * @brief The ilu factorization namespace.
- *
- * @ingroup factor
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace ilu_factorization {
 
 
@@ -50,7 +46,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                     SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
-#if (CUDA_VERSION >= 11040)
+#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
@@ -63,6 +59,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace ilu_factorization
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 30b3f2747e6..89c711965e1 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -18,8 +18,6 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    factorization/ic_kernels.cu
-    factorization/ilu_kernels.cu
     factorization/par_ict_kernels.cu
     factorization/par_ilut_approx_filter_kernels.cu
     factorization/par_ilut_filter_kernels.cu
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 23584c2742a..32e3767f93c 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -16,8 +16,6 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    factorization/ic_kernels.hip.cpp
-    factorization/ilu_kernels.hip.cpp
     factorization/par_ict_kernels.hip.cpp
     factorization/par_ilut_approx_filter_kernels.hip.cpp
     factorization/par_ilut_filter_kernels.hip.cpp
diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp
deleted file mode 100644
index cfbb12bd5b3..00000000000
--- a/hip/factorization/ic_kernels.hip.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/ic_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The ic factorization namespace.
- *
- * @ingroup factor
- */
-namespace ic_factorization {
-
-
-template <typename ValueType, typename IndexType>
-void compute(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Csr<ValueType, IndexType>* m)
-{
-    const auto id = exec->get_device_id();
-    auto handle = exec->get_sparselib_handle();
-    auto desc = sparselib::create_mat_descr();
-    auto info = sparselib::create_ic0_info();
-
-    // get buffer size for IC
-    IndexType num_rows = m->get_size()[0];
-    IndexType nnz = m->get_num_stored_elements();
-    size_type buffer_size{};
-    sparselib::ic0_buffer_size(handle, num_rows, nnz, desc,
-                               m->get_const_values(), m->get_const_row_ptrs(),
-                               m->get_const_col_idxs(), info, buffer_size);
-
-    array<char> buffer{exec, buffer_size};
-
-    // set up IC(0)
-    sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                            m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                            info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
-                            buffer.get_data());
-
-    sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(),
-                   m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                   SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
-
-    sparselib::destroy_ic0_info(info);
-    sparselib::destroy(desc);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
-
-
-}  // namespace ic_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp
deleted file mode 100644
index 45d468d0500..00000000000
--- a/hip/factorization/ilu_kernels.hip.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/ilu_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The ilu factorization namespace.
- *
- * @ingroup factor
- */
-namespace ilu_factorization {
-
-
-template <typename ValueType, typename IndexType>
-void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
-                matrix::Csr<ValueType, IndexType>* m)
-{
-    const auto id = exec->get_device_id();
-    auto handle = exec->get_sparselib_handle();
-    auto desc = sparselib::create_mat_descr();
-    auto info = sparselib::create_ilu0_info();
-
-    // get buffer size for ILU
-    IndexType num_rows = m->get_size()[0];
-    IndexType nnz = m->get_num_stored_elements();
-    size_type buffer_size{};
-    sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc,
-                                m->get_const_values(), m->get_const_row_ptrs(),
-                                m->get_const_col_idxs(), info, buffer_size);
-
-    array<char> buffer{exec, buffer_size};
-
-    // set up ILU(0)
-    sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                             m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                             info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
-                             buffer.get_data());
-
-    sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(),
-                    m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
-
-    sparselib::destroy_ilu0_info(info);
-    sparselib::destroy(desc);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
-
-
-}  // namespace ilu_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From e268e29fbcd49062727efbaa79f88aa4f89d0af2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Jul 2024 14:16:31 +0200
Subject: [PATCH 086/448] unify most of ParILUT/ICT

---
 common/cuda_hip/CMakeLists.txt                |   3 +
 ...ct_kernels.hpp.inc => par_ict_kernels.cpp} | 182 +++++++++++++++++
 ...ls.hpp.inc => par_ilut_spgeam_kernels.cpp} | 151 ++++++++++++++
 .../factorization/par_ilut_sweep_kernels.cpp  |  97 ++++++++-
 .../par_ilut_sweep_kernels.hpp.inc            |  94 ---------
 cuda/CMakeLists.txt                           |   3 -
 cuda/factorization/par_ict_kernels.cu         | 187 ------------------
 cuda/factorization/par_ilut_spgeam_kernels.cu | 156 ---------------
 hip/CMakeLists.txt                            |   3 -
 hip/factorization/par_ict_kernels.hip.cpp     | 187 ------------------
 .../par_ilut_spgeam_kernels.hip.cpp           | 156 ---------------
 .../par_ilut_sweep_kernels.hip.cpp            | 120 -----------
 12 files changed, 429 insertions(+), 910 deletions(-)
 rename common/cuda_hip/factorization/{par_ict_kernels.hpp.inc => par_ict_kernels.cpp} (62%)
 rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.hpp.inc => par_ilut_spgeam_kernels.cpp} (63%)
 rename cuda/factorization/par_ilut_sweep_kernels.cu => common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp (54%)
 delete mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
 delete mode 100644 cuda/factorization/par_ict_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu
 delete mode 100644 hip/factorization/par_ict_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 4ae7c462b27..c4a56482b1d 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -13,7 +13,10 @@ set(CUDA_HIP_SOURCES
     factorization/ilu_kernels.cpp
     factorization/lu_kernels.cpp
     factorization/par_ic_kernels.cpp
+    factorization/par_ict_kernels.cpp
     factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_spgeam_kernels.cpp
+    factorization/par_ilut_sweep_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp
diff --git a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.cpp
similarity index 62%
rename from common/cuda_hip/factorization/par_ict_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ict_kernels.cpp
index 87aa8297345..94aa5e5124e 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ict_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ict_kernels.hpp"
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (add_candidates, compute_factor)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
 namespace kernel {
 
 
@@ -275,3 +318,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
 
 
 }  // namespace kernel
+
+
+namespace {
+
+// Sizes and fills l_new with the ict_tri_spgeam kernels for one subwarp_size.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llh_row_ptrs = llh->get_const_row_ptrs();
+    auto llh_col_idxs = llh->get_const_col_idxs();
+    auto llh_vals = llh->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // count non-zeros per row into l_new_row_ptrs (no launch for zero rows)
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs: prefix sum over the num_rows + 1 entries
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays to the nnz total read back from the last row ptr
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    // fetch pointers after the resize (resize_and_reset may reallocate)
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // fill columns and values of l_new (again skipped for zero rows)
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
+                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
+                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
+                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
+    }
+}
+
+// provides select_add_candidates, called by the public add_candidates below
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+// Launches ict_sweep on a grid providing a subwarp per stored element of l.
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;  // subwarps per block
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {
+        kernel::ict_sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),
+                static_cast<IndexType>(l->get_num_stored_elements()));
+    }
+}
+
+// provides select_compute_factor, called by the public compute_factor below
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+// Dispatches by average nnz per row of llh + a (taken as 0 for empty matrices).
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llh->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+// Dispatches by 2 * nnz(l) / num_rows (taken as 0 for empty matrices).
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_compute_factor(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
similarity index 63%
rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
index a97f0f08937..6cc77660394 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
@@ -2,6 +2,47 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
 namespace kernel {
 
 
@@ -246,3 +287,113 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
 
 
 }  // namespace kernel
+
+
+namespace {
+
+// Sizes and fills l_new and u_new with the tri_spgeam kernels (one subwarp_size).
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    if (num_blocks > 0) {
+        // count non-zeros per row into l_new_row_ptrs and u_new_row_ptrs
+        kernel::tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, u_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs: prefix sums over the num_rows + 1 entries of each array
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays to the nnz totals read back from the last row ptrs
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+    // fetch pointers after the resize (resize_and_reset may reallocate)
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    if (num_blocks > 0) {
+        // fill columns and values of l_new and u_new
+        kernel::tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
+                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
+                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
+                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
+                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
+                as_device_type(u_new_vals), num_rows);
+    }
+}
+
+// provides select_add_candidates, called by the public add_candidates below
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+// Dispatches by average nnz per row of lu + a (taken as 0 for empty matrices).
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        lu->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
similarity index 54%
rename from cuda/factorization/par_ilut_sweep_kernels.cu
rename to common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
index 9e277549aa4..52f62b50e6a 100644
--- a/cuda/factorization/par_ilut_sweep_kernels.cu
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
@@ -27,7 +27,7 @@
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -44,7 +44,96 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
+namespace kernel {
+
+// Each subwarp updates one stored entry: ids [0, l_nnz) in L, the rest in U.
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void sweep(
+    const IndexType* __restrict__ a_row_ptrs,
+    const IndexType* __restrict__ a_col_idxs,
+    const ValueType* __restrict__ a_vals,
+    const IndexType* __restrict__ l_row_ptrs,
+    const IndexType* __restrict__ l_row_idxs,
+    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
+    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
+    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
+    const IndexType* __restrict__ ut_col_ptrs,
+    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
+    IndexType u_nnz)
+{
+    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (tidx >= l_nnz + u_nnz) {
+        return;
+    }
+    // subwarp ids below l_nnz work on L, the remaining u_nnz ids work on U
+    auto l_nz = tidx;
+    auto u_nz = l_nz - l_nnz;
+    auto lower = u_nz < 0;  // needs a signed IndexType
+    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
+    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
+    if (lower && row == col) {
+        // don't update the diagonal twice
+        return;
+    }
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto ut_col_begin = ut_col_ptrs[col];
+    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
+    ValueType sum{};
+    IndexType ut_nz{};
+    auto last_entry = min(row, col);
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
+        ut_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
+            IndexType ut_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == ut_row && l_col < last_entry) {
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
+            }
+            // remember the position of entry (row, col) inside ut
+            auto found_transp = subwarp.ballot(ut_row == row);
+            if (found_transp) {
+                ut_nz =
+                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate the partial sums of all threads of the subwarp
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+    // thread 0 of the subwarp writes the result unless it is non-finite
+    if (subwarp.thread_rank() == 0) {
+        if (lower) {
+            auto to_write = (a_val - sum) /
+                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
+            if (is_finite(to_write)) {
+                store_relaxed(l_vals + l_nz, to_write);
+            }
+        } else {
+            auto to_write = a_val - sum;
+            if (is_finite(to_write)) {
+                store_relaxed(u_vals + u_nz, to_write);
+                store_relaxed(ut_vals + ut_nz, to_write);
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
 
 
 namespace {
@@ -115,6 +204,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
deleted file mode 100644
index 9da94a878b3..00000000000
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void sweep(
-    const IndexType* __restrict__ a_row_ptrs,
-    const IndexType* __restrict__ a_col_idxs,
-    const ValueType* __restrict__ a_vals,
-    const IndexType* __restrict__ l_row_ptrs,
-    const IndexType* __restrict__ l_row_idxs,
-    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
-    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
-    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
-    const IndexType* __restrict__ ut_col_ptrs,
-    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
-    IndexType u_nnz)
-{
-    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
-    if (tidx >= l_nnz + u_nnz) {
-        return;
-    }
-    // split the subwarps into two halves for lower and upper triangle
-    auto l_nz = tidx;
-    auto u_nz = l_nz - l_nnz;
-    auto lower = u_nz < 0;
-    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
-    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
-    if (lower && row == col) {
-        // don't update the diagonal twice
-        return;
-    }
-    auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    // find entry of A at (row, col)
-    auto a_row_begin = a_row_ptrs[row];
-    auto a_row_end = a_row_ptrs[row + 1];
-    auto a_row_size = a_row_end - a_row_begin;
-    auto a_idx =
-        group_wide_search(a_row_begin, a_row_size, subwarp,
-                          [&](IndexType i) { return a_col_idxs[i] >= col; });
-    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
-    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
-    auto l_row_begin = l_row_ptrs[row];
-    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
-    auto ut_col_begin = ut_col_ptrs[col];
-    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
-    ValueType sum{};
-    IndexType ut_nz{};
-    auto last_entry = min(row, col);
-    group_merge<subwarp_size>(
-        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
-        ut_col_size, subwarp,
-        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
-            IndexType ut_row, IndexType, bool) {
-            // we don't need to use the `bool valid` because last_entry is
-            // already a smaller sentinel value than the one used in group_merge
-            if (l_col == ut_row && l_col < last_entry) {
-                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
-                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
-            }
-            // remember the transposed element
-            auto found_transp = subwarp.ballot(ut_row == row);
-            if (found_transp) {
-                ut_nz =
-                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
-            }
-            return true;
-        });
-    // accumulate result from all threads
-    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
-
-    if (subwarp.thread_rank() == 0) {
-        if (lower) {
-            auto to_write = (a_val - sum) /
-                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
-            if (is_finite(to_write)) {
-                store_relaxed(l_vals + l_nz, to_write);
-            }
-        } else {
-            auto to_write = a_val - sum;
-            if (is_finite(to_write)) {
-                store_relaxed(u_vals + u_nz, to_write);
-                store_relaxed(ut_vals + ut_nz, to_write);
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 89c711965e1..ae506faed4b 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -18,13 +18,10 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    factorization/par_ict_kernels.cu
     factorization/par_ilut_approx_filter_kernels.cu
     factorization/par_ilut_filter_kernels.cu
     factorization/par_ilut_select_common.cu
     factorization/par_ilut_select_kernels.cu
-    factorization/par_ilut_spgeam_kernels.cu
-    factorization/par_ilut_sweep_kernels.cu
     matrix/batch_csr_kernels.cu
     matrix/batch_dense_kernels.cu
     matrix/batch_ell_kernels.cu
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
deleted file mode 100644
index 62964925aa4..00000000000
--- a/cuda/factorization/par_ict_kernels.cu
+++ /dev/null
@@ -1,187 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ict_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ICT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ict_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    auto llh_row_ptrs = llh->get_const_row_ptrs();
-    auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    // count non-zeros per row
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-
-    // fill columns and values
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
-                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
-                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
-                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_factor(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        llh->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = 2 * l->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_factor(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
-
-
-}  // namespace par_ict_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
deleted file mode 100644
index 7277093314a..00000000000
--- a/cuda/factorization/par_ilut_spgeam_kernels.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for add_candidates kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
-    auto lu_row_ptrs = lu->get_const_row_ptrs();
-    auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto u_row_ptrs = u->get_const_row_ptrs();
-    auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    auto u_new_row_ptrs = u_new->get_row_ptrs();
-    if (num_blocks > 0) {
-        // count non-zeros per row
-        kernel::tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, u_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
-    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-    auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
-
-    if (num_blocks > 0) {
-        // fill columns and values
-        kernel::tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
-                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
-                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
-                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
-                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
-                as_device_type(u_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        lu->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
-        u_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 32e3767f93c..0bfe56d7db1 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -16,13 +16,10 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    factorization/par_ict_kernels.hip.cpp
     factorization/par_ilut_approx_filter_kernels.hip.cpp
     factorization/par_ilut_filter_kernels.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
     factorization/par_ilut_select_kernels.hip.cpp
-    factorization/par_ilut_spgeam_kernels.hip.cpp
-    factorization/par_ilut_sweep_kernels.hip.cpp
     matrix/batch_csr_kernels.hip.cpp
     matrix/batch_dense_kernels.hip.cpp
     matrix/batch_ell_kernels.hip.cpp
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
deleted file mode 100644
index ed7b104471b..00000000000
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ict_kernels.hpp"
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ICT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ict_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    auto llh_row_ptrs = llh->get_const_row_ptrs();
-    auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    // count non-zeros per row
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-
-    // fill columns and values
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
-                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
-                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
-                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_factor(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        llh->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = 2 * l->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_factor(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
-
-
-}  // namespace par_ict_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
deleted file mode 100644
index 5757e00d2a3..00000000000
--- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for add_candidates kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
-    auto lu_row_ptrs = lu->get_const_row_ptrs();
-    auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto u_row_ptrs = u->get_const_row_ptrs();
-    auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    auto u_new_row_ptrs = u_new->get_row_ptrs();
-    if (num_blocks > 0) {
-        // count non-zeros per row
-        kernel::tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, u_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
-    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-    auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
-
-    if (num_blocks > 0) {
-        // fill columns and values
-        kernel::tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
-                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
-                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
-                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
-                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
-                as_device_type(u_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        lu->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
-        u_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
deleted file mode 100644
index de271d6eebd..00000000000
--- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_l_u_factors(syn::value_list<int, subwarp_size>,
-                         std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
-                                            u->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()),
-                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
-                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
-                u_csc->get_const_col_idxs(),
-                as_device_type(u_csc->get_values()),
-                static_cast<IndexType>(u->get_num_stored_elements()));
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
-                                    compute_l_u_factors);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        l->get_num_stored_elements() + u->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_l_u_factors(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
-        u_csc);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From fd62a02bc753462c662e54a860bc056f5a3900c4 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Jul 2024 17:30:46 +0200
Subject: [PATCH 087/448] unify ParILUT/ICT

---
 common/cuda_hip/CMakeLists.txt                |   4 +
 .../par_ilut_approx_filter_kernels.cpp        |  12 +-
 .../factorization/par_ilut_config.hpp         |  29 +++
 .../factorization/par_ilut_filter_kernels.cpp |  12 +-
 ...ls.hpp.inc => par_ilut_filter_kernels.hpp} |  26 +++
 .../factorization/par_ilut_select_common.cpp  |  10 +-
 .../factorization/par_ilut_select_common.hpp  |  11 +-
 .../factorization/par_ilut_select_kernels.cpp |  11 +-
 ...ls.hpp.inc => par_ilut_select_kernels.hpp} |  24 +++
 cuda/CMakeLists.txt                           |   4 -
 .../par_ilut_approx_filter_kernels.cu         | 179 ------------------
 cuda/factorization/par_ilut_filter_kernels.cu | 137 --------------
 cuda/factorization/par_ilut_select_common.cu  |  95 ----------
 hip/CMakeLists.txt                            |   4 -
 .../par_ilut_select_common.hip.hpp            |  51 -----
 .../par_ilut_select_kernels.hip.cpp           | 158 ----------------
 16 files changed, 106 insertions(+), 661 deletions(-)
 rename hip/factorization/par_ilut_approx_filter_kernels.hip.cpp => common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp (97%)
 create mode 100644 common/cuda_hip/factorization/par_ilut_config.hpp
 rename hip/factorization/par_ilut_filter_kernels.hip.cpp => common/cuda_hip/factorization/par_ilut_filter_kernels.cpp (98%)
 rename common/cuda_hip/factorization/{par_ilut_filter_kernels.hpp.inc => par_ilut_filter_kernels.hpp} (85%)
 rename hip/factorization/par_ilut_select_common.hip.cpp => common/cuda_hip/factorization/par_ilut_select_common.cpp (96%)
 rename cuda/factorization/par_ilut_select_common.cuh => common/cuda_hip/factorization/par_ilut_select_common.hpp (79%)
 rename cuda/factorization/par_ilut_select_kernels.cu => common/cuda_hip/factorization/par_ilut_select_kernels.cpp (97%)
 rename common/cuda_hip/factorization/{par_ilut_select_kernels.hpp.inc => par_ilut_select_kernels.hpp} (91%)
 delete mode 100644 cuda/factorization/par_ilut_approx_filter_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_filter_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_select_common.cu
 delete mode 100644 hip/factorization/par_ilut_select_common.hip.hpp
 delete mode 100644 hip/factorization/par_ilut_select_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index c4a56482b1d..a333ea9569c 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -15,6 +15,10 @@ set(CUDA_HIP_SOURCES
     factorization/par_ic_kernels.cpp
     factorization/par_ict_kernels.cpp
     factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_approx_filter_kernels.cpp
+    factorization/par_ilut_filter_kernels.cpp
+    factorization/par_ilut_select_common.cpp
+    factorization/par_ilut_select_kernels.cpp
     factorization/par_ilut_spgeam_kernels.cpp
     factorization/par_ilut_sweep_kernels.cpp
     matrix/coo_kernels.cpp
diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
similarity index 97%
rename from hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
rename to common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
index 31482cd4034..12d8da9e4f5 100644
--- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
+++ b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
@@ -20,18 +20,20 @@
 #include "common/cuda_hip/components/prefix_sum.hpp"
 #include "common/cuda_hip/components/sorting.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp"
+#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
+#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/factorization/par_ilut_select_common.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -45,10 +47,6 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
 template <int subwarp_size, typename ValueType, typename IndexType>
 void threshold_filter_approx(syn::value_list<int, subwarp_size>,
                              std::shared_ptr<const DefaultExecutor> exec,
@@ -175,6 +173,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_config.hpp b/common/cuda_hip/factorization/par_ilut_config.hpp
new file mode 100644
index 00000000000..0aaa6211bd6
--- /dev/null
+++ b/common/cuda_hip/factorization/par_ilut_config.hpp
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
+#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
+
+#include "common/cuda_hip/base/config.hpp"
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
similarity index 98%
rename from hip/factorization/par_ilut_filter_kernels.hip.cpp
rename to common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
index bbe0b197d7c..25432fb44d2 100644
--- a/hip/factorization/par_ilut_filter_kernels.hip.cpp
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp"
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -25,7 +27,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -34,17 +36,11 @@ namespace hip {
 namespace par_ilut_factorization {
 
 
-constexpr int default_block_size = 512;
-
-
 // subwarp sizes for filter kernels
 using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-
-
 namespace {
 
 
@@ -132,6 +128,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp
similarity index 85%
rename from common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_filter_kernels.hpp
index 68794bfc8d1..6312c1af5f5 100644
--- a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HPP_
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/par_ilut_config.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace par_ilut_factorization {
 namespace kernel {
 
 
@@ -162,3 +182,9 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
 
 
 }  // namespace kernel
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HPP_
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp
similarity index 96%
rename from hip/factorization/par_ilut_select_common.hip.cpp
rename to common/cuda_hip/factorization/par_ilut_select_common.cpp
index 89ceca0a024..fccb89fcf5a 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp
@@ -8,7 +8,7 @@
 // clang-format on
 
 
-#include "hip/factorization/par_ilut_select_common.hip.hpp"
+#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
 
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
@@ -17,13 +17,14 @@
 #include "common/cuda_hip/components/searching.hpp"
 #include "common/cuda_hip/components/sorting.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -32,9 +33,6 @@ namespace hip {
 namespace par_ilut_factorization {
 
 
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
                         const ValueType* values, IndexType size,
@@ -96,6 +94,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET);
 
 
 }  // namespace par_ilut_factorization
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/factorization/par_ilut_select_common.cuh b/common/cuda_hip/factorization/par_ilut_select_common.hpp
similarity index 79%
rename from cuda/factorization/par_ilut_select_common.cuh
rename to common/cuda_hip/factorization/par_ilut_select_common.hpp
index 4cb7dd55258..eca9e5cc4ac 100644
--- a/cuda/factorization/par_ilut_select_common.cuh
+++ b/common/cuda_hip/factorization/par_ilut_select_common.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
-#define GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
+#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_
+#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_
 
 
 #include <ginkgo/core/base/executor.hpp>
@@ -13,11 +13,10 @@
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 namespace par_ilut_factorization {
 
 
-constexpr int default_block_size = 512;
 constexpr int items_per_thread = 16;
 
 
@@ -43,9 +42,9 @@ sampleselect_bucket<IndexType> sampleselect_find_bucket(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_
diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
similarity index 97%
rename from cuda/factorization/par_ilut_select_kernels.cu
rename to common/cuda_hip/factorization/par_ilut_select_kernels.cpp
index a2395a16aea..e03ee379977 100644
--- a/cuda/factorization/par_ilut_select_kernels.cu
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
+
 #include <algorithm>
 
 #include <ginkgo/core/base/array.hpp>
@@ -16,14 +18,14 @@
 #include "common/cuda_hip/components/searching.hpp"
 #include "common/cuda_hip/components/sorting.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "cuda/factorization/par_ilut_select_common.cuh"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -32,9 +34,6 @@ namespace cuda {
 namespace par_ilut_factorization {
 
 
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
                          const ValueType* values, IndexType size,
@@ -153,6 +152,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
similarity index 91%
rename from common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_select_kernels.hpp
index 2ee5061d4c5..6f5940c2b14 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HPP_
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/par_ilut_config.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace par_ilut_factorization {
 namespace kernel {
 
 
@@ -278,3 +296,9 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket(
 
 
 }  // namespace kernel
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HPP_
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index ae506faed4b..ba02918928c 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -18,10 +18,6 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    factorization/par_ilut_approx_filter_kernels.cu
-    factorization/par_ilut_filter_kernels.cu
-    factorization/par_ilut_select_common.cu
-    factorization/par_ilut_select_kernels.cu
     matrix/batch_csr_kernels.cu
     matrix/batch_dense_kernels.cu
     matrix/batch_ell_kernels.cu
diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
deleted file mode 100644
index 93c0ef7fc95..00000000000
--- a/cuda/factorization/par_ilut_approx_filter_kernels.cu
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <algorithm>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/sorting.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/factorization/par_ilut_select_common.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-// subwarp sizes for filter kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void threshold_filter_approx(syn::value_list<int, subwarp_size>,
-                             std::shared_ptr<const DefaultExecutor> exec,
-                             const matrix::Csr<ValueType, IndexType>* m,
-                             IndexType rank, array<ValueType>* tmp,
-                             remove_complex<ValueType>* threshold,
-                             matrix::Csr<ValueType, IndexType>* m_out,
-                             matrix::Coo<ValueType, IndexType>* m_out_coo)
-{
-    auto values = m->get_const_values();
-    IndexType size = m->get_num_stored_elements();
-    using AbsType = remove_complex<ValueType>;
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto max_num_threads = ceildiv(size, items_per_thread);
-    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
-
-    size_type tmp_size_totals =
-        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_partials = ceildiv(
-        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_oracles =
-        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
-    size_type tmp_size_tree =
-        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
-    size_type tmp_size =
-        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
-    tmp->resize_and_reset(tmp_size);
-
-    auto total_counts = reinterpret_cast<IndexType*>(tmp->get_data());
-    auto partial_counts =
-        reinterpret_cast<IndexType*>(tmp->get_data() + tmp_size_totals);
-    auto oracles = reinterpret_cast<unsigned char*>(
-        tmp->get_data() + tmp_size_totals + tmp_size_partials);
-    auto tree =
-        reinterpret_cast<AbsType*>(tmp->get_data() + tmp_size_totals +
-                                   tmp_size_partials + tmp_size_oracles);
-
-    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
-                       total_counts);
-
-    // determine bucket with correct rank
-    auto bucket = static_cast<unsigned char>(
-        sampleselect_find_bucket(exec, total_counts, rank).idx);
-    *threshold =
-        exec->copy_val_to_host(tree + kernel::searchtree_inner_size + bucket);
-    // we implicitly set the first splitter to -inf, but 0 works as well
-    if (bucket == 0) {
-        *threshold = zero<AbsType>();
-    }
-
-    // filter the elements
-    auto old_row_ptrs = m->get_const_row_ptrs();
-    auto old_col_idxs = m->get_const_col_idxs();
-    auto old_vals = m->get_const_values();
-    // compute nnz for each row
-    auto num_rows = static_cast<IndexType>(m->get_size()[0]);
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, block_size);
-    auto new_row_ptrs = m_out->get_row_ptrs();
-    if (num_blocks > 0) {
-        kernel::bucket_filter_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
-
-    // build matrix
-    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
-    // resize arrays and update aliases
-    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
-    builder.get_col_idx_array().resize_and_reset(new_nnz);
-    builder.get_value_array().resize_and_reset(new_nnz);
-    auto new_col_idxs = m_out->get_col_idxs();
-    auto new_vals = m_out->get_values();
-    IndexType* new_row_idxs{};
-    if (m_out_coo) {
-        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
-        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
-        coo_builder.get_col_idx_array() =
-            make_array_view(exec, new_nnz, new_col_idxs);
-        coo_builder.get_value_array() =
-            make_array_view(exec, new_nnz, new_vals);
-        new_row_idxs = m_out_coo->get_row_idxs();
-    }
-    if (num_blocks > 0) {
-        kernel::bucket_filter<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, old_col_idxs, as_device_type(old_vals), oracles,
-                num_rows, bucket, new_row_ptrs, new_row_idxs, new_col_idxs,
-                as_device_type(new_vals));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx,
-                                    threshold_filter_approx);
-
-
-template <typename ValueType, typename IndexType>
-void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
-                             const matrix::Csr<ValueType, IndexType>* m,
-                             IndexType rank, array<ValueType>& tmp,
-                             remove_complex<ValueType>& threshold,
-                             matrix::Csr<ValueType, IndexType>* m_out,
-                             matrix::Coo<ValueType, IndexType>* m_out_coo)
-{
-    auto num_rows = m->get_size()[0];
-    auto total_nnz = m->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_threshold_filter_approx(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, m, rank, &tmp,
-        &threshold, m_out, m_out_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu
deleted file mode 100644
index 3d6b41f07e6..00000000000
--- a/cuda/factorization/par_ilut_filter_kernels.cu
+++ /dev/null
@@ -1,137 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for filter kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void threshold_filter(syn::value_list<int, subwarp_size>,
-                      std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto old_row_ptrs = a->get_const_row_ptrs();
-    auto old_col_idxs = a->get_const_col_idxs();
-    auto old_vals = a->get_const_values();
-    // compute nnz for each row
-    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, block_size);
-    auto new_row_ptrs = m_out->get_row_ptrs();
-    if (num_blocks > 0) {
-        kernel::threshold_filter_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, lower);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
-
-    // build matrix
-    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
-    // resize arrays and update aliases
-    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
-    builder.get_col_idx_array().resize_and_reset(new_nnz);
-    builder.get_value_array().resize_and_reset(new_nnz);
-    auto new_col_idxs = m_out->get_col_idxs();
-    auto new_vals = m_out->get_values();
-    IndexType* new_row_idxs{};
-    if (m_out_coo) {
-        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
-        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
-        coo_builder.get_col_idx_array() =
-            make_array_view(exec, new_nnz, new_col_idxs);
-        coo_builder.get_value_array() =
-            make_array_view(exec, new_nnz, new_vals);
-        new_row_idxs = m_out_coo->get_row_idxs();
-    }
-    if (num_blocks > 0) {
-        kernel::threshold_filter<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, new_row_idxs,
-                new_col_idxs, as_device_type(new_vals), lower);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
-
-
-}  // namespace
-
-template <typename ValueType, typename IndexType>
-void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_threshold_filter(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
-        m_out_coo, lower);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
deleted file mode 100644
index e0b81a81a1c..00000000000
--- a/cuda/factorization/par_ilut_select_common.cu
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "cuda/factorization/par_ilut_select_common.cuh"
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/sorting.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
-                        const ValueType* values, IndexType size,
-                        remove_complex<ValueType>* tree, unsigned char* oracles,
-                        IndexType* partial_counts, IndexType* total_counts)
-{
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks =
-        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    // pick sample, build searchtree
-    kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>(
-        as_device_type(values), size, as_device_type(tree));
-    // determine bucket sizes
-    if (num_blocks > 0) {
-        kernel::count_buckets<<<num_blocks, default_block_size, 0,
-                                exec->get_stream()>>>(
-            as_device_type(values), size, as_device_type(tree), partial_counts,
-            oracles, items_per_thread);
-    }
-    // compute prefix sum and total sum over block-local values
-    kernel::block_prefix_sum<<<bucket_count, default_block_size, 0,
-                               exec->get_stream()>>>(partial_counts,
-                                                     total_counts, num_blocks);
-    // compute prefix sum over bucket counts
-    components::prefix_sum_nonnegative(exec, total_counts, bucket_count + 1);
-}
-
-
-#define DECLARE_SSSS_COUNT(ValueType, IndexType)                               \
-    void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,       \
-                            const ValueType* values, IndexType size,           \
-                            remove_complex<ValueType>* tree,                   \
-                            unsigned char* oracles, IndexType* partial_counts, \
-                            IndexType* total_counts)
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
-
-
-template <typename IndexType>
-sampleselect_bucket<IndexType> sampleselect_find_bucket(
-    std::shared_ptr<const DefaultExecutor> exec, IndexType* prefix_sum,
-    IndexType rank)
-{
-    kernel::find_bucket<<<1, config::warp_size, 0, exec->get_stream()>>>(
-        prefix_sum, rank);
-    IndexType values[3]{};
-    exec->get_master()->copy_from(exec, 3, prefix_sum, values);
-    return {values[0], values[1], values[2]};
-}
-
-
-#define DECLARE_SSSS_FIND_BUCKET(IndexType)                                 \
-    sampleselect_bucket<IndexType> sampleselect_find_bucket(                \
-        std::shared_ptr<const DefaultExecutor> exec, IndexType* prefix_sum, \
-        IndexType rank)
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 0bfe56d7db1..4dd54c53782 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -16,10 +16,6 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    factorization/par_ilut_approx_filter_kernels.hip.cpp
-    factorization/par_ilut_filter_kernels.hip.cpp
-    factorization/par_ilut_select_common.hip.cpp
-    factorization/par_ilut_select_kernels.hip.cpp
     matrix/batch_csr_kernels.hip.cpp
     matrix/batch_dense_kernels.hip.cpp
     matrix/batch_ell_kernels.hip.cpp
diff --git a/hip/factorization/par_ilut_select_common.hip.hpp b/hip/factorization/par_ilut_select_common.hip.hpp
deleted file mode 100644
index 290de30f5df..00000000000
--- a/hip/factorization/par_ilut_select_common.hip.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
-#define GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
-
-
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-constexpr int items_per_thread = 16;
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
-                        const ValueType* values, IndexType size,
-                        remove_complex<ValueType>* tree, unsigned char* oracles,
-                        IndexType* partial_counts, IndexType* total_counts);
-
-
-template <typename IndexType>
-struct sampleselect_bucket {
-    IndexType idx;
-    IndexType begin;
-    IndexType size;
-};
-
-
-template <typename IndexType>
-sampleselect_bucket<IndexType> sampleselect_find_bucket(
-    std::shared_ptr<const DefaultExecutor> exec, IndexType* prefix_sum,
-    IndexType rank);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_
diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
deleted file mode 100644
index 2e75f7de81b..00000000000
--- a/hip/factorization/par_ilut_select_kernels.hip.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <algorithm>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/sorting.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/factorization/par_ilut_kernels.hpp"
-#include "hip/factorization/par_ilut_select_common.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
-                         const ValueType* values, IndexType size,
-                         const unsigned char* oracles,
-                         const IndexType* partial_counts, IndexType bucket,
-                         remove_complex<ValueType>* out)
-{
-    auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks =
-        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    if (num_blocks > 0) {
-        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
-                                exec->get_stream()>>>(
-            as_device_type(values), size, bucket, oracles, partial_counts,
-            as_device_type(out), items_per_thread);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* m,
-                      IndexType rank, array<ValueType>& tmp1,
-                      array<remove_complex<ValueType>>& tmp2,
-                      remove_complex<ValueType>& threshold)
-{
-    auto values = m->get_const_values();
-    IndexType size = m->get_num_stored_elements();
-    using AbsType = remove_complex<ValueType>;
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto max_num_threads = ceildiv(size, items_per_thread);
-    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
-
-    size_type tmp_size_totals =
-        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_partials = ceildiv(
-        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_oracles =
-        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
-    size_type tmp_size_tree =
-        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
-    size_type tmp_size_vals =
-        size / bucket_count * 4;  // pessimistic estimate for temporary storage
-    size_type tmp_size =
-        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
-    tmp1.resize_and_reset(tmp_size);
-    tmp2.resize_and_reset(tmp_size_vals);
-
-    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
-    auto partial_counts =
-        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
-    auto oracles = reinterpret_cast<unsigned char*>(
-        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
-    auto tree =
-        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
-                                   tmp_size_partials + tmp_size_oracles);
-
-    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
-                       total_counts);
-
-    // determine bucket with correct rank, use bucket-local rank
-    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
-    rank -= bucket.begin;
-
-    if (bucket.size * 2 > tmp_size_vals) {
-        // we need to reallocate tmp2
-        tmp2.resize_and_reset(bucket.size * 2);
-    }
-    auto tmp21 = tmp2.get_data();
-    auto tmp22 = tmp2.get_data() + bucket.size;
-    // extract target bucket
-    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
-                        tmp22);
-
-    // recursively select from smaller buckets
-    int step{};
-    while (bucket.size > kernel::basecase_size) {
-        std::swap(tmp21, tmp22);
-        const auto* tmp_in = tmp21;
-        auto tmp_out = tmp22;
-
-        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
-                           partial_counts, total_counts);
-        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
-        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
-                            bucket.idx, tmp_out);
-
-        rank -= new_bucket.begin;
-        bucket.size = new_bucket.size;
-        // we should never need more than 5 recursion steps, this would mean
-        // 256^5 = 2^40. fall back to standard library algorithm in that case.
-        ++step;
-        if (step > 5) {
-            array<AbsType> cpu_out_array{
-                exec->get_master(),
-                make_array_view(exec, bucket.size, tmp_out)};
-            auto begin = cpu_out_array.get_data();
-            auto end = begin + bucket.size;
-            auto middle = begin + rank;
-            std::nth_element(begin, middle, end);
-            threshold = *middle;
-            return;
-        }
-    }
-
-    // base case
-    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
-    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
-                              exec->get_stream()>>>(
-        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
-    threshold = exec->copy_val_to_host(out_ptr);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From afef0b8bca517ea4e1d0de97110c3403ba3b7ace Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 13 Jul 2024 00:19:21 +0200
Subject: [PATCH 088/448] unify index_set kernels

---
 common/cuda_hip/CMakeLists.txt                |  1 +
 .../cuda_hip}/base/index_set_kernels.cpp      | 14 +---
 cuda/CMakeLists.txt                           |  1 -
 hip/CMakeLists.txt                            |  1 -
 hip/base/index_set_kernels.hip.cpp            | 83 -------------------
 5 files changed, 3 insertions(+), 97 deletions(-)
 rename {cuda => common/cuda_hip}/base/index_set_kernels.cpp (93%)
 delete mode 100644 hip/base/index_set_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index a333ea9569c..88353204488 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,6 +1,7 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
     base/device_matrix_data_kernels.cpp
+    base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
diff --git a/cuda/base/index_set_kernels.cpp b/common/cuda_hip/base/index_set_kernels.cpp
similarity index 93%
rename from cuda/base/index_set_kernels.cpp
rename to common/cuda_hip/base/index_set_kernels.cpp
index 2041833e4c2..0a47752d17e 100644
--- a/cuda/base/index_set_kernels.cpp
+++ b/common/cuda_hip/base/index_set_kernels.cpp
@@ -13,17 +13,7 @@
 
 namespace gko {
 namespace kernels {
-/**
- * @brief The Cuda namespace.
- *
- * @ingroup cuda
- */
-namespace cuda {
-/**
- * @brief The index_set namespace.
- *
- * @ingroup index_set
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace idx_set {
 
 
@@ -78,6 +68,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 
 
 }  // namespace idx_set
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index ba02918928c..c9bb448b79b 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -11,7 +11,6 @@ target_sources(ginkgo_cuda
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
-    base/index_set_kernels.cpp
     base/memory.cpp
     base/nvtx.cpp
     base/scoped_device_id.cpp
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 4dd54c53782..e6a337da7b9 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -9,7 +9,6 @@ set(GINKGO_HIP_SOURCES
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
-    base/index_set_kernels.hip.cpp
     base/memory.hip.cpp
     base/roctx.hip.cpp
     base/scoped_device_id.hip.cpp
diff --git a/hip/base/index_set_kernels.hip.cpp b/hip/base/index_set_kernels.hip.cpp
deleted file mode 100644
index 9f9f967fe35..00000000000
--- a/hip/base/index_set_kernels.hip.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/index_set_kernels.hpp"
-
-#include <memory>
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-/**
- * @brief The Hip namespace.
- *
- * @ingroup hip
- */
-namespace hip {
-/**
- * @brief The index_set namespace.
- *
- * @ingroup index_set
- */
-namespace idx_set {
-
-
-template <typename IndexType>
-void to_global_indices(std::shared_ptr<const DefaultExecutor> exec,
-                       const IndexType num_subsets,
-                       const IndexType* subset_begin,
-                       const IndexType* subset_end,
-                       const IndexType* superset_indices,
-                       IndexType* decomp_indices) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
-    GKO_DECLARE_INDEX_SET_TO_GLOBAL_INDICES_KERNEL);
-
-
-template <typename IndexType>
-void populate_subsets(std::shared_ptr<const DefaultExecutor> exec,
-                      const IndexType index_space_size,
-                      const array<IndexType>* indices,
-                      array<IndexType>* subset_begin,
-                      array<IndexType>* subset_end,
-                      array<IndexType>* superset_indices,
-                      const bool is_sorted) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INDEX_SET_POPULATE_KERNEL);
-
-
-template <typename IndexType>
-void global_to_local(std::shared_ptr<const DefaultExecutor> exec,
-                     const IndexType index_space_size,
-                     const IndexType num_subsets, const IndexType* subset_begin,
-                     const IndexType* subset_end,
-                     const IndexType* superset_indices,
-                     const IndexType num_indices,
-                     const IndexType* global_indices, IndexType* local_indices,
-                     const bool is_sorted) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
-    GKO_DECLARE_INDEX_SET_GLOBAL_TO_LOCAL_KERNEL);
-
-
-template <typename IndexType>
-void local_to_global(std::shared_ptr<const DefaultExecutor> exec,
-                     const IndexType num_subsets, const IndexType* subset_begin,
-                     const IndexType* superset_indices,
-                     const IndexType num_indices,
-                     const IndexType* local_indices, IndexType* global_indices,
-                     const bool is_sorted) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
-    GKO_DECLARE_INDEX_SET_LOCAL_TO_GLOBAL_KERNEL);
-
-
-}  // namespace idx_set
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From bb999199459d87a6862311e08164a8f0ac50bee6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 13 Jul 2024 08:50:42 +0200
Subject: [PATCH 089/448] fix ILU

---
 common/cuda_hip/factorization/ilu_kernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp
index b3f959bba02..0469b80fe86 100644
--- a/common/cuda_hip/factorization/ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -46,7 +46,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                     SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
-#if defined(GKO_BUILDING_CUDA) && (CUDA_VERSION >= 11040)
+#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 

From 43a445ab0d324187280f5c3303ad537d76fa0c10 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 13 Jul 2024 11:26:22 +0200
Subject: [PATCH 090/448] unify Jacobi

---
 common/cuda_hip/CMakeLists.txt                |  25 ++++
 .../jacobi_advanced_apply_kernels.cpp         |   6 +-
 .../jacobi_advanced_apply_kernels.hpp.inc     |  81 -------------
 ...obi_advanced_apply_kernels.instantiate.cpp |  85 ++++++++++++--
 .../preconditioner/jacobi_common.hpp.in       |  13 +--
 .../jacobi_generate_kernels.cpp               |  14 +--
 ...> jacobi_generate_kernels.instantiate.cpp} |  96 ++++++++++++++++
 .../preconditioner/jacobi_kernels.cpp         |   5 -
 .../jacobi_simple_apply_kernels.cpp           |  11 +-
 .../jacobi_simple_apply_kernels.hpp.inc       |  76 ------------
 ...acobi_simple_apply_kernels.instantiate.cpp |  80 +++++++++++--
 cuda/CMakeLists.txt                           |  32 ++----
 .../jacobi_advanced_apply_kernels.cu          |  75 ------------
 ...cobi_advanced_apply_kernels.instantiate.cu | 100 ----------------
 .../preconditioner/jacobi_generate_kernels.cu |  72 ------------
 .../jacobi_generate_kernels.instantiate.cu    | 108 ------------------
 hip/CMakeLists.txt                            |  39 ++-----
 hip/preconditioner/jacobi_common.hip.hpp.in   |  43 -------
 ...acobi_generate_kernels.instantiate.hip.cpp | 108 ------------------
 .../jacobi_simple_apply_kernels.hip.cpp       |  84 --------------
 ...i_simple_apply_kernels.instantiate.hip.cpp |  97 ----------------
 21 files changed, 301 insertions(+), 949 deletions(-)
 rename hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp => common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp (94%)
 delete mode 100644 common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
 rename hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp => common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp (52%)
 rename {cuda => common/cuda_hip}/preconditioner/jacobi_common.hpp.in (70%)
 rename hip/preconditioner/jacobi_generate_kernels.hip.cpp => common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp (92%)
 rename common/cuda_hip/preconditioner/{jacobi_generate_kernels.hpp.inc => jacobi_generate_kernels.instantiate.cpp} (66%)
 rename cuda/preconditioner/jacobi_simple_apply_kernels.cu => common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp (92%)
 delete mode 100644 common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
 rename cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu => common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp (52%)
 delete mode 100644 cuda/preconditioner/jacobi_advanced_apply_kernels.cu
 delete mode 100644 cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
 delete mode 100644 cuda/preconditioner/jacobi_generate_kernels.cu
 delete mode 100644 cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
 delete mode 100644 hip/preconditioner/jacobi_common.hip.hpp.in
 delete mode 100644 hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
 delete mode 100644 hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
 delete mode 100644 hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 88353204488..463abfd9284 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -31,6 +31,9 @@ set(CUDA_HIP_SOURCES
     multigrid/pgm_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
+    preconditioner/jacobi_advanced_apply_kernels.cpp
+    preconditioner/jacobi_generate_kernels.cpp
+    preconditioner/jacobi_simple_apply_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
@@ -38,5 +41,27 @@ set(CUDA_HIP_SOURCES
     stop/criterion_kernels.cpp
     stop/residual_norm_kernels.cpp
     )
+# create files for all potentially used block sizes
+foreach(GKO_JACOBI_BLOCK_SIZE RANGE 1 64)
+    configure_file(
+        preconditioner/jacobi_generate_kernels.instantiate.cpp
+        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp)
+    configure_file(
+        preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
+        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp)
+    configure_file(
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp)
+endforeach()
+function(jacobi_generated_files variable_name block_sizes)  # out-variable name, list of block sizes
+    set(${variable_name})  # clear any inherited value of the output variable
+    foreach(block_size IN LISTS block_sizes)
+        list(APPEND ${variable_name}
+            ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.${block_size}.cpp
+            ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.${block_size}.cpp
+            ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.${block_size}.cpp)
+    endforeach()
+    set(${variable_name} ${${variable_name}} PARENT_SCOPE)  # export the collected paths to the caller
+endfunction()
 list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
 set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
similarity index 94%
rename from hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
index 371a10051fc..27b4f57eb6c 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
@@ -13,7 +13,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Jacobi preconditioner namespace.
  * @ref Jacobi
@@ -38,7 +38,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply);
 
 
 template <typename ValueType, typename IndexType>
-void apply(std::shared_ptr<const HipExecutor> exec, size_type num_blocks,
+void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
            uint32 max_block_size,
            const preconditioner::block_interleaved_storage_scheme<IndexType>&
                storage_scheme,
@@ -70,6 +70,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 }  // namespace jacobi
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
deleted file mode 100644
index 5d7a6966c78..00000000000
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block* config::warp_size)
-    advanced_apply(const ValueType* __restrict__ blocks,
-                   preconditioner::block_interleaved_storage_scheme<IndexType>
-                       storage_scheme,
-                   const IndexType* __restrict__ block_ptrs,
-                   size_type num_blocks, const ValueType* __restrict__ alpha,
-                   const ValueType* __restrict__ b, int32 b_stride,
-                   ValueType* __restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = alpha[0] *
-            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    multiply_vec<max_block_size>(
-        subwarp, block_size, v,
-        blocks + storage_scheme.get_global_block_offset(block_id) +
-            subwarp.thread_rank(),
-        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-        x_stride,
-        [](ValueType& result, const ValueType& out) { result += out; });
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void
-__launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply(
-    const ValueType* __restrict__ blocks,
-    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
-    const precision_reduction* __restrict__ block_precisions,
-    const IndexType* __restrict__ block_ptrs, size_type num_blocks,
-    const ValueType* __restrict__ alpha, const ValueType* __restrict__ b,
-    int32 b_stride, ValueType* __restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    auto alpha_val = alpha == nullptr ? one<ValueType>() : alpha[0];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = alpha[0] *
-            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-        ValueType, block_precisions[block_id],
-        multiply_vec<max_block_size>(
-            subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision*>(
-                blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id) +
-                subwarp.thread_rank(),
-            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-            x_stride,
-            [](ValueType& result, const ValueType& out) { result += out; }));
-}
-
-
-}  // namespace kernel
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
similarity index 52%
rename from hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
index 42c542c228b..0ecc3d0d44b 100644
--- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
@@ -22,16 +22,85 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace jacobi {
+namespace kernel {
+
+
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block* config::warp_size)
+    advanced_apply(const ValueType* __restrict__ blocks,
+                   preconditioner::block_interleaved_storage_scheme<IndexType>
+                       storage_scheme,
+                   const IndexType* __restrict__ block_ptrs,
+                   size_type num_blocks, const ValueType* __restrict__ alpha,
+                   const ValueType* __restrict__ b, int32 b_stride,
+                   ValueType* __restrict__ x, int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    ValueType v = zero<ValueType>();
+    if (subwarp.thread_rank() < block_size) {
+        v = alpha[0] *
+            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    multiply_vec<max_block_size>(
+        subwarp, block_size, v,
+        blocks + storage_scheme.get_global_block_offset(block_id) +
+            subwarp.thread_rank(),
+        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+        x_stride,
+        [](ValueType& result, const ValueType& out) { result += out; });
+}
+
+
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void
+__launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply(
+    const ValueType* __restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const precision_reduction* __restrict__ block_precisions,
+    const IndexType* __restrict__ block_ptrs, size_type num_blocks,
+    const ValueType* __restrict__ alpha, const ValueType* __restrict__ b,
+    int32 b_stride, ValueType* __restrict__ x, int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    auto alpha_val = alpha == nullptr ? one<ValueType>() : alpha[0];
+    ValueType v = zero<ValueType>();
+    if (subwarp.thread_rank() < block_size) {
+        v = alpha_val *  // null-guarded factor: one() when alpha is null
+            b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[block_id],
+        multiply_vec<max_block_size>(
+            subwarp, block_size, v,
+            reinterpret_cast<const resolved_precision*>(
+                blocks + storage_scheme.get_group_offset(block_id)) +
+                storage_scheme.get_block_offset(block_id) +
+                subwarp.thread_rank(),
+            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+            x_stride,
+            [](ValueType& result, const ValueType& out) { result += out; }));
+}
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
+}  // namespace kernel
 
 
 // clang-format off
@@ -96,6 +165,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace jacobi
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/common/cuda_hip/preconditioner/jacobi_common.hpp.in
similarity index 70%
rename from cuda/preconditioner/jacobi_common.hpp.in
rename to common/cuda_hip/preconditioner/jacobi_common.hpp.in
index aeb47fec97e..b243c7de6a5 100644
--- a/cuda/preconditioner/jacobi_common.hpp.in
+++ b/common/cuda_hip/preconditioner/jacobi_common.hpp.in
@@ -5,12 +5,11 @@
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/synthesizer/containers.hpp>
 
-
 #include "common/cuda_hip/base/config.hpp"
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 namespace jacobi {
 
 
@@ -19,15 +18,15 @@ namespace jacobi {
  * kernels should be compiled.
  */
 // clang-format off
-#cmakedefine GKO_CUDA_JACOBI_BLOCK_SIZES_CODE @GKO_CUDA_JACOBI_BLOCK_SIZES_CODE@
+#cmakedefine GKO_JACOBI_BLOCK_SIZES_CODE @GKO_JACOBI_BLOCK_SIZES_CODE@
 // clang-format on
 // make things easier for IDEs
-#ifndef GKO_CUDA_JACOBI_BLOCK_SIZES_CODE
-#define GKO_CUDA_JACOBI_BLOCK_SIZES_CODE 1
+#ifndef GKO_JACOBI_BLOCK_SIZES_CODE
+#define GKO_JACOBI_BLOCK_SIZES_CODE 1
 #endif
 
 
-using compiled_kernels = syn::value_list<int, GKO_CUDA_JACOBI_BLOCK_SIZES_CODE>;
+using compiled_kernels = syn::value_list<int, GKO_JACOBI_BLOCK_SIZES_CODE>;
 
 
 constexpr int get_larger_power(int value, int guess = 1)
@@ -37,6 +36,6 @@ constexpr int get_larger_power(int value, int guess = 1)
 
 
 }  // namespace jacobi
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
similarity index 92%
rename from hip/preconditioner/jacobi_generate_kernels.hip.cpp
rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
index d295ebb046e..207550ff6b1 100644
--- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
@@ -25,18 +25,10 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
-
-
 template <int warps_per_block, int max_block_size, typename ValueType,
           typename IndexType>
 void generate(syn::value_list<int, max_block_size>,
@@ -53,7 +45,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate);
 
 
 template <typename ValueType, typename IndexType>
-void generate(std::shared_ptr<const HipExecutor> exec,
+void generate(std::shared_ptr<const DefaultExecutor> exec,
               const matrix::Csr<ValueType, IndexType>* system_matrix,
               size_type num_blocks, uint32 max_block_size,
               remove_complex<ValueType> accuracy,
@@ -81,6 +73,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace jacobi
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
similarity index 66%
rename from common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
index 61a57ca5f81..d004309c622 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
@@ -2,6 +2,30 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "core/base/extended_float.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/preconditioner/jacobi_kernels.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+// generated header
+#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace jacobi {
 namespace kernel {
 
 
@@ -180,3 +204,75 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
 
 
 }  // namespace kernel
+
+
+// clang-format off
+#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@
+// clang-format on
+// fallback so IDEs can parse this template before CMake configures it
+#ifndef GKO_JACOBI_BLOCK_SIZE
+#define GKO_JACOBI_BLOCK_SIZE 1
+#endif
+
+
+template <int warps_per_block, int max_block_size, typename ValueType,
+          typename IndexType>
+void generate(syn::value_list<int, max_block_size>,
+              std::shared_ptr<const DefaultExecutor> exec,
+              const matrix::Csr<ValueType, IndexType>* mtx,
+              remove_complex<ValueType> accuracy, ValueType* block_data,
+              const preconditioner::block_interleaved_storage_scheme<IndexType>&
+                  storage_scheme,
+              remove_complex<ValueType>* conditioning,
+              precision_reduction* block_precisions,
+              const IndexType* block_ptrs, size_type num_blocks)
+{
+    constexpr int subwarp_size = get_larger_power(max_block_size);
+    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
+    const auto grid_size =
+        ceildiv(num_blocks, warps_per_block * blocks_per_warp);
+    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
+    // launch only if there is work; adaptive kernel iff precisions are given
+    if (grid_size > 0) {
+        if (block_precisions) {
+            kernel::adaptive_generate<max_block_size, subwarp_size,
+                                      warps_per_block>
+                <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
+                    mtx->get_const_col_idxs(),
+                    as_device_type(mtx->get_const_values()),
+                    as_device_type(accuracy), as_device_type(block_data),
+                    storage_scheme, as_device_type(conditioning),
+                    block_precisions, block_ptrs, num_blocks);
+        } else {
+            kernel::generate<max_block_size, subwarp_size, warps_per_block>
+                <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
+                    mtx->get_const_col_idxs(),
+                    as_device_type(mtx->get_const_values()),
+                    as_device_type(block_data), storage_scheme, block_ptrs,
+                    num_blocks);
+        }
+    }
+}
+
+
+#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType)          \
+    void generate<config::min_warps_per_block, GKO_JACOBI_BLOCK_SIZE,        \
+                  ValueType, IndexType>(                                     \
+        syn::value_list<int, GKO_JACOBI_BLOCK_SIZE>,                         \
+        std::shared_ptr<const DefaultExecutor> exec,                         \
+        const matrix::Csr<ValueType, IndexType>*, remove_complex<ValueType>, \
+        ValueType*,                                                          \
+        const preconditioner::block_interleaved_storage_scheme<IndexType>&,  \
+        remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
+        size_type)
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    DECLARE_JACOBI_GENERATE_INSTANTIATION);
+
+
+}  // namespace jacobi
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index 8cf5ad1e9fd..f3b099e7c18 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -22,11 +22,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
 namespace jacobi {
 
 
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
similarity index 92%
rename from cuda/preconditioner/jacobi_simple_apply_kernels.cu
rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
index 62e49d30618..e9b7b10fd88 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
@@ -13,12 +13,7 @@
 
 namespace gko {
 namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace jacobi {
 
 
@@ -38,7 +33,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply);
 
 template <typename ValueType, typename IndexType>
 void simple_apply(
-    std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     uint32 max_block_size,
     const preconditioner::block_interleaved_storage_scheme<IndexType>&
         storage_scheme,
@@ -67,6 +62,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace jacobi
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
deleted file mode 100644
index c39016810fa..00000000000
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block* config::warp_size) apply(
-    const ValueType* __restrict__ blocks,
-    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
-    const IndexType* __restrict__ block_ptrs, size_type num_blocks,
-    const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x,
-    int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    multiply_vec<max_block_size>(
-        subwarp, block_size, v,
-        blocks + storage_scheme.get_global_block_offset(block_id) +
-            subwarp.thread_rank(),
-        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-        x_stride,
-        [](ValueType& result, const ValueType& out) { result = out; });
-}
-
-
-template <int max_block_size, int subwarp_size, int warps_per_block,
-          typename ValueType, typename IndexType>
-__global__ void __launch_bounds__(warps_per_block* config::warp_size)
-    adaptive_apply(const ValueType* __restrict__ blocks,
-                   preconditioner::block_interleaved_storage_scheme<IndexType>
-                       storage_scheme,
-                   const precision_reduction* __restrict__ block_precisions,
-                   const IndexType* __restrict__ block_ptrs,
-                   size_type num_blocks, const ValueType* __restrict__ b,
-                   int32 b_stride, ValueType* __restrict__ x, int32 x_stride)
-{
-    const auto block_id =
-        thread::get_subwarp_id<subwarp_size, warps_per_block>();
-    const auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    if (block_id >= num_blocks) {
-        return;
-    }
-    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
-    ValueType v = zero<ValueType>();
-    if (subwarp.thread_rank() < block_size) {
-        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
-    }
-    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-        ValueType, block_precisions[block_id],
-        multiply_vec<max_block_size>(
-            subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision*>(
-                blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id) +
-                subwarp.thread_rank(),
-            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
-            x_stride,
-            [](ValueType& result, const ValueType& out) { result = out; }));
-}
-
-
-}  // namespace kernel
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
similarity index 52%
rename from cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
index d51b63487fe..734385970e3 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
@@ -21,16 +21,80 @@
 
 namespace gko {
 namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace jacobi {
+namespace kernel {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block* config::warp_size) apply(
+    const ValueType* __restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
+    const IndexType* __restrict__ block_ptrs, size_type num_blocks,
+    const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x,
+    int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    ValueType v = zero<ValueType>();
+    if (subwarp.thread_rank() < block_size) {
+        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    multiply_vec<max_block_size>(
+        subwarp, block_size, v,
+        blocks + storage_scheme.get_global_block_offset(block_id) +
+            subwarp.thread_rank(),
+        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+        x_stride,
+        [](ValueType& result, const ValueType& out) { result = out; });
+}
+
+
+template <int max_block_size, int subwarp_size, int warps_per_block,
+          typename ValueType, typename IndexType>
+__global__ void __launch_bounds__(warps_per_block* config::warp_size)
+    adaptive_apply(const ValueType* __restrict__ blocks,
+                   preconditioner::block_interleaved_storage_scheme<IndexType>
+                       storage_scheme,
+                   const precision_reduction* __restrict__ block_precisions,
+                   const IndexType* __restrict__ block_ptrs,
+                   size_type num_blocks, const ValueType* __restrict__ b,
+                   int32 b_stride, ValueType* __restrict__ x, int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id<subwarp_size, warps_per_block>();
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    ValueType v = zero<ValueType>();
+    if (subwarp.thread_rank() < block_size) {
+        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[block_id],
+        multiply_vec<max_block_size>(
+            subwarp, block_size, v,
+            reinterpret_cast<const resolved_precision*>(
+                blocks + storage_scheme.get_group_offset(block_id)) +
+                storage_scheme.get_block_offset(block_id) +
+                subwarp.thread_rank(),
+            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+            x_stride,
+            [](ValueType& result, const ValueType& out) { result = out; }));
+}
+
+
+}  // namespace kernel
 
 
 // clang-format off
@@ -92,6 +156,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace jacobi
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index c9bb448b79b..92b48518e7c 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -24,9 +24,6 @@ target_sources(ginkgo_cuda
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
-    preconditioner/jacobi_advanced_apply_kernels.cu
-    preconditioner/jacobi_generate_kernels.cu
-    preconditioner/jacobi_simple_apply_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
     solver/lower_trs_kernels.cu
@@ -34,10 +31,6 @@ target_sources(ginkgo_cuda
     ${GKO_UNIFIED_COMMON_SOURCES}
     ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
-# override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES FBCSR_INSTANTIATE)
-    set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
-endforeach(source_file)
 if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
     set(GKO_CUDA_JACOBI_BLOCK_SIZES)
     foreach(blocksize RANGE 1 32)
@@ -46,25 +39,14 @@ if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
 else()
     set(GKO_CUDA_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32)
 endif()
-set(GKO_CUDA_JACOBI_SOURCES)
-foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES)
-    configure_file(
-        preconditioner/jacobi_generate_kernels.instantiate.cu
-        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
-    configure_file(
-        preconditioner/jacobi_simple_apply_kernels.instantiate.cu
-        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
-    configure_file(
-        preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
-        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
-    list(APPEND GKO_CUDA_JACOBI_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
-endforeach()
+jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
+# override the default language mapping for the common, generated Jacobi and FBCSR sources: compile them as CUDA
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES FBCSR_INSTANTIATE)
+    set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
+endforeach(source_file)
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
-string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
-configure_file(preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
+string(REPLACE ";" "," GKO_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
+configure_file(${Ginkgo_SOURCE_DIR}/common/cuda_hip/preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
     # remove false positive CUDA warnings when calling one<T>() and zero<T>()
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
deleted file mode 100644
index a37296abf40..00000000000
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "core/matrix/dense_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void advanced_apply(
-    syn::value_list<int, max_block_size>,
-    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
-    const precision_reduction* block_precisions,
-    const IndexType* block_pointers, const ValueType* blocks,
-    const preconditioner::block_interleaved_storage_scheme<IndexType>&
-        storage_scheme,
-    const ValueType* alpha, const ValueType* b, size_type b_stride,
-    ValueType* x, size_type x_stride);
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply);
-
-
-template <typename ValueType, typename IndexType>
-void apply(std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
-           uint32 max_block_size,
-           const preconditioner::block_interleaved_storage_scheme<IndexType>&
-               storage_scheme,
-           const array<precision_reduction>& block_precisions,
-           const array<IndexType>& block_pointers,
-           const array<ValueType>& blocks,
-           const matrix::Dense<ValueType>* alpha,
-           const matrix::Dense<ValueType>* b,
-           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* x)
-{
-    // TODO: write a special kernel for multiple RHS
-    dense::scale(exec, beta, x);
-    for (size_type col = 0; col < b->get_size()[1]; ++col) {
-        select_advanced_apply(
-            compiled_kernels(),
-            [&](int compiled_block_size) {
-                return max_block_size <= compiled_block_size;
-            },
-            syn::value_list<int, config::min_warps_per_block>(),
-            syn::type_list<>(), exec, num_blocks,
-            block_precisions.get_const_data(), block_pointers.get_const_data(),
-            blocks.get_const_data(), storage_scheme, alpha->get_const_values(),
-            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
-            x->get_stride());
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
deleted file mode 100644
index fcf238d038f..00000000000
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
-
-
-// clang-format off
-#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@
-// clang-format on
-// make things easier for IDEs
-#ifndef GKO_JACOBI_BLOCK_SIZE
-#define GKO_JACOBI_BLOCK_SIZE 1
-#endif
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void advanced_apply(
-    syn::value_list<int, max_block_size>,
-    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
-    const precision_reduction* block_precisions,
-    const IndexType* block_pointers, const ValueType* blocks,
-    const preconditioner::block_interleaved_storage_scheme<IndexType>&
-        storage_scheme,
-    const ValueType* alpha, const ValueType* b, size_type b_stride,
-    ValueType* x, size_type x_stride)
-{
-    constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
-    const auto grid_size =
-        ceildiv(num_blocks, warps_per_block * blocks_per_warp);
-    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
-
-    if (grid_size > 0) {
-        if (block_precisions) {
-            kernel::advanced_adaptive_apply<max_block_size, subwarp_size,
-                                            warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    as_device_type(blocks), storage_scheme, block_precisions,
-                    block_pointers, num_blocks, as_device_type(alpha),
-                    as_device_type(b), b_stride, as_device_type(x), x_stride);
-        } else {
-            kernel::advanced_apply<max_block_size, subwarp_size,
-                                   warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    as_device_type(blocks), storage_scheme, block_pointers,
-                    num_blocks, as_device_type(alpha), as_device_type(b),
-                    b_stride, as_device_type(x), x_stride);
-        }
-    }
-}
-
-
-#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType)   \
-    void advanced_apply<config::min_warps_per_block, GKO_JACOBI_BLOCK_SIZE, \
-                        ValueType, IndexType>(                              \
-        syn::value_list<int, GKO_JACOBI_BLOCK_SIZE>,                        \
-        std::shared_ptr<const DefaultExecutor> exec, size_type,             \
-        const precision_reduction*, const IndexType* block_pointers,        \
-        const ValueType*,                                                   \
-        const preconditioner::block_interleaved_storage_scheme<IndexType>&, \
-        const ValueType*, const ValueType*, size_type, ValueType*, size_type)
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION);
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
deleted file mode 100644
index d51f1947b7a..00000000000
--- a/cuda/preconditioner/jacobi_generate_kernels.cu
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "core/components/fill_array_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void generate(syn::value_list<int, max_block_size>,
-              std::shared_ptr<const DefaultExecutor> exec,
-              const matrix::Csr<ValueType, IndexType>* mtx,
-              remove_complex<ValueType> accuracy, ValueType* block_data,
-              const preconditioner::block_interleaved_storage_scheme<IndexType>&
-                  storage_scheme,
-              remove_complex<ValueType>* conditioning,
-              precision_reduction* block_precisions,
-              const IndexType* block_ptrs, size_type num_blocks);
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate);
-
-
-template <typename ValueType, typename IndexType>
-void generate(std::shared_ptr<const CudaExecutor> exec,
-              const matrix::Csr<ValueType, IndexType>* system_matrix,
-              size_type num_blocks, uint32 max_block_size,
-              remove_complex<ValueType> accuracy,
-              const preconditioner::block_interleaved_storage_scheme<IndexType>&
-                  storage_scheme,
-              array<remove_complex<ValueType>>& conditioning,
-              array<precision_reduction>& block_precisions,
-              const array<IndexType>& block_pointers, array<ValueType>& blocks)
-{
-    components::fill_array(exec, blocks.get_data(), blocks.get_size(),
-                           zero<ValueType>());
-    select_generate(
-        compiled_kernels(),
-        [&](int compiled_block_size) {
-            return max_block_size <= compiled_block_size;
-        },
-        syn::value_list<int, config::min_warps_per_block>(), syn::type_list<>(),
-        exec, system_matrix, accuracy, blocks.get_data(), storage_scheme,
-        conditioning.get_data(), block_precisions.get_data(),
-        block_pointers.get_const_data(), num_blocks);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_JACOBI_GENERATE_KERNEL);
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
deleted file mode 100644
index aa8807728a8..00000000000
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
-
-
-// clang-format off
-#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@
-// clang-format on
-// make things easier for IDEs
-#ifndef GKO_JACOBI_BLOCK_SIZE
-#define GKO_JACOBI_BLOCK_SIZE 1
-#endif
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void generate(syn::value_list<int, max_block_size>,
-              std::shared_ptr<const DefaultExecutor> exec,
-              const matrix::Csr<ValueType, IndexType>* mtx,
-              remove_complex<ValueType> accuracy, ValueType* block_data,
-              const preconditioner::block_interleaved_storage_scheme<IndexType>&
-                  storage_scheme,
-              remove_complex<ValueType>* conditioning,
-              precision_reduction* block_precisions,
-              const IndexType* block_ptrs, size_type num_blocks)
-{
-    constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
-    const auto grid_size =
-        ceildiv(num_blocks, warps_per_block * blocks_per_warp);
-    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
-
-    if (grid_size > 0) {
-        if (block_precisions) {
-            kernel::adaptive_generate<max_block_size, subwarp_size,
-                                      warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
-                    mtx->get_const_col_idxs(),
-                    as_device_type(mtx->get_const_values()),
-                    as_device_type(accuracy), as_device_type(block_data),
-                    storage_scheme, as_device_type(conditioning),
-                    block_precisions, block_ptrs, num_blocks);
-        } else {
-            kernel::generate<max_block_size, subwarp_size, warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
-                    mtx->get_const_col_idxs(),
-                    as_device_type(mtx->get_const_values()),
-                    as_device_type(block_data), storage_scheme, block_ptrs,
-                    num_blocks);
-        }
-    }
-}
-
-
-#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType)          \
-    void generate<config::min_warps_per_block, GKO_JACOBI_BLOCK_SIZE,        \
-                  ValueType, IndexType>(                                     \
-        syn::value_list<int, GKO_JACOBI_BLOCK_SIZE>,                         \
-        std::shared_ptr<const DefaultExecutor> exec,                         \
-        const matrix::Csr<ValueType, IndexType>*, remove_complex<ValueType>, \
-        ValueType*,                                                          \
-        const preconditioner::block_interleaved_storage_scheme<IndexType>&,  \
-        remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
-        size_type)
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    DECLARE_JACOBI_GENERATE_INSTANTIATION);
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index e6a337da7b9..67617169b5a 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -21,9 +21,6 @@ set(GINKGO_HIP_SOURCES
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
-    preconditioner/jacobi_advanced_apply_kernels.hip.cpp
-    preconditioner/jacobi_generate_kernels.hip.cpp
-    preconditioner/jacobi_simple_apply_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
@@ -48,33 +45,15 @@ else()
     set(GKO_HIP_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE})
     list(REMOVE_DUPLICATES GKO_HIP_JACOBI_BLOCK_SIZES)
 endif()
-foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
-    configure_file(
-        preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
-        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
-    configure_file(
-        preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
-        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
-    configure_file(
-        preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
-        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
-    # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds
-    # Probably the same as https://github.com/llvm/llvm-project/issues/67574
-    # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f
-    # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789
-    set_source_files_properties(
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        PROPERTIES
-        COMPILE_OPTIONS $<$<CONFIG:Debug>:-O2>)
-    list(APPEND GINKGO_HIP_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
-endforeach()
-string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
-configure_file(preconditioner/jacobi_common.hip.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
+jacobi_generated_files(GKO_HIP_JACOBI_SOURCES "${GKO_HIP_JACOBI_BLOCK_SIZES}")
+# The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds
+# Probably the same as https://github.com/llvm/llvm-project/issues/67574
+# Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f
+# and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789
+set_source_files_properties(${GKO_HIP_JACOBI_SOURCES} PROPERTIES COMPILE_OPTIONS $<$<CONFIG:Debug>:-O2>)
+list(APPEND GINKGO_HIP_SOURCES ${GKO_HIP_JACOBI_SOURCES})
+string(REPLACE ";" "," GKO_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
+configure_file(${Ginkgo_SOURCE_DIR}/common/cuda_hip/preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp)
 
 set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP)
 add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES})
diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in
deleted file mode 100644
index 2185e124db6..00000000000
--- a/hip/preconditioner/jacobi_common.hip.hpp.in
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/synthesizer/containers.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace jacobi {
-
-
-/**
- * A compile-time list of block sizes for which dedicated generate and apply
- * kernels should be compiled.
- */
-// clang-format off
-#cmakedefine GKO_HIP_JACOBI_BLOCK_SIZES_CODE @GKO_HIP_JACOBI_BLOCK_SIZES_CODE@
-// clang-format on
-// make things easier for IDEs
-#ifndef GKO_HIP_JACOBI_BLOCK_SIZES_CODE
-#define GKO_HIP_JACOBI_BLOCK_SIZES_CODE 1
-#endif
-
-
-using compiled_kernels = syn::value_list<int, GKO_HIP_JACOBI_BLOCK_SIZES_CODE>;
-
-
-constexpr int get_larger_power(int value, int guess = 1)
-{
-    return guess >= value ? guess : get_larger_power(value, guess << 1);
-}
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
deleted file mode 100644
index 698efe6a858..00000000000
--- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/config.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
-
-
-// clang-format off
-#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@
-// clang-format on
-// make things easier for IDEs
-#ifndef GKO_JACOBI_BLOCK_SIZE
-#define GKO_JACOBI_BLOCK_SIZE 1
-#endif
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void generate(syn::value_list<int, max_block_size>,
-              std::shared_ptr<const DefaultExecutor> exec,
-              const matrix::Csr<ValueType, IndexType>* mtx,
-              remove_complex<ValueType> accuracy, ValueType* block_data,
-              const preconditioner::block_interleaved_storage_scheme<IndexType>&
-                  storage_scheme,
-              remove_complex<ValueType>* conditioning,
-              precision_reduction* block_precisions,
-              const IndexType* block_ptrs, size_type num_blocks)
-{
-    constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
-    const auto grid_size =
-        ceildiv(num_blocks, warps_per_block * blocks_per_warp);
-    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
-
-    if (grid_size > 0) {
-        if (block_precisions) {
-            kernel::adaptive_generate<max_block_size, subwarp_size,
-                                      warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
-                    mtx->get_const_col_idxs(),
-                    as_device_type(mtx->get_const_values()),
-                    as_device_type(accuracy), as_device_type(block_data),
-                    storage_scheme, as_device_type(conditioning),
-                    block_precisions, block_ptrs, num_blocks);
-        } else {
-            kernel::generate<max_block_size, subwarp_size, warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    mtx->get_size()[0], mtx->get_const_row_ptrs(),
-                    mtx->get_const_col_idxs(),
-                    as_device_type(mtx->get_const_values()),
-                    as_device_type(block_data), storage_scheme, block_ptrs,
-                    num_blocks);
-        }
-    }
-}
-
-
-#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType)          \
-    void generate<config::min_warps_per_block, GKO_JACOBI_BLOCK_SIZE,        \
-                  ValueType, IndexType>(                                     \
-        syn::value_list<int, GKO_JACOBI_BLOCK_SIZE>,                         \
-        std::shared_ptr<const DefaultExecutor> exec,                         \
-        const matrix::Csr<ValueType, IndexType>*, remove_complex<ValueType>, \
-        ValueType*,                                                          \
-        const preconditioner::block_interleaved_storage_scheme<IndexType>&,  \
-        remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
-        size_type)
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    DECLARE_JACOBI_GENERATE_INSTANTIATION);
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
deleted file mode 100644
index 16ca805a42c..00000000000
--- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void apply(syn::value_list<int, max_block_size>,
-           std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
-           const precision_reduction* block_precisions,
-           const IndexType* block_pointers, const ValueType* blocks,
-           const preconditioner::block_interleaved_storage_scheme<IndexType>&
-               storage_scheme,
-           const ValueType* b, size_type b_stride, ValueType* x,
-           size_type x_stride);
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply);
-
-
-template <typename ValueType, typename IndexType>
-void simple_apply(
-    std::shared_ptr<const HipExecutor> exec, size_type num_blocks,
-    uint32 max_block_size,
-    const preconditioner::block_interleaved_storage_scheme<IndexType>&
-        storage_scheme,
-    const array<precision_reduction>& block_precisions,
-    const array<IndexType>& block_pointers, const array<ValueType>& blocks,
-    const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
-{
-    // TODO: write a special kernel for multiple RHS
-    for (size_type col = 0; col < b->get_size()[1]; ++col) {
-        select_apply(
-            compiled_kernels(),
-            [&](int compiled_block_size) {
-                return max_block_size <= compiled_block_size;
-            },
-            syn::value_list<int, config::min_warps_per_block>(),
-            syn::type_list<>(), exec, num_blocks,
-            block_precisions.get_const_data(), block_pointers.get_const_data(),
-            blocks.get_const_data(), storage_scheme,
-            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
-            x->get_stride());
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
deleted file mode 100644
index d666a698b5e..00000000000
--- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/preconditioner/jacobi_kernels.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-// generated header
-#include "common/cuda_hip/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
-
-
-// clang-format off
-#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@
-// clang-format on
-// make things easier for IDEs
-#ifndef GKO_JACOBI_BLOCK_SIZE
-#define GKO_JACOBI_BLOCK_SIZE 1
-#endif
-
-
-template <int warps_per_block, int max_block_size, typename ValueType,
-          typename IndexType>
-void apply(syn::value_list<int, max_block_size>,
-           std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
-           const precision_reduction* block_precisions,
-           const IndexType* block_pointers, const ValueType* blocks,
-           const preconditioner::block_interleaved_storage_scheme<IndexType>&
-               storage_scheme,
-           const ValueType* b, size_type b_stride, ValueType* x,
-           size_type x_stride)
-{
-    constexpr int subwarp_size = get_larger_power(max_block_size);
-    constexpr int blocks_per_warp = config::warp_size / subwarp_size;
-    const auto grid_size =
-        ceildiv(num_blocks, warps_per_block * blocks_per_warp);
-    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
-
-    if (grid_size > 0) {
-        if (block_precisions) {
-            kernel::adaptive_apply<max_block_size, subwarp_size,
-                                   warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    as_device_type(blocks), storage_scheme, block_precisions,
-                    block_pointers, num_blocks, as_device_type(b), b_stride,
-                    as_device_type(x), x_stride);
-        } else {
-            kernel::apply<max_block_size, subwarp_size, warps_per_block>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    as_device_type(blocks), storage_scheme, block_pointers,
-                    num_blocks, as_device_type(b), b_stride, as_device_type(x),
-                    x_stride);
-        }
-    }
-}
-
-
-#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType)       \
-    void apply<config::min_warps_per_block, GKO_JACOBI_BLOCK_SIZE, ValueType, \
-               IndexType>(                                                    \
-        syn::value_list<int, GKO_JACOBI_BLOCK_SIZE>,                          \
-        std::shared_ptr<const DefaultExecutor> exec, size_type,               \
-        const precision_reduction*, const IndexType*, const ValueType*,       \
-        const preconditioner::block_interleaved_storage_scheme<IndexType>&,   \
-        const ValueType*, size_type, ValueType*, size_type)
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION);
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From 068dc49a7ce35e43ddc0a6b2fc235381a4bbcc88 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 13 Jul 2024 00:35:04 +0200
Subject: [PATCH 091/448] prepare Csr kernels for CUDA/HIP unification

---
 cuda/matrix/csr_kernels.template.cu     |  8 +++---
 hip/matrix/csr_kernels.template.hip.cpp | 33 +++++++++++++------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index c8d193e09af..89e5de9c303 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -27,6 +27,7 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -54,7 +55,7 @@
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Compressed sparse row matrix format namespace.
  *
@@ -224,6 +225,7 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
 {
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+
     const auto nwarps = exec->get_num_warps_per_sm() *
                         exec->get_num_multiprocessor() *
                         classical_oversubscription;
@@ -488,7 +490,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
                            a->get_strategy())) {
                 max_length_per_row = strategy->get_max_length_per_row();
             } else {
-                // as a fall-back: use average row length
+                // as a fall-back: use average row length, at least 1
                 max_length_per_row = a->get_num_stored_elements() /
                                      std::max<size_type>(a->get_size()[0], 1);
             }
@@ -995,6 +997,6 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
 
 
 }  // namespace csr
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 473361029c8..1fb086c5ea6 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -33,6 +33,7 @@
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/merging.hpp"
 #include "common/cuda_hip/components/prefix_sum.hpp"
@@ -54,7 +55,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Compressed sparse row matrix format namespace.
  *
@@ -93,7 +94,7 @@ namespace {
 template <int items_per_thread, typename MatrixValueType,
           typename InputValueType, typename OutputValueType, typename IndexType>
 void merge_path_spmv(syn::value_list<int, items_per_thread>,
-                     std::shared_ptr<const HipExecutor> exec,
+                     std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Csr<MatrixValueType, IndexType>* a,
                      const matrix::Dense<InputValueType>* b,
                      matrix::Dense<OutputValueType>* c,
@@ -174,7 +175,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
 
 
 template <typename ValueType, typename IndexType>
-int compute_items_per_thread(std::shared_ptr<const HipExecutor> exec)
+int compute_items_per_thread(std::shared_ptr<const DefaultExecutor> exec)
 {
 #if GINKGO_HIP_PLATFORM_NVCC
 
@@ -231,7 +232,7 @@ int compute_items_per_thread(std::shared_ptr<const HipExecutor> exec)
 template <int subwarp_size, typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
 void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const HipExecutor> exec,
+                    std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Csr<MatrixValueType, IndexType>* a,
                     const matrix::Dense<InputValueType>* b,
                     matrix::Dense<OutputValueType>* c,
@@ -285,7 +286,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const HipExecutor> exec,
+void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<MatrixValueType, IndexType>* a,
                        const matrix::Dense<InputValueType>* b,
                        matrix::Dense<OutputValueType>* c,
@@ -336,7 +337,7 @@ void load_balance_spmv(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-bool try_general_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
+bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                 const ValueType* alpha,
                                 const matrix::Csr<ValueType, IndexType>* a,
                                 const matrix::Dense<ValueType>* b,
@@ -371,7 +372,7 @@ template <typename MatrixValueType, typename InputValueType,
           typename = std::enable_if_t<
               !std::is_same<MatrixValueType, InputValueType>::value ||
               !std::is_same<MatrixValueType, OutputValueType>::value>>
-bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                         const matrix::Csr<MatrixValueType, IndexType>* a,
                         const matrix::Dense<InputValueType>* b,
                         matrix::Dense<OutputValueType>* c,
@@ -383,7 +384,7 @@ bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
 }
 
 template <typename ValueType, typename IndexType>
-bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                         const matrix::Csr<ValueType, IndexType>* a,
                         const matrix::Dense<ValueType>* b,
                         matrix::Dense<ValueType>* c,
@@ -409,7 +410,7 @@ bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Csr<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -466,7 +467,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::Csr<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -527,7 +528,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void spgemm(std::shared_ptr<const HipExecutor> exec,
+void spgemm(std::shared_ptr<const DefaultExecutor> exec,
             const matrix::Csr<ValueType, IndexType>* a,
             const matrix::Csr<ValueType, IndexType>* b,
             matrix::Csr<ValueType, IndexType>* c)
@@ -600,7 +601,7 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
+void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Dense<ValueType>* alpha,
                      const matrix::Csr<ValueType, IndexType>* a,
                      const matrix::Csr<ValueType, IndexType>* b,
@@ -691,7 +692,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const HipExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Csr<ValueType, IndexType>* orig,
                matrix::Csr<ValueType, IndexType>* trans)
 {
@@ -715,7 +716,7 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const HipExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Csr<ValueType, IndexType>* orig,
                     matrix::Csr<ValueType, IndexType>* trans)
 {
@@ -747,7 +748,7 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
+void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
     if (sparselib::is_supported<ValueType, IndexType>::value) {
@@ -792,6 +793,6 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
 
 
 }  // namespace csr
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko

From ed2d73c01a4a9f03995f390aeb01180074c75e94 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 14 Jul 2024 22:43:15 +0200
Subject: [PATCH 092/448] unify Csr, remove CUDA 10.x ifdefs

---
 .../matrix/csr_kernels.instantiate.cpp        |    6 +-
 ...rnels.hpp.inc => csr_kernels.template.cpp} | 1038 +++++++++++++++++
 cuda/CMakeLists.txt                           |    4 +-
 cuda/matrix/csr_kernels.instantiate.cu        |   81 --
 cuda/matrix/csr_kernels.template.cu           | 1002 ----------------
 hip/CMakeLists.txt                            |    2 +-
 hip/matrix/csr_kernels.template.hip.cpp       |  798 -------------
 7 files changed, 1044 insertions(+), 1887 deletions(-)
 rename hip/matrix/csr_kernels.instantiate.hip.cpp => common/cuda_hip/matrix/csr_kernels.instantiate.cpp (97%)
 rename common/cuda_hip/matrix/{csr_kernels.hpp.inc => csr_kernels.template.cpp} (62%)
 delete mode 100644 cuda/matrix/csr_kernels.instantiate.cu
 delete mode 100644 cuda/matrix/csr_kernels.template.cu
 delete mode 100644 hip/matrix/csr_kernels.template.hip.cpp

diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
similarity index 97%
rename from hip/matrix/csr_kernels.instantiate.hip.cpp
rename to common/cuda_hip/matrix/csr_kernels.instantiate.cpp
index 53a5a572aea..f62ca1c1815 100644
--- a/hip/matrix/csr_kernels.instantiate.hip.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
@@ -2,12 +2,12 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "hip/matrix/csr_kernels.template.hip.cpp"
+#include "common/cuda_hip/matrix/csr_kernels.template.cpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Compressed sparse row matrix format namespace.
  *
@@ -124,6 +124,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace csr
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.template.cpp
similarity index 62%
rename from common/cuda_hip/matrix/csr_kernels.hpp.inc
rename to common/cuda_hip/matrix/csr_kernels.template.cpp
index 85b98f15825..eda0e856b07 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2,6 +2,88 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/matrix/csr_kernels.hpp"
+
+#include <algorithm>
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/array_access.hpp"
+#include "core/base/mixed_precision_types.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_accessor_helper.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_lookup.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Compressed sparse row matrix format namespace.
+ *
+ * @ingroup csr
+ */
+namespace csr {
+
+
+constexpr int default_block_size = 512;
+constexpr int warps_in_block = 4;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+constexpr int classical_oversubscription = 32;
+
+
+/**
+ * A compile-time list of the number items per threads for which spmv kernel
+ * should be compiled.
+ */
+using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
+
+using classical_kernels =
+    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
+
+using spgeam_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 namespace kernel {
 
 
@@ -1779,3 +1861,959 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
         as_device_type(mtx->get_values()));
 }
+
+
+namespace host_kernel {
+namespace {
+
+// Merge-path SpMV: launches kernel + reduction once per column of b.
+template <int items_per_thread, typename MatrixValueType,
+          typename InputValueType, typename OutputValueType, typename IndexType>
+void merge_path_spmv(syn::value_list<int, items_per_thread>,
+                     std::shared_ptr<const DefaultExecutor> exec,
+                     const matrix::Csr<MatrixValueType, IndexType>* a,
+                     const matrix::Dense<InputValueType>* b,
+                     matrix::Dense<OutputValueType>* c,
+                     const matrix::Dense<MatrixValueType>* alpha = nullptr,
+                     const matrix::Dense<OutputValueType>* beta = nullptr)
+{
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+    const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
+    const IndexType grid_num =
+        ceildiv(total, spmv_block_size * items_per_thread);
+    const auto grid = grid_num;  // blocks needed to cover rows + nonzeros
+    const auto block = spmv_block_size;
+    // TODO: workspace? (both arrays are allocated anew on every call)
+    array<IndexType> row_out(exec, grid_num);  // one entry per thread block
+    // TODO: should we store the value in arithmetic_type or output_type?
+    array<arithmetic_type> val_out(exec, grid_num);
+
+    const auto a_vals =
+        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
+
+    for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) {
+        const auto column_span =
+            acc::index_span(static_cast<acc::size_type>(column_id),
+                            static_cast<acc::size_type>(column_id + 1));
+        const auto b_vals =
+            acc::helper::build_const_rrm_accessor<arithmetic_type>(b,
+                                                                   column_span);
+        auto c_vals =
+            acc::helper::build_rrm_accessor<arithmetic_type>(c, column_span);
+        if (alpha == nullptr && beta == nullptr) {  // no scaling factors given
+            if (grid_num > 0) {  // skip the launch for an empty grid
+                kernel::abstract_merge_path_spmv<items_per_thread>
+                    <<<grid, block, 0, exec->get_stream()>>>(
+                        static_cast<IndexType>(a->get_size()[0]),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        as_device_type(a->get_const_row_ptrs()),
+                        as_device_type(a->get_const_srow()),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals),
+                        as_device_type(row_out.get_data()),
+                        as_device_type(val_out.get_data()));
+            }
+            kernel::
+                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
+                    grid_num, as_device_type(val_out.get_data()),
+                    as_device_type(row_out.get_data()),
+                    acc::as_device_range(c_vals));
+
+        } else if (alpha != nullptr && beta != nullptr) {  // both factors given
+            if (grid_num > 0) {  // skip the launch for an empty grid
+                kernel::abstract_merge_path_spmv<items_per_thread>
+                    <<<grid, block, 0, exec->get_stream()>>>(
+                        static_cast<IndexType>(a->get_size()[0]),
+                        as_device_type(alpha->get_const_values()),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        as_device_type(a->get_const_row_ptrs()),
+                        as_device_type(a->get_const_srow()),
+                        acc::as_device_range(b_vals),
+                        as_device_type(beta->get_const_values()),
+                        acc::as_device_range(c_vals),
+                        as_device_type(row_out.get_data()),
+                        as_device_type(val_out.get_data()));
+            }
+            kernel::
+                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
+                    grid_num, as_device_type(val_out.get_data()),
+                    as_device_type(row_out.get_data()),
+                    as_device_type(alpha->get_const_values()),
+                    acc::as_device_range(c_vals));
+        } else {  // exactly one of alpha/beta was passed: unsupported
+            GKO_KERNEL_NOT_FOUND;
+        }
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
+
+// Picks the items-per-thread value used to select a merge-path kernel.
+template <typename ValueType, typename IndexType>
+int compute_items_per_thread(std::shared_ptr<const DefaultExecutor> exec)
+{
+#if defined(GKO_COMPILING_CUDA) || GINKGO_HIP_PLATFORM_NVCC
+
+
+    const int version =
+        (exec->get_major_version() << 4) + exec->get_minor_version();
+    // num_item is chosen to reach 100% occupancy.
+    // TODO: Extend this list when a new GPU is released
+    //       Tune this parameter
+    // With 128 threads/block, the number of items per thread is:
+    // 3.0, 3.5: 6
+    // 3.7: 14
+    // 5.0, 5.3, 6.0, 6.2: 8
+    // 5.2, 6.1, 7.0: 12
+    int num_item = 6;  // default for every version not listed below
+    switch (version) {  // major in the high nibble, e.g. 7.0 -> 0x70
+    case 0x50:
+    case 0x53:
+    case 0x60:
+    case 0x62:
+        num_item = 8;
+        break;
+    case 0x52:
+    case 0x61:
+    case 0x70:
+        num_item = 12;
+        break;
+    case 0x37:
+        num_item = 14;  // last case, so no break is needed
+    }
+
+
+#else
+
+
+    // HIP uses the minimal num_item to make the code work correctly.
+    // TODO: this parameter should be tuned.
+    int num_item = 6;
+
+
+#endif  // defined(GKO_COMPILING_CUDA) || GINKGO_HIP_PLATFORM_NVCC
+
+
+    // Ensure that the following is satisfied:
+    // sizeof(IndexType) + sizeof(ValueType)
+    // <= items_per_thread * sizeof(IndexType)
+    constexpr int minimal_num =
+        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
+    int items_per_thread = num_item * 4 / sizeof(IndexType);  // 4-byte baseline
+    return std::max(minimal_num, items_per_thread);
+}
+
+// Classical SpMV: a single 2D launch covering rows (x) and columns of b (y).
+template <int subwarp_size, typename MatrixValueType, typename InputValueType,
+          typename OutputValueType, typename IndexType>
+void classical_spmv(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<MatrixValueType, IndexType>* a,
+                    const matrix::Dense<InputValueType>* b,
+                    matrix::Dense<OutputValueType>* c,
+                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
+                    const matrix::Dense<OutputValueType>* beta = nullptr)
+{
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+
+    const auto nwarps = exec->get_num_warps_per_sm() *
+                        exec->get_num_multiprocessor() *
+                        classical_oversubscription;  // device-wide warp budget
+    const auto gridx =
+        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
+                 int64(nwarps / warps_in_block));  // cap: budget in blocks
+    const dim3 grid(gridx, b->get_size()[1]);  // y: one per column of b
+    const auto block = spmv_block_size;
+
+    const auto a_vals =
+        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
+    const auto b_vals =
+        acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
+    auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
+    if (alpha == nullptr && beta == nullptr) {  // no scaling factors given
+        if (grid.x > 0 && grid.y > 0) {  // skip the launch for an empty grid
+            kernel::abstract_classical_spmv<subwarp_size>
+                <<<grid, block, 0, exec->get_stream()>>>(
+                    a->get_size()[0], acc::as_device_range(a_vals),
+                    a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_ptrs()),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
+        }
+    } else if (alpha != nullptr && beta != nullptr) {  // both factors given
+        if (grid.x > 0 && grid.y > 0) {  // skip the launch for an empty grid
+            kernel::abstract_classical_spmv<subwarp_size>
+                <<<grid, block, 0, exec->get_stream()>>>(
+                    a->get_size()[0], as_device_type(alpha->get_const_values()),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_ptrs()),
+                    acc::as_device_range(b_vals),
+                    as_device_type(beta->get_const_values()),
+                    acc::as_device_range(c_vals));
+        }
+    } else {  // exactly one of alpha/beta was passed: unsupported
+        GKO_KERNEL_NOT_FOUND;
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
+
+// Load-balance SpMV: scales or zero-fills c, then launches abstract_spmv.
+template <typename MatrixValueType, typename InputValueType,
+          typename OutputValueType, typename IndexType>
+void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<MatrixValueType, IndexType>* a,
+                       const matrix::Dense<InputValueType>* b,
+                       matrix::Dense<OutputValueType>* c,
+                       const matrix::Dense<MatrixValueType>* alpha = nullptr,
+                       const matrix::Dense<OutputValueType>* beta = nullptr)
+{
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+
+    if (beta) {  // beta is handled here; it is not passed to the kernel
+        dense::scale(exec, beta, c);
+    } else {
+        dense::fill(exec, c, zero<OutputValueType>());
+    }
+    const IndexType nwarps = a->get_num_srow_elements();
+    if (nwarps > 0) {  // no srow entries: c keeps the value set above
+        const dim3 csr_block(config::warp_size, warps_in_block, 1);
+        const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]);
+        const auto a_vals =
+            acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
+        const auto b_vals =
+            acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
+        auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
+        if (alpha) {  // kernel overload taking the alpha factor
+            if (csr_grid.x > 0 && csr_grid.y > 0) {
+                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
+                                        exec->get_stream()>>>(
+                    nwarps, static_cast<IndexType>(a->get_size()[0]),
+                    as_device_type(alpha->get_const_values()),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_ptrs()),
+                    as_device_type(a->get_const_srow()),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
+            }
+        } else {  // kernel overload without alpha
+            if (csr_grid.x > 0 && csr_grid.y > 0) {
+                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
+                                        exec->get_stream()>>>(
+                    nwarps, static_cast<IndexType>(a->get_size()[0]),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_ptrs()),
+                    as_device_type(a->get_const_srow()),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
+            }
+        }
+    }
+}
+
+// Runs SpMV through the vendor sparse library; returns false if it declined.
+template <typename ValueType, typename IndexType>
+bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                                const ValueType* alpha,
+                                const matrix::Csr<ValueType, IndexType>* a,
+                                const matrix::Dense<ValueType>* b,
+                                const ValueType* beta,
+                                matrix::Dense<ValueType>* c)
+{
+#ifdef GKO_COMPILING_HIP
+    bool try_sparselib = sparselib::is_supported<ValueType, IndexType>::value;
+    try_sparselib =
+        try_sparselib && b->get_stride() == 1 && c->get_stride() == 1;
+    // rocSPARSE has issues with zero matrices
+    try_sparselib = try_sparselib && a->get_num_stored_elements() > 0;
+    if (try_sparselib) {
+        auto descr = sparselib::create_mat_descr();
+
+        auto row_ptrs = a->get_const_row_ptrs();
+        auto col_idxs = a->get_const_col_idxs();
+
+        sparselib::spmv(exec->get_sparselib_handle(),
+                        SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+                        a->get_size()[1], a->get_num_stored_elements(), alpha,
+                        descr, a->get_const_values(), row_ptrs, col_idxs,
+                        b->get_const_values(), beta, c->get_values());
+
+        sparselib::destroy(descr);
+    }
+    return try_sparselib;  // false: nothing was computed, c is untouched
+#else  // GKO_COMPILING_CUDA
+    auto handle = exec->get_sparselib_handle();
+    // workaround for a division by zero in cuSPARSE 11.?
+    if (a->get_size()[1] == 0) {
+        return false;  // no descriptor exists yet, nothing to release
+    }
+    cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE;
+    auto row_ptrs = const_cast<IndexType*>(a->get_const_row_ptrs());
+    auto col_idxs = const_cast<IndexType*>(a->get_const_col_idxs());
+    auto values = const_cast<ValueType*>(a->get_const_values());
+    auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1],
+                                     a->get_num_stored_elements(), row_ptrs,
+                                     col_idxs, values);
+    auto b_val = const_cast<ValueType*>(b->get_const_values());
+    auto c_val = c->get_values();
+    if (b->get_stride() == 1 && c->get_stride() == 1) {  // SpMV path
+        auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val);
+        auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val);
+#if CUDA_VERSION >= 11021
+        constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1;
+#else
+        constexpr auto alg = CUSPARSE_CSRMV_ALG1;
+#endif
+        size_type buffer_size = 0;
+        sparselib::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
+                                              beta, vecc, alg, &buffer_size);
+
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+        sparselib::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
+                                   alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
+    } else {  // strided operands: SpMM path on dense matrix descriptors
+#if CUDA_VERSION >= 11060
+        if (b->get_size()[1] == 1) {
+            // cusparseSpMM (11.6) treats one strided vector as column-major
+            sparselib::destroy(mat);  // do not leak the CSR descriptor
+            return false;
+        }
+#endif  // CUDA_VERSION >= 11060
+        cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2;
+        auto vecb =
+            sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val);
+        auto vecc =
+            sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val);
+        size_type buffer_size = 0;
+        sparselib::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
+                                              vecb, beta, vecc, alg,
+                                              &buffer_size);
+
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+        sparselib::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
+                                   vecc, alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
+    }
+    sparselib::destroy(mat);
+    return true;
+#endif  // GKO_COMPILING_CUDA
+}
+
+// Mixed-precision overload: the sparse library path is not available.
+template <typename MatrixValueType, typename InputValueType,
+          typename OutputValueType, typename IndexType,
+          typename = std::enable_if_t<
+              !std::is_same<MatrixValueType, InputValueType>::value ||
+              !std::is_same<MatrixValueType, OutputValueType>::value>>
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Csr<MatrixValueType, IndexType>* a,
+                        const matrix::Dense<InputValueType>* b,
+                        matrix::Dense<OutputValueType>* c,
+                        const matrix::Dense<MatrixValueType>* alpha = nullptr,
+                        const matrix::Dense<OutputValueType>* beta = nullptr)
+{
+    // TODO: support sparselib mixed precision; always declines for now
+    return false;
+}
+
+template <typename ValueType, typename IndexType>
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Csr<ValueType, IndexType>* a,
+                        const matrix::Dense<ValueType>* b,
+                        matrix::Dense<ValueType>* c,
+                        const matrix::Dense<ValueType>* alpha = nullptr,
+                        const matrix::Dense<ValueType>* beta = nullptr)
+{
+    if (alpha) {  // beta is dereferenced below, so it must be set as well
+        return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
+                                          beta->get_const_values(), c);
+    } else {  // no factors given: use local constants one and zero
+        auto handle = exec->get_sparselib_handle();
+        // NOTE(review): presumably lets the library read host scalars - confirm
+        sparselib::pointer_mode_guard pm_guard(handle);
+        const auto valpha = one<ValueType>();
+        const auto vbeta = zero<ValueType>();
+        return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
+    }
+}
+
+
+}  // anonymous namespace
+}  // namespace host_kernel
+
+// Computes c from a and b, dispatching on the name of a's strategy.
+template <typename MatrixValueType, typename InputValueType,
+          typename OutputValueType, typename IndexType>
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Csr<MatrixValueType, IndexType>* a,
+          const matrix::Dense<InputValueType>* b,
+          matrix::Dense<OutputValueType>* c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+    } else if (a->get_strategy()->get_name() == "load_balance") {
+        host_kernel::load_balance_spmv(exec, a, b, c);
+    } else if (a->get_strategy()->get_name() == "merge_path") {
+        using arithmetic_type =
+            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
+                exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;  // exact match only
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else {  // every other strategy name ends up here
+        bool use_classical = true;  // cleared only if the sparse library ran
+        if (a->get_strategy()->get_name() == "sparselib" ||
+            a->get_strategy()->get_name() == "cusparse") {
+            use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c);
+        }
+        if (use_classical) {
+            IndexType max_length_per_row = 0;
+            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
+            if (auto strategy =
+                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                        a->get_strategy())) {
+                max_length_per_row = strategy->get_max_length_per_row();
+            } else if (auto strategy = std::dynamic_pointer_cast<
+                           const typename Tcsr::automatical>(
+                           a->get_strategy())) {
+                max_length_per_row = strategy->get_max_length_per_row();
+            } else {
+                // fall-back: average row length (clamped to >= 1 below)
+                max_length_per_row = a->get_num_stored_elements() /
+                                     std::max<size_type>(a->get_size()[0], 1);
+            }
+            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
+            host_kernel::select_classical_spmv(
+                classical_kernels(),
+                [&max_length_per_row](int compiled_info) {
+                    return max_length_per_row >= compiled_info;
+                },
+                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+        }
+    }
+}
+
+// Like spmv, but forwards alpha and beta to the selected implementation.
+template <typename MatrixValueType, typename InputValueType,
+          typename OutputValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Dense<MatrixValueType>* alpha,
+                   const matrix::Csr<MatrixValueType, IndexType>* a,
+                   const matrix::Dense<InputValueType>* b,
+                   const matrix::Dense<OutputValueType>* beta,
+                   matrix::Dense<OutputValueType>* c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+    } else if (a->get_strategy()->get_name() == "load_balance") {
+        host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
+    } else if (a->get_strategy()->get_name() == "merge_path") {
+        using arithmetic_type =
+            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
+                exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;  // exact match only
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
+    } else {  // every other strategy name ends up here
+        bool use_classical = true;  // cleared only if the sparse library ran
+        if (a->get_strategy()->get_name() == "sparselib" ||
+            a->get_strategy()->get_name() == "cusparse") {
+            use_classical =
+                !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta);
+        }
+        if (use_classical) {
+            IndexType max_length_per_row = 0;
+            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
+            if (auto strategy =
+                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                        a->get_strategy())) {
+                max_length_per_row = strategy->get_max_length_per_row();
+            } else if (auto strategy = std::dynamic_pointer_cast<
+                           const typename Tcsr::automatical>(
+                           a->get_strategy())) {
+                max_length_per_row = strategy->get_max_length_per_row();
+            } else {
+                // fall-back: average row length (clamped to >= 1 below)
+                max_length_per_row = a->get_num_stored_elements() /
+                                     std::max<size_type>(a->get_size()[0], 1);
+            }
+            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
+            host_kernel::select_classical_spmv(
+                classical_kernels(),
+                [&max_length_per_row](int compiled_info) {
+                    return max_length_per_row >= compiled_info;
+                },
+                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c,
+                alpha, beta);
+        }
+    }
+}
+
+// Computes the sparse product of a and b into c using the sparse library.
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Csr<ValueType, IndexType>* a,
+            const matrix::Csr<ValueType, IndexType>* b,
+            matrix::Csr<ValueType, IndexType>* c)
+{
+#ifdef GKO_COMPILING_HIP
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
+
+        auto alpha = one<ValueType>();
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto null_value = static_cast<ValueType*>(nullptr);
+        auto null_index = static_cast<IndexType*>(nullptr);
+        auto zero_nnz = IndexType{};
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+        auto c_row_ptrs = c->get_row_ptrs();
+        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+        auto& c_col_idxs_array = c_builder.get_col_idx_array();
+        auto& c_vals_array = c_builder.get_value_array();
+
+        // allocate buffer (the d operand is passed as null with zero nnz)
+        size_type buffer_size{};
+        sparselib::spgemm_buffer_size(
+            handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            zero_nnz, null_index, null_index, info, buffer_size);
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz (writes c's row pointers and the total into c_nnz)
+        IndexType c_nnz{};
+        sparselib::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
+            null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros (after resizing c's arrays to c_nnz entries)
+        c_col_idxs_array.resize_and_reset(c_nnz);
+        c_vals_array.resize_and_reset(c_nnz);
+        auto c_col_idxs = c_col_idxs_array.get_data();
+        auto c_vals = c_vals_array.get_data();
+        sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                          null_value, null_index, null_index, c_descr, c_vals,
+                          c_row_ptrs, c_col_idxs, info, buffer);
+
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
+    } else {  // value/index type combination not supported by the library
+        GKO_NOT_IMPLEMENTED;
+    }
+#else   // GKO_COMPILING_CUDA
+    auto a_vals = a->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto b_vals = b->get_const_values();
+    auto b_row_ptrs = b->get_const_row_ptrs();
+    auto b_col_idxs = b->get_const_col_idxs();
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
+
+    auto alpha = one<ValueType>();
+    auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+    auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+    auto null_value = static_cast<ValueType*>(nullptr);
+    auto null_index = static_cast<IndexType*>(nullptr);
+    auto zero_nnz = IndexType{};
+    auto m = IndexType(a->get_size()[0]);
+    auto n = IndexType(b->get_size()[1]);
+    auto k = IndexType(a->get_size()[1]);
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto& c_col_idxs_array = c_builder.get_col_idx_array();
+    auto& c_vals_array = c_builder.get_value_array();
+
+    const auto beta = zero<ValueType>();
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
+        m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
+        const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
+    auto b_descr = sparselib::create_csr(
+        k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
+        const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
+
+    // estimate work (called with nullptr first, then with buffer1)
+    size_type buffer1_size{};
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      nullptr);
+    array<char> buffer1{exec, buffer1_size};
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      buffer1.get_data());
+
+    // compute spgemm (called with nullptr first, then with buffer2)
+    size_type buffer2_size{};
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              nullptr);
+    array<char> buffer2{exec, buffer2_size};
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              buffer2.get_data());
+
+    // copy data to result (c's arrays are resized to the reported nnz first)
+    auto c_nnz = sparselib::sparse_matrix_nnz(c_descr);
+    c_col_idxs_array.resize_and_reset(c_nnz);
+    c_vals_array.resize_and_reset(c_nnz);
+    sparselib::csr_set_pointers(c_descr, c_row_ptrs,
+                                c_col_idxs_array.get_data(),
+                                c_vals_array.get_data());
+
+    sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                           spgemm_descr);
+
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
+#endif  // GKO_COMPILING_CUDA
+}
+
+
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
+                     const matrix::Dense<ValueType>* alpha,
+                     const matrix::Csr<ValueType, IndexType>* a,
+                     const matrix::Csr<ValueType, IndexType>* b,
+                     const matrix::Dense<ValueType>* beta,
+                     const matrix::Csr<ValueType, IndexType>* d,
+                     matrix::Csr<ValueType, IndexType>* c)
+{
+#ifdef GKO_COMPILING_HIP
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
+
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto d_vals = d->get_const_values();
+        auto d_row_ptrs = d->get_const_row_ptrs();
+        auto d_col_idxs = d->get_const_col_idxs();
+        auto null_value = static_cast<ValueType*>(nullptr);
+        auto null_index = static_cast<IndexType*>(nullptr);
+        auto one_value = one<ValueType>();
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+
+        // allocate buffer
+        size_type buffer_size{};
+        sparselib::spgemm_buffer_size(
+            handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            IndexType{}, null_index, null_index, info, buffer_size);
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz
+        array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+        auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
+        IndexType c_nnz{};
+        sparselib::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
+            null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros for A * B
+        array<IndexType> c_tmp_col_idxs_array(exec, c_nnz);
+        array<ValueType> c_tmp_vals_array(exec, c_nnz);
+        auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
+        auto c_tmp_vals = c_tmp_vals_array.get_data();
+        sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr,
+                          IndexType{}, null_value, null_index, null_index,
+                          c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs,
+                          info, buffer);
+
+        // destroy hipsparse context
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
+
+        auto total_nnz = c_nnz + d->get_num_stored_elements();
+        auto nnz_per_row = total_nnz / m;
+        select_spgeam(
+            spgeam_kernels(),
+            [&](int compiled_subwarp_size) {
+                return compiled_subwarp_size >= nnz_per_row ||
+                       compiled_subwarp_size == config::warp_size;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec,
+            alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs,
+            c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs,
+            d_vals, c);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+#else   // GKO_COMPILING_CUDA
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
+
+    auto valpha = exec->copy_val_to_host(alpha->get_const_values());
+    auto a_nnz = IndexType(a->get_num_stored_elements());
+    auto a_vals = a->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto b_nnz = IndexType(b->get_num_stored_elements());
+    auto b_vals = b->get_const_values();
+    auto b_row_ptrs = b->get_const_row_ptrs();
+    auto b_col_idxs = b->get_const_col_idxs();
+    auto vbeta = exec->copy_val_to_host(beta->get_const_values());
+    auto d_nnz = IndexType(d->get_num_stored_elements());
+    auto d_vals = d->get_const_values();
+    auto d_row_ptrs = d->get_const_row_ptrs();
+    auto d_col_idxs = d->get_const_col_idxs();
+    auto m = IndexType(a->get_size()[0]);
+    auto n = IndexType(b->get_size()[1]);
+    auto k = IndexType(a->get_size()[1]);
+    auto c_row_ptrs = c->get_row_ptrs();
+
+    auto null_value = static_cast<ValueType*>(nullptr);
+    auto null_index = static_cast<IndexType*>(nullptr);
+    auto one_val = one<ValueType>();
+    auto zero_val = zero<ValueType>();
+    auto zero_nnz = IndexType{};
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
+        m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
+        const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
+    auto b_descr = sparselib::create_csr(
+        k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
+        const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
+
+    // estimate work
+    size_type buffer1_size{};
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, nullptr);
+    array<char> buffer1{exec, buffer1_size};
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, buffer1.get_data());
+
+    // compute spgemm
+    size_type buffer2_size{};
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, nullptr);
+    array<char> buffer2{exec, buffer2_size};
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, buffer2.get_data());
+
+    // write result to temporary storage
+    auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr);
+    array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+    array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
+    array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
+    sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
+                                c_tmp_col_idxs_array.get_data(),
+                                c_tmp_vals_array.get_data());
+
+    sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
+                           c_descr, spgemm_descr);
+
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
+
+    auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
+    auto nnz_per_row = spgeam_total_nnz / m;
+    select_spgeam(
+        spgeam_kernels(),
+        [&](int compiled_subwarp_size) {
+            return compiled_subwarp_size >= nnz_per_row ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec,
+        alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(),
+        c_tmp_col_idxs_array.get_const_data(),
+        c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs,
+        d_col_idxs, d_vals, c);
+#endif  // GKO_COMPILING_CUDA
+}
+
+
+/**
+ * Transposes a CSR matrix on the device.
+ *
+ * When the vendor sparse library supports the value/index type combination,
+ * its transpose binding is used (in the CUDA build this is configured as a
+ * CSR-to-CSC conversion with algorithm CUSPARSE_CSR2CSC_ALG1 and needs a
+ * temporary buffer allocated on exec). All other type combinations are
+ * forwarded to fallback_transpose.
+ * A matrix without rows is left untouched.
+ *
+ * @param exec  the executor providing the sparse library handle and the
+ *              memory for the temporary buffer
+ * @param orig  the matrix to transpose
+ * @param trans  the output matrix; its values, row pointers and column
+ *               indices are written in place (presumably already sized by the
+ *               caller - this kernel performs no allocation on trans)
+ */
+template <typename ValueType, typename IndexType>
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Csr<ValueType, IndexType>* orig,
+               matrix::Csr<ValueType, IndexType>* trans)
+{
+    // nothing to do for a matrix without rows
+    if (orig->get_size()[0] == 0) {
+        return;
+    }
+    // type combinations unknown to the vendor library use our own kernels
+    if (!sparselib::is_supported<ValueType, IndexType>::value) {
+        fallback_transpose(exec, orig, trans);
+        return;
+    }
+
+    auto handle = exec->get_sparselib_handle();
+    auto num_rows = orig->get_size()[0];
+    auto num_cols = orig->get_size()[1];
+    auto num_stored = orig->get_num_stored_elements();
+#ifdef GKO_COMPILING_HIP
+    // transpose the values as well, using zero-based indices
+    hipsparseAction_t action = HIPSPARSE_ACTION_NUMERIC;
+    hipsparseIndexBase_t index_base = HIPSPARSE_INDEX_BASE_ZERO;
+
+    sparselib::transpose(handle, num_rows, num_cols, num_stored,
+                         orig->get_const_values(), orig->get_const_row_ptrs(),
+                         orig->get_const_col_idxs(), trans->get_values(),
+                         trans->get_row_ptrs(), trans->get_col_idxs(), action,
+                         index_base);
+#else   // GKO_COMPILING_CUDA
+    cudaDataType_t cuda_value_type =
+        gko::kernels::cuda::cuda_data_type<ValueType>();
+    // transpose the values as well, using zero-based indices
+    cusparseAction_t action = CUSPARSE_ACTION_NUMERIC;
+    cusparseIndexBase_t index_base = CUSPARSE_INDEX_BASE_ZERO;
+    cusparseCsr2CscAlg_t algorithm = CUSPARSE_CSR2CSC_ALG1;
+
+    // first query the workspace size, then run the actual conversion
+    size_type workspace_size = 0;
+    sparselib::transpose_buffersize(
+        handle, num_rows, num_cols, num_stored, orig->get_const_values(),
+        orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        trans->get_values(), trans->get_row_ptrs(), trans->get_col_idxs(),
+        cuda_value_type, action, index_base, algorithm, &workspace_size);
+    array<char> workspace(exec, workspace_size);
+    sparselib::transpose(
+        handle, num_rows, num_cols, num_stored, orig->get_const_values(),
+        orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        trans->get_values(), trans->get_row_ptrs(), trans->get_col_idxs(),
+        cuda_value_type, action, index_base, algorithm, workspace.get_data());
+#endif  // GKO_COMPILING_CUDA
+}
+
+
+/**
+ * Computes the conjugate transpose of a CSR matrix on the device.
+ *
+ * The matrix is first transposed with transpose(); for complex value types
+ * the stored values of the result are then conjugated in place by the
+ * conjugate kernel, launched on the stream of the executor.
+ * A matrix without rows is left untouched.
+ *
+ * @param exec  the executor to run the kernels on
+ * @param orig  the matrix to conjugate-transpose
+ * @param trans  the output matrix, overwritten with the result
+ */
+template <typename ValueType, typename IndexType>
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* orig,
+                    matrix::Csr<ValueType, IndexType>* trans)
+{
+    if (orig->get_size()[0] == 0) {
+        return;
+    }
+    // one thread per stored value of the output
+    const auto num_blocks =
+        ceildiv(trans->get_num_stored_elements(), default_block_size);
+    transpose(exec, orig, trans);
+    // real values are their own conjugate, and an empty grid must not be
+    // launched
+    if (!(num_blocks > 0) || !is_complex<ValueType>()) {
+        return;
+    }
+    kernel::conjugate<<<num_blocks, default_block_size, 0,
+                        exec->get_stream()>>>(
+        trans->get_num_stored_elements(), as_device_type(trans->get_values()));
+}
+
+
+/**
+ * Sorts the entries of every row of a CSR matrix by column index, in place.
+ *
+ * If the vendor sparse library supports the value/index type combination,
+ * the column indices are sorted with the library's csrsort, which also
+ * records the applied permutation; the values are then reordered by
+ * gathering them from a temporary copy through that permutation.
+ * Other type combinations are handled by fallback_sort.
+ *
+ * @param exec  the executor providing the sparse library handle and the
+ *              memory for all temporary arrays
+ * @param to_sort  the matrix whose column indices and values are reordered
+ */
+template <typename ValueType, typename IndexType>
+void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
+                          matrix::Csr<ValueType, IndexType>* to_sort)
+{
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
+        auto m = IndexType(to_sort->get_size()[0]);
+        auto n = IndexType(to_sort->get_size()[1]);
+        auto nnz = IndexType(to_sort->get_num_stored_elements());
+        auto row_ptrs = to_sort->get_const_row_ptrs();
+        auto col_idxs = to_sort->get_col_idxs();
+        auto vals = to_sort->get_values();
+
+        // copy values, since the gather below cannot work in place
+        array<ValueType> tmp_vals_array(exec, nnz);
+        exec->copy(nnz, vals, tmp_vals_array.get_data());
+        auto tmp_vals = tmp_vals_array.get_const_data();
+
+        // init identity permutation
+        array<IndexType> permutation_array(exec, nnz);
+        auto permutation = permutation_array.get_data();
+        components::fill_seq_array(exec, permutation, nnz);
+
+        // allocate buffer
+        size_type buffer_size{};
+        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+                                       buffer_size);
+        array<char> buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+
+        // sort column indices
+        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                           permutation, buffer);
+
+        // sort values
+#ifdef GKO_COMPILING_HIP
+        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
+#else  // GKO_COMPILING_CUDA
+        auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals);
+        auto tmp_vec =
+            sparselib::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
+        sparselib::gather(handle, tmp_vec, val_vec);
+        // the vector descriptors are owned by this function and would leak
+        // on every call if they were not released here
+        sparselib::destroy(tmp_vec);
+        sparselib::destroy(val_vec);
+#endif
+
+        sparselib::destroy(descr);
+    } else {
+        fallback_sort(exec, to_sort);
+    }
+}
+
+
+}  // namespace csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 92b48518e7c..d4a94eda802 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
-add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
@@ -41,7 +41,7 @@ else()
 endif()
 jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
 # override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES FBCSR_INSTANTIATE)
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES CSR_INSTANTIATE FBCSR_INSTANTIATE)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
diff --git a/cuda/matrix/csr_kernels.instantiate.cu b/cuda/matrix/csr_kernels.instantiate.cu
deleted file mode 100644
index a24e66ed89d..00000000000
--- a/cuda/matrix/csr_kernels.instantiate.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "cuda/matrix/csr_kernels.template.cu"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Compressed sparse row matrix format namespace.
- *
- * @ingroup csr
- */
-namespace csr {
-
-
-// begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SPMV_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
-// end
-
-
-}  // namespace csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
deleted file mode 100644
index 89e5de9c303..00000000000
--- a/cuda/matrix/csr_kernels.template.cu
+++ /dev/null
@@ -1,1002 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/csr_kernels.hpp"
-
-#include <algorithm>
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_accessor_helper.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Compressed sparse row matrix format namespace.
- *
- * @ingroup csr
- */
-namespace csr {
-
-
-constexpr int default_block_size = 512;
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-constexpr int classical_oversubscription = 32;
-
-
-/**
- * A compile-time list of the number items per threads for which spmv kernel
- * should be compiled.
- */
-using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
-
-using classical_kernels =
-    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
-
-using spgeam_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/csr_kernels.hpp.inc"
-
-
-namespace host_kernel {
-namespace {
-
-
-template <int items_per_thread, typename MatrixValueType,
-          typename InputValueType, typename OutputValueType, typename IndexType>
-void merge_path_spmv(syn::value_list<int, items_per_thread>,
-                     std::shared_ptr<const DefaultExecutor> exec,
-                     const matrix::Csr<MatrixValueType, IndexType>* a,
-                     const matrix::Dense<InputValueType>* b,
-                     matrix::Dense<OutputValueType>* c,
-                     const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                     const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
-    const IndexType grid_num =
-        ceildiv(total, spmv_block_size * items_per_thread);
-    const auto grid = grid_num;
-    const auto block = spmv_block_size;
-    // TODO: workspace?
-    array<IndexType> row_out(exec, grid_num);
-    // TODO: should we store the value in arithmetic_type or output_type?
-    array<arithmetic_type> val_out(exec, grid_num);
-
-    const auto a_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-
-    for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) {
-        const auto column_span =
-            acc::index_span(static_cast<acc::size_type>(column_id),
-                            static_cast<acc::size_type>(column_id + 1));
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b,
-                                                                   column_span);
-        auto c_vals =
-            acc::helper::build_rrm_accessor<arithmetic_type>(c, column_span);
-        if (alpha == nullptr && beta == nullptr) {
-            if (grid_num > 0) {
-                kernel::abstract_merge_path_spmv<items_per_thread>
-                    <<<grid, block, 0, exec->get_stream()>>>(
-                        static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                        as_device_type(a->get_const_row_ptrs()),
-                        as_device_type(a->get_const_srow()),
-                        acc::as_device_range(b_vals),
-                        acc::as_device_range(c_vals),
-                        as_device_type(row_out.get_data()),
-                        as_device_type(val_out.get_data()));
-            }
-            kernel::
-                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
-                    grid_num, as_device_type(val_out.get_data()),
-                    as_device_type(row_out.get_data()),
-                    acc::as_device_range(c_vals));
-
-        } else if (alpha != nullptr && beta != nullptr) {
-            if (grid_num > 0) {
-                kernel::abstract_merge_path_spmv<items_per_thread>
-                    <<<grid, block, 0, exec->get_stream()>>>(
-                        static_cast<IndexType>(a->get_size()[0]),
-                        as_device_type(alpha->get_const_values()),
-                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                        as_device_type(a->get_const_row_ptrs()),
-                        as_device_type(a->get_const_srow()),
-                        acc::as_device_range(b_vals),
-                        as_device_type(beta->get_const_values()),
-                        acc::as_device_range(c_vals),
-                        as_device_type(row_out.get_data()),
-                        as_device_type(val_out.get_data()));
-            }
-            kernel::
-                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
-                    grid_num, as_device_type(val_out.get_data()),
-                    as_device_type(row_out.get_data()),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(c_vals));
-        } else {
-            GKO_KERNEL_NOT_FOUND;
-        }
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
-
-
-template <typename ValueType, typename IndexType>
-int compute_items_per_thread(std::shared_ptr<const DefaultExecutor> exec)
-{
-    const int version =
-        (exec->get_major_version() << 4) + exec->get_minor_version();
-    // The num_item is decided to make the occupancy 100%
-    // TODO: Extend this list when new GPU is released
-    //       Tune this parameter
-    // 128 threads/block the number of items per threads
-    // 3.0 3.5: 6
-    // 3.7: 14
-    // 5.0, 5.3, 6.0, 6.2: 8
-    // 5.2, 6.1, 7.0: 12
-    int num_item = 6;
-    switch (version) {
-    case 0x50:
-    case 0x53:
-    case 0x60:
-    case 0x62:
-        num_item = 8;
-        break;
-    case 0x52:
-    case 0x61:
-    case 0x70:
-        num_item = 12;
-        break;
-    case 0x37:
-        num_item = 14;
-    }
-    // Ensure that the following is satisfied:
-    // sizeof(IndexType) + sizeof(ValueType)
-    // <= items_per_thread * sizeof(IndexType)
-    constexpr int minimal_num =
-        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
-    int items_per_thread = num_item * 4 / sizeof(IndexType);
-    return std::max(minimal_num, items_per_thread);
-}
-
-
-template <int subwarp_size, typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<MatrixValueType, IndexType>* a,
-                    const matrix::Dense<InputValueType>* b,
-                    matrix::Dense<OutputValueType>* c,
-                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                    const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() *
-                        classical_oversubscription;
-    const auto gridx =
-        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
-                 int64(nwarps / warps_in_block));
-    const dim3 grid(gridx, b->get_size()[1]);
-    const auto block = spmv_block_size;
-
-    const auto a_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-    const auto b_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-    auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-    if (alpha == nullptr && beta == nullptr) {
-        if (grid.x > 0 && grid.y > 0) {
-            kernel::abstract_classical_spmv<subwarp_size>
-                <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_device_range(a_vals),
-                    a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-        }
-    } else if (alpha != nullptr && beta != nullptr) {
-        if (grid.x > 0 && grid.y > 0) {
-            kernel::abstract_classical_spmv<subwarp_size>
-                <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    acc::as_device_range(b_vals),
-                    as_device_type(beta->get_const_values()),
-                    acc::as_device_range(c_vals));
-        }
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                       const matrix::Csr<MatrixValueType, IndexType>* a,
-                       const matrix::Dense<InputValueType>* b,
-                       matrix::Dense<OutputValueType>* c,
-                       const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                       const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-
-    if (beta) {
-        dense::scale(exec, beta, c);
-    } else {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    const IndexType nwarps = a->get_num_srow_elements();
-    if (nwarps > 0) {
-        const dim3 csr_block(config::warp_size, warps_in_block, 1);
-        const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]);
-        const auto a_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-        auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-        if (alpha) {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
-        } else {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                                const ValueType* alpha,
-                                const matrix::Csr<ValueType, IndexType>* a,
-                                const matrix::Dense<ValueType>* b,
-                                const ValueType* beta,
-                                matrix::Dense<ValueType>* c)
-{
-    auto handle = exec->get_sparselib_handle();
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!sparselib::is_supported<ValueType, IndexType>::value ||
-        b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 ||
-        c->get_size()[0] == 0) {
-        return false;
-    }
-
-    auto descr = sparselib::create_mat_descr();
-    auto row_ptrs = a->get_const_row_ptrs();
-    auto col_idxs = a->get_const_col_idxs();
-    sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
-                    a->get_size()[1], a->get_num_stored_elements(), alpha,
-                    descr, a->get_const_values(), row_ptrs, col_idxs,
-                    b->get_const_values(), beta, c->get_values());
-
-    sparselib::destroy(descr);
-#else  // CUDA_VERSION >= 11000
-    // workaround for a division by zero in cuSPARSE 11.?
-    if (a->get_size()[1] == 0) {
-        return false;
-    }
-    cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE;
-    auto row_ptrs = const_cast<IndexType*>(a->get_const_row_ptrs());
-    auto col_idxs = const_cast<IndexType*>(a->get_const_col_idxs());
-    auto values = const_cast<ValueType*>(a->get_const_values());
-    auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1],
-                                     a->get_num_stored_elements(), row_ptrs,
-                                     col_idxs, values);
-    auto b_val = const_cast<ValueType*>(b->get_const_values());
-    auto c_val = c->get_values();
-    if (b->get_stride() == 1 && c->get_stride() == 1) {
-        auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val);
-        auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val);
-#if CUDA_VERSION >= 11021
-        constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1;
-#else
-        constexpr auto alg = CUSPARSE_CSRMV_ALG1;
-#endif
-        size_type buffer_size = 0;
-        sparselib::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
-                                              beta, vecc, alg, &buffer_size);
-
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
-                                   alg, buffer);
-        sparselib::destroy(vecb);
-        sparselib::destroy(vecc);
-    } else {
-#if CUDA_VERSION >= 11060
-        if (b->get_size()[1] == 1) {
-            // cusparseSpMM seems to take the single strided vector as column
-            // major without considering stride and row major (cuda 11.6)
-            return false;
-        }
-#endif  // CUDA_VERSION >= 11060
-        cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2;
-        auto vecb =
-            sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val);
-        auto vecc =
-            sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val);
-        size_type buffer_size = 0;
-        sparselib::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
-                                              vecb, beta, vecc, alg,
-                                              &buffer_size);
-
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
-                                   vecc, alg, buffer);
-        sparselib::destroy(vecb);
-        sparselib::destroy(vecc);
-    }
-    sparselib::destroy(mat);
-#endif
-    return true;
-}
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType,
-          typename = std::enable_if_t<
-              !std::is_same<MatrixValueType, InputValueType>::value ||
-              !std::is_same<MatrixValueType, OutputValueType>::value>>
-bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Csr<MatrixValueType, IndexType>* a,
-                        const matrix::Dense<InputValueType>* b,
-                        matrix::Dense<OutputValueType>* c,
-                        const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                        const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    // TODO: support sparselib mixed
-    return false;
-}
-
-template <typename ValueType, typename IndexType>
-bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Csr<ValueType, IndexType>* a,
-                        const matrix::Dense<ValueType>* b,
-                        matrix::Dense<ValueType>* c,
-                        const matrix::Dense<ValueType>* alpha = nullptr,
-                        const matrix::Dense<ValueType>* beta = nullptr)
-{
-    if (alpha) {
-        return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
-                                          beta->get_const_values(), c);
-    } else {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto valpha = one<ValueType>();
-        const auto vbeta = zero<ValueType>();
-        return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
-    }
-}
-
-
-}  // anonymous namespace
-}  // namespace host_kernel
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const DefaultExecutor> exec,
-          const matrix::Csr<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
-        using arithmetic_type =
-            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-        int items_per_thread =
-            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
-                exec);
-        host_kernel::select_merge_path_spmv(
-            compiled_kernels(),
-            [&items_per_thread](int compiled_info) {
-                return items_per_thread == compiled_info;
-            },
-            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-    } else {
-        bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
-            use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c);
-        }
-        if (use_classical) {
-            IndexType max_length_per_row = 0;
-            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
-            if (auto strategy =
-                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
-                        a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else if (auto strategy = std::dynamic_pointer_cast<
-                           const typename Tcsr::automatical>(
-                           a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else {
-                // as a fall-back: use average row length, at least 1
-                max_length_per_row = a->get_num_stored_elements() /
-                                     std::max<size_type>(a->get_size()[0], 1);
-            }
-            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
-            host_kernel::select_classical_spmv(
-                classical_kernels(),
-                [&max_length_per_row](int compiled_info) {
-                    return max_length_per_row >= compiled_info;
-                },
-                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-        }
-    }
-}
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::Csr<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
-        using arithmetic_type =
-            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-        int items_per_thread =
-            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
-                exec);
-        host_kernel::select_merge_path_spmv(
-            compiled_kernels(),
-            [&items_per_thread](int compiled_info) {
-                return items_per_thread == compiled_info;
-            },
-            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
-            beta);
-    } else {
-        bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
-            use_classical =
-                !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta);
-        }
-        if (use_classical) {
-            IndexType max_length_per_row = 0;
-            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
-            if (auto strategy =
-                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
-                        a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else if (auto strategy = std::dynamic_pointer_cast<
-                           const typename Tcsr::automatical>(
-                           a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else {
-                // as a fall-back: use average row length, at least 1
-                max_length_per_row = a->get_num_stored_elements() /
-                                     std::max<size_type>(a->get_size()[0], 1);
-            }
-            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
-            host_kernel::select_classical_spmv(
-                classical_kernels(),
-                [&max_length_per_row](int compiled_info) {
-                    return max_length_per_row >= compiled_info;
-                },
-                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c,
-                alpha, beta);
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void spgemm(std::shared_ptr<const DefaultExecutor> exec,
-            const matrix::Csr<ValueType, IndexType>* a,
-            const matrix::Csr<ValueType, IndexType>* b,
-            matrix::Csr<ValueType, IndexType>* c)
-{
-    auto a_vals = a->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto b_vals = b->get_const_values();
-    auto b_row_ptrs = b->get_const_row_ptrs();
-    auto b_col_idxs = b->get_const_col_idxs();
-    auto c_row_ptrs = c->get_row_ptrs();
-
-    auto handle = exec->get_sparselib_handle();
-    sparselib::pointer_mode_guard pm_guard(handle);
-
-    auto alpha = one<ValueType>();
-    auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
-    auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
-    auto null_value = static_cast<ValueType*>(nullptr);
-    auto null_index = static_cast<IndexType*>(nullptr);
-    auto zero_nnz = IndexType{};
-    auto m = IndexType(a->get_size()[0]);
-    auto n = IndexType(b->get_size()[1]);
-    auto k = IndexType(a->get_size()[1]);
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto& c_col_idxs_array = c_builder.get_col_idx_array();
-    auto& c_vals_array = c_builder.get_value_array();
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!sparselib::is_supported<ValueType, IndexType>::value) {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-    auto a_descr = sparselib::create_mat_descr();
-    auto b_descr = sparselib::create_mat_descr();
-    auto c_descr = sparselib::create_mat_descr();
-    auto d_descr = sparselib::create_mat_descr();
-    auto info = sparselib::create_spgemm_info();
-    // allocate buffer
-    size_type buffer_size{};
-    sparselib::spgemm_buffer_size(
-        handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
-        b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
-        null_index, null_index, info, buffer_size);
-    array<char> buffer_array(exec, buffer_size);
-    auto buffer = buffer_array.get_data();
-
-    // count nnz
-    IndexType c_nnz{};
-    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                          d_descr, zero_nnz, null_index, null_index, c_descr,
-                          c_row_ptrs, &c_nnz, info, buffer);
-
-    // accumulate non-zeros
-    c_col_idxs_array.resize_and_reset(c_nnz);
-    c_vals_array.resize_and_reset(c_nnz);
-    auto c_col_idxs = c_col_idxs_array.get_data();
-    auto c_vals = c_vals_array.get_data();
-    sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
-                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
-                      b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
-                      null_value, null_index, null_index, c_descr, c_vals,
-                      c_row_ptrs, c_col_idxs, info, buffer);
-
-    sparselib::destroy(info);
-    sparselib::destroy(d_descr);
-    sparselib::destroy(c_descr);
-    sparselib::destroy(b_descr);
-    sparselib::destroy(a_descr);
-
-#else   // CUDA_VERSION >= 11000
-    const auto beta = zero<ValueType>();
-    auto spgemm_descr = sparselib::create_spgemm_descr();
-    auto a_descr = sparselib::create_csr(
-        m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
-        const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = sparselib::create_csr(
-        k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
-        const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
-                                         null_value);
-
-    // estimate work
-    size_type buffer1_size{};
-    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                      c_descr, spgemm_descr, buffer1_size,
-                                      nullptr);
-    array<char> buffer1{exec, buffer1_size};
-    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                      c_descr, spgemm_descr, buffer1_size,
-                                      buffer1.get_data());
-
-    // compute spgemm
-    size_type buffer2_size{};
-    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                              spgemm_descr, buffer1.get_data(), buffer2_size,
-                              nullptr);
-    array<char> buffer2{exec, buffer2_size};
-    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                              spgemm_descr, buffer1.get_data(), buffer2_size,
-                              buffer2.get_data());
-
-    // copy data to result
-    auto c_nnz = sparselib::sparse_matrix_nnz(c_descr);
-    c_col_idxs_array.resize_and_reset(c_nnz);
-    c_vals_array.resize_and_reset(c_nnz);
-    sparselib::csr_set_pointers(c_descr, c_row_ptrs,
-                                c_col_idxs_array.get_data(),
-                                c_vals_array.get_data());
-
-    sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                           spgemm_descr);
-
-    sparselib::destroy(c_descr);
-    sparselib::destroy(b_descr);
-    sparselib::destroy(a_descr);
-    sparselib::destroy(spgemm_descr);
-#endif  // CUDA_VERSION >= 11000
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
-                     const matrix::Dense<ValueType>* alpha,
-                     const matrix::Csr<ValueType, IndexType>* a,
-                     const matrix::Csr<ValueType, IndexType>* b,
-                     const matrix::Dense<ValueType>* beta,
-                     const matrix::Csr<ValueType, IndexType>* d,
-                     matrix::Csr<ValueType, IndexType>* c)
-{
-    auto handle = exec->get_sparselib_handle();
-    sparselib::pointer_mode_guard pm_guard(handle);
-
-    auto valpha = exec->copy_val_to_host(alpha->get_const_values());
-    auto a_nnz = IndexType(a->get_num_stored_elements());
-    auto a_vals = a->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto b_nnz = IndexType(b->get_num_stored_elements());
-    auto b_vals = b->get_const_values();
-    auto b_row_ptrs = b->get_const_row_ptrs();
-    auto b_col_idxs = b->get_const_col_idxs();
-    auto vbeta = exec->copy_val_to_host(beta->get_const_values());
-    auto d_nnz = IndexType(d->get_num_stored_elements());
-    auto d_vals = d->get_const_values();
-    auto d_row_ptrs = d->get_const_row_ptrs();
-    auto d_col_idxs = d->get_const_col_idxs();
-    auto m = IndexType(a->get_size()[0]);
-    auto n = IndexType(b->get_size()[1]);
-    auto k = IndexType(a->get_size()[1]);
-    auto c_row_ptrs = c->get_row_ptrs();
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!sparselib::is_supported<ValueType, IndexType>::value) {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto& c_col_idxs_array = c_builder.get_col_idx_array();
-    auto& c_vals_array = c_builder.get_value_array();
-    auto a_descr = sparselib::create_mat_descr();
-    auto b_descr = sparselib::create_mat_descr();
-    auto c_descr = sparselib::create_mat_descr();
-    auto d_descr = sparselib::create_mat_descr();
-    auto info = sparselib::create_spgemm_info();
-    // allocate buffer
-    size_type buffer_size{};
-    sparselib::spgemm_buffer_size(
-        handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
-        b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
-        d_row_ptrs, d_col_idxs, info, buffer_size);
-    array<char> buffer_array(exec, buffer_size);
-    auto buffer = buffer_array.get_data();
-
-    // count nnz
-    IndexType c_nnz{};
-    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                          d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
-                          c_row_ptrs, &c_nnz, info, buffer);
-
-    // accumulate non-zeros
-    c_col_idxs_array.resize_and_reset(c_nnz);
-    c_vals_array.resize_and_reset(c_nnz);
-    auto c_col_idxs = c_col_idxs_array.get_data();
-    auto c_vals = c_vals_array.get_data();
-    sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
-                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
-                      b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals,
-                      d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs,
-                      c_col_idxs, info, buffer);
-
-    sparselib::destroy(info);
-    sparselib::destroy(d_descr);
-    sparselib::destroy(c_descr);
-    sparselib::destroy(b_descr);
-    sparselib::destroy(a_descr);
-#else   // CUDA_VERSION >= 11000
-    auto null_value = static_cast<ValueType*>(nullptr);
-    auto null_index = static_cast<IndexType*>(nullptr);
-    auto one_val = one<ValueType>();
-    auto zero_val = zero<ValueType>();
-    auto zero_nnz = IndexType{};
-    auto spgemm_descr = sparselib::create_spgemm_descr();
-    auto a_descr = sparselib::create_csr(
-        m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
-        const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = sparselib::create_csr(
-        k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
-        const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
-                                         null_value);
-
-    // estimate work
-    size_type buffer1_size{};
-    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                      &zero_val, c_descr, spgemm_descr,
-                                      buffer1_size, nullptr);
-    array<char> buffer1{exec, buffer1_size};
-    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                      &zero_val, c_descr, spgemm_descr,
-                                      buffer1_size, buffer1.get_data());
-
-    // compute spgemm
-    size_type buffer2_size{};
-    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                              c_descr, spgemm_descr, buffer1.get_data(),
-                              buffer2_size, nullptr);
-    array<char> buffer2{exec, buffer2_size};
-    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                              c_descr, spgemm_descr, buffer1.get_data(),
-                              buffer2_size, buffer2.get_data());
-
-    // write result to temporary storage
-    auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr);
-    array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
-    array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
-    array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
-    sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
-                                c_tmp_col_idxs_array.get_data(),
-                                c_tmp_vals_array.get_data());
-
-    sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
-                           c_descr, spgemm_descr);
-
-    sparselib::destroy(c_descr);
-    sparselib::destroy(b_descr);
-    sparselib::destroy(a_descr);
-    sparselib::destroy(spgemm_descr);
-
-    auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
-    auto nnz_per_row = spgeam_total_nnz / m;
-    select_spgeam(
-        spgeam_kernels(),
-        [&](int compiled_subwarp_size) {
-            return compiled_subwarp_size >= nnz_per_row ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec,
-        alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(),
-        c_tmp_col_idxs_array.get_const_data(),
-        c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs,
-        d_col_idxs, d_vals, c);
-#endif  // CUDA_VERSION >= 11000
-}
-
-
-template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Csr<ValueType, IndexType>* orig,
-               matrix::Csr<ValueType, IndexType>* trans)
-{
-    if (orig->get_size()[0] == 0) {
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
-#else  // CUDA_VERSION >= 11000
-        cudaDataType_t cu_value =
-            gko::kernels::cuda::cuda_data_type<ValueType>();
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
-        size_type buffer_size = 0;
-        sparselib::transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
-            idxBase, alg, &buffer_size);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
-            idxBase, alg, buffer);
-#endif
-    } else {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* orig,
-                    matrix::Csr<ValueType, IndexType>* trans)
-{
-    if (orig->get_size()[0] == 0) {
-        return;
-    }
-    const auto block_size = default_block_size;
-    const auto grid_size =
-        ceildiv(trans->get_num_stored_elements(), block_size);
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
-#else  // CUDA_VERSION >= 11000
-        cudaDataType_t cu_value =
-            gko::kernels::cuda::cuda_data_type<ValueType>();
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
-        size_type buffer_size = 0;
-        sparselib::transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
-            idxBase, alg, &buffer_size);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
-            idxBase, alg, buffer);
-#endif
-    } else {
-        fallback_transpose(exec, orig, trans);
-    }
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::conjugate<<<grid_size, block_size, 0, exec->get_stream()>>>(
-            trans->get_num_stored_elements(),
-            as_device_type(trans->get_values()));
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::Csr<ValueType, IndexType>* to_sort)
-{
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        auto m = IndexType(to_sort->get_size()[0]);
-        auto n = IndexType(to_sort->get_size()[1]);
-        auto nnz = IndexType(to_sort->get_num_stored_elements());
-        auto row_ptrs = to_sort->get_const_row_ptrs();
-        auto col_idxs = to_sort->get_col_idxs();
-        auto vals = to_sort->get_values();
-
-        // copy values
-        array<ValueType> tmp_vals_array(exec, nnz);
-        exec->copy(nnz, vals, tmp_vals_array.get_data());
-        auto tmp_vals = tmp_vals_array.get_const_data();
-
-        // init identity permutation
-        array<IndexType> permutation_array(exec, nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, nnz);
-
-        // allocate buffer
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
-                                       buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-
-        // sort column indices
-        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
-                           permutation, buffer);
-
-        // sort values
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
-#else  // CUDA_VERSION >= 11000
-        auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals);
-        auto tmp_vec =
-            sparselib::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
-        sparselib::gather(handle, tmp_vec, val_vec);
-#endif
-
-        sparselib::destroy(descr);
-    } else {
-        fallback_sort(exec, to_sort);
-    }
-}
-
-
-}  // namespace csr
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 67617169b5a..30e675509d5 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
-add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
deleted file mode 100644
index 1fb086c5ea6..00000000000
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/csr_kernels.hpp"
-
-#include <algorithm>
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-
-#include "accessor/cuda_hip_helper.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/sparselib_bindings.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_accessor_helper.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Compressed sparse row matrix format namespace.
- *
- * @ingroup csr
- */
-namespace csr {
-
-
-constexpr int default_block_size = 512;
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-constexpr int classical_oversubscription = 32;
-
-
-/**
- * A compile-time list of the number items per threads for which spmv kernel
- * should be compiled.
- */
-using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
-
-using classical_kernels =
-    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
-
-using spgeam_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/csr_kernels.hpp.inc"
-
-
-namespace host_kernel {
-namespace {
-
-
-template <int items_per_thread, typename MatrixValueType,
-          typename InputValueType, typename OutputValueType, typename IndexType>
-void merge_path_spmv(syn::value_list<int, items_per_thread>,
-                     std::shared_ptr<const DefaultExecutor> exec,
-                     const matrix::Csr<MatrixValueType, IndexType>* a,
-                     const matrix::Dense<InputValueType>* b,
-                     matrix::Dense<OutputValueType>* c,
-                     const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                     const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
-    const IndexType grid_num =
-        ceildiv(total, spmv_block_size * items_per_thread);
-    const auto grid = grid_num;
-    const auto block = spmv_block_size;
-    // TODO: workspace?
-    array<IndexType> row_out(exec, grid_num);
-    // TODO: should we store the value in arithmetic_type or output_type?
-    array<arithmetic_type> val_out(exec, grid_num);
-
-    const auto a_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-
-    for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) {
-        const auto column_span =
-            acc::index_span(static_cast<acc::size_type>(column_id),
-                            static_cast<acc::size_type>(column_id + 1));
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b,
-                                                                   column_span);
-        auto c_vals =
-            acc::helper::build_rrm_accessor<arithmetic_type>(c, column_span);
-        if (alpha == nullptr && beta == nullptr) {
-            if (grid_num > 0) {
-                kernel::abstract_merge_path_spmv<items_per_thread>
-                    <<<grid, block, 0, exec->get_stream()>>>(
-                        static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                        as_device_type(a->get_const_row_ptrs()),
-                        as_device_type(a->get_const_srow()),
-                        acc::as_device_range(b_vals),
-                        acc::as_device_range(c_vals),
-                        as_device_type(row_out.get_data()),
-                        as_device_type(val_out.get_data()));
-            }
-            kernel::
-                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
-                    grid_num, as_device_type(val_out.get_data()),
-                    as_device_type(row_out.get_data()),
-                    acc::as_device_range(c_vals));
-
-        } else if (alpha != nullptr && beta != nullptr) {
-            if (grid_num > 0) {
-                kernel::abstract_merge_path_spmv<items_per_thread>
-                    <<<grid, block, 0, exec->get_stream()>>>(
-                        static_cast<IndexType>(a->get_size()[0]),
-                        as_device_type(alpha->get_const_values()),
-                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                        as_device_type(a->get_const_row_ptrs()),
-                        as_device_type(a->get_const_srow()),
-                        acc::as_device_range(b_vals),
-                        as_device_type(beta->get_const_values()),
-                        acc::as_device_range(c_vals),
-                        as_device_type(row_out.get_data()),
-                        as_device_type(val_out.get_data()));
-            }
-            kernel::
-                abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
-                    grid_num, as_device_type(val_out.get_data()),
-                    as_device_type(row_out.get_data()),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(c_vals));
-        } else {
-            GKO_KERNEL_NOT_FOUND;
-        }
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
-
-
-template <typename ValueType, typename IndexType>
-int compute_items_per_thread(std::shared_ptr<const DefaultExecutor> exec)
-{
-#if GINKGO_HIP_PLATFORM_NVCC
-
-
-    const int version =
-        (exec->get_major_version() << 4) + exec->get_minor_version();
-    // The num_item is decided to make the occupancy 100%
-    // TODO: Extend this list when new GPU is released
-    //       Tune this parameter
-    // 128 threads/block the number of items per threads
-    // 3.0 3.5: 6
-    // 3.7: 14
-    // 5.0, 5.3, 6.0, 6.2: 8
-    // 5.2, 6.1, 7.0: 12
-    int num_item = 6;
-    switch (version) {
-    case 0x50:
-    case 0x53:
-    case 0x60:
-    case 0x62:
-        num_item = 8;
-        break;
-    case 0x52:
-    case 0x61:
-    case 0x70:
-        num_item = 12;
-        break;
-    case 0x37:
-        num_item = 14;
-    }
-
-
-#else
-
-
-    // HIP uses the minimal num_item to make the code work correctly.
-    // TODO: this parameter should be tuned.
-    int num_item = 6;
-
-
-#endif  // GINKGO_HIP_PLATFORM_NVCC
-
-
-    // Ensure that the following is satisfied:
-    // sizeof(IndexType) + sizeof(ValueType)
-    // <= items_per_thread * sizeof(IndexType)
-    constexpr int minimal_num =
-        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
-    int items_per_thread = num_item * 4 / sizeof(IndexType);
-    return std::max(minimal_num, items_per_thread);
-}
-
-
-template <int subwarp_size, typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<MatrixValueType, IndexType>* a,
-                    const matrix::Dense<InputValueType>* b,
-                    matrix::Dense<OutputValueType>* c,
-                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                    const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() *
-                        classical_oversubscription;
-    const auto gridx =
-        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
-                 int64(nwarps / warps_in_block));
-    const dim3 grid(gridx, b->get_size()[1]);
-    const auto block = spmv_block_size;
-
-    const auto a_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-    const auto b_vals =
-        acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-    auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-    if (alpha == nullptr && beta == nullptr) {
-        if (grid.x > 0 && grid.y > 0) {
-            kernel::abstract_classical_spmv<subwarp_size>
-                <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_device_range(a_vals),
-                    a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-        }
-    } else if (alpha != nullptr && beta != nullptr) {
-        if (grid.x > 0 && grid.y > 0) {
-            kernel::abstract_classical_spmv<subwarp_size>
-                <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    acc::as_device_range(b_vals),
-                    as_device_type(beta->get_const_values()),
-                    acc::as_device_range(c_vals));
-        }
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                       const matrix::Csr<MatrixValueType, IndexType>* a,
-                       const matrix::Dense<InputValueType>* b,
-                       matrix::Dense<OutputValueType>* c,
-                       const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                       const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-
-    if (beta) {
-        dense::scale(exec, beta, c);
-    } else {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    const IndexType nwarps = a->get_num_srow_elements();
-    if (nwarps > 0) {
-        const dim3 csr_block(config::warp_size, warps_in_block, 1);
-        const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]);
-        const auto a_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-        auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-        if (alpha) {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
-        } else {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                                const ValueType* alpha,
-                                const matrix::Csr<ValueType, IndexType>* a,
-                                const matrix::Dense<ValueType>* b,
-                                const ValueType* beta,
-                                matrix::Dense<ValueType>* c)
-{
-    bool try_sparselib = sparselib::is_supported<ValueType, IndexType>::value;
-    try_sparselib =
-        try_sparselib && b->get_stride() == 1 && c->get_stride() == 1;
-    // rocSPARSE has issues with zero matrices
-    try_sparselib = try_sparselib && a->get_num_stored_elements() > 0;
-    if (try_sparselib) {
-        auto descr = sparselib::create_mat_descr();
-
-        auto row_ptrs = a->get_const_row_ptrs();
-        auto col_idxs = a->get_const_col_idxs();
-
-        sparselib::spmv(exec->get_sparselib_handle(),
-                        SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
-                        a->get_size()[1], a->get_num_stored_elements(), alpha,
-                        descr, a->get_const_values(), row_ptrs, col_idxs,
-                        b->get_const_values(), beta, c->get_values());
-
-        sparselib::destroy(descr);
-    }
-    return try_sparselib;
-}
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType,
-          typename = std::enable_if_t<
-              !std::is_same<MatrixValueType, InputValueType>::value ||
-              !std::is_same<MatrixValueType, OutputValueType>::value>>
-bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Csr<MatrixValueType, IndexType>* a,
-                        const matrix::Dense<InputValueType>* b,
-                        matrix::Dense<OutputValueType>* c,
-                        const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                        const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    // TODO: support sparselib mixed
-    return false;
-}
-
-template <typename ValueType, typename IndexType>
-bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Csr<ValueType, IndexType>* a,
-                        const matrix::Dense<ValueType>* b,
-                        matrix::Dense<ValueType>* c,
-                        const matrix::Dense<ValueType>* alpha = nullptr,
-                        const matrix::Dense<ValueType>* beta = nullptr)
-{
-    if (alpha) {
-        return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
-                                          beta->get_const_values(), c);
-    } else {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto valpha = one<ValueType>();
-        const auto vbeta = zero<ValueType>();
-        return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
-    }
-}
-
-
-}  // anonymous namespace
-}  // namespace host_kernel
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const DefaultExecutor> exec,
-          const matrix::Csr<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
-        using arithmetic_type =
-            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-        int items_per_thread =
-            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
-                exec);
-        host_kernel::select_merge_path_spmv(
-            compiled_kernels(),
-            [&items_per_thread](int compiled_info) {
-                return items_per_thread == compiled_info;
-            },
-            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-    } else {
-        bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
-            use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c);
-        }
-        if (use_classical) {
-            IndexType max_length_per_row = 0;
-            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
-            if (auto strategy =
-                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
-                        a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else if (auto strategy = std::dynamic_pointer_cast<
-                           const typename Tcsr::automatical>(
-                           a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else {
-                // as a fall-back: use average row length, at least 1
-                max_length_per_row = a->get_num_stored_elements() /
-                                     std::max<size_type>(a->get_size()[0], 1);
-            }
-            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
-            host_kernel::select_classical_spmv(
-                classical_kernels(),
-                [&max_length_per_row](int compiled_info) {
-                    return max_length_per_row >= compiled_info;
-                },
-                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-        }
-    }
-}
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::Csr<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
-        using arithmetic_type =
-            highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-        int items_per_thread =
-            host_kernel::compute_items_per_thread<arithmetic_type, IndexType>(
-                exec);
-        host_kernel::select_merge_path_spmv(
-            compiled_kernels(),
-            [&items_per_thread](int compiled_info) {
-                return items_per_thread == compiled_info;
-            },
-            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
-            beta);
-    } else {
-        bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
-            use_classical =
-                !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta);
-        }
-        if (use_classical) {
-            IndexType max_length_per_row = 0;
-            using Tcsr = matrix::Csr<MatrixValueType, IndexType>;
-            if (auto strategy =
-                    std::dynamic_pointer_cast<const typename Tcsr::classical>(
-                        a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else if (auto strategy = std::dynamic_pointer_cast<
-                           const typename Tcsr::automatical>(
-                           a->get_strategy())) {
-                max_length_per_row = strategy->get_max_length_per_row();
-            } else {
-                // as a fall-back: use average row length, at least 1
-                max_length_per_row = a->get_num_stored_elements() /
-                                     std::max<size_type>(a->get_size()[0], 1);
-            }
-            max_length_per_row = std::max<size_type>(max_length_per_row, 1);
-            host_kernel::select_classical_spmv(
-                classical_kernels(),
-                [&max_length_per_row](int compiled_info) {
-                    return max_length_per_row >= compiled_info;
-                },
-                syn::value_list<int>(), syn::type_list<>(), exec, a, b, c,
-                alpha, beta);
-        }
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void spgemm(std::shared_ptr<const DefaultExecutor> exec,
-            const matrix::Csr<ValueType, IndexType>* a,
-            const matrix::Csr<ValueType, IndexType>* b,
-            matrix::Csr<ValueType, IndexType>* c)
-{
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        auto a_descr = sparselib::create_mat_descr();
-        auto b_descr = sparselib::create_mat_descr();
-        auto c_descr = sparselib::create_mat_descr();
-        auto d_descr = sparselib::create_mat_descr();
-        auto info = sparselib::create_spgemm_info();
-
-        auto alpha = one<ValueType>();
-        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
-        auto a_vals = a->get_const_values();
-        auto a_row_ptrs = a->get_const_row_ptrs();
-        auto a_col_idxs = a->get_const_col_idxs();
-        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
-        auto b_vals = b->get_const_values();
-        auto b_row_ptrs = b->get_const_row_ptrs();
-        auto b_col_idxs = b->get_const_col_idxs();
-        auto null_value = static_cast<ValueType*>(nullptr);
-        auto null_index = static_cast<IndexType*>(nullptr);
-        auto zero_nnz = IndexType{};
-        auto m = static_cast<IndexType>(a->get_size()[0]);
-        auto n = static_cast<IndexType>(b->get_size()[1]);
-        auto k = static_cast<IndexType>(a->get_size()[1]);
-        auto c_row_ptrs = c->get_row_ptrs();
-        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-        auto& c_col_idxs_array = c_builder.get_col_idx_array();
-        auto& c_vals_array = c_builder.get_value_array();
-
-        // allocate buffer
-        size_type buffer_size{};
-        sparselib::spgemm_buffer_size(
-            handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
-            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
-            zero_nnz, null_index, null_index, info, buffer_size);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-
-        // count nnz
-        IndexType c_nnz{};
-        sparselib::spgemm_nnz(
-            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
-            b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
-            null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
-
-        // accumulate non-zeros
-        c_col_idxs_array.resize_and_reset(c_nnz);
-        c_vals_array.resize_and_reset(c_nnz);
-        auto c_col_idxs = c_col_idxs_array.get_data();
-        auto c_vals = c_vals_array.get_data();
-        sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
-                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
-                          b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
-                          null_value, null_index, null_index, c_descr, c_vals,
-                          c_row_ptrs, c_col_idxs, info, buffer);
-
-        sparselib::destroy_spgemm_info(info);
-        sparselib::destroy(d_descr);
-        sparselib::destroy(c_descr);
-        sparselib::destroy(b_descr);
-        sparselib::destroy(a_descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
-                     const matrix::Dense<ValueType>* alpha,
-                     const matrix::Csr<ValueType, IndexType>* a,
-                     const matrix::Csr<ValueType, IndexType>* b,
-                     const matrix::Dense<ValueType>* beta,
-                     const matrix::Csr<ValueType, IndexType>* d,
-                     matrix::Csr<ValueType, IndexType>* c)
-{
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        auto a_descr = sparselib::create_mat_descr();
-        auto b_descr = sparselib::create_mat_descr();
-        auto c_descr = sparselib::create_mat_descr();
-        auto d_descr = sparselib::create_mat_descr();
-        auto info = sparselib::create_spgemm_info();
-
-        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
-        auto a_vals = a->get_const_values();
-        auto a_row_ptrs = a->get_const_row_ptrs();
-        auto a_col_idxs = a->get_const_col_idxs();
-        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
-        auto b_vals = b->get_const_values();
-        auto b_row_ptrs = b->get_const_row_ptrs();
-        auto b_col_idxs = b->get_const_col_idxs();
-        auto d_vals = d->get_const_values();
-        auto d_row_ptrs = d->get_const_row_ptrs();
-        auto d_col_idxs = d->get_const_col_idxs();
-        auto null_value = static_cast<ValueType*>(nullptr);
-        auto null_index = static_cast<IndexType*>(nullptr);
-        auto one_value = one<ValueType>();
-        auto m = static_cast<IndexType>(a->get_size()[0]);
-        auto n = static_cast<IndexType>(b->get_size()[1]);
-        auto k = static_cast<IndexType>(a->get_size()[1]);
-
-        // allocate buffer
-        size_type buffer_size{};
-        sparselib::spgemm_buffer_size(
-            handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
-            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
-            IndexType{}, null_index, null_index, info, buffer_size);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-
-        // count nnz
-        array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
-        auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
-        IndexType c_nnz{};
-        sparselib::spgemm_nnz(
-            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
-            b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
-            null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
-
-        // accumulate non-zeros for A * B
-        array<IndexType> c_tmp_col_idxs_array(exec, c_nnz);
-        array<ValueType> c_tmp_vals_array(exec, c_nnz);
-        auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
-        auto c_tmp_vals = c_tmp_vals_array.get_data();
-        sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
-                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
-                          b_row_ptrs, b_col_idxs, null_value, d_descr,
-                          IndexType{}, null_value, null_index, null_index,
-                          c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs,
-                          info, buffer);
-
-        // destroy hipsparse context
-        sparselib::destroy_spgemm_info(info);
-        sparselib::destroy(d_descr);
-        sparselib::destroy(c_descr);
-        sparselib::destroy(b_descr);
-        sparselib::destroy(a_descr);
-
-        auto total_nnz = c_nnz + d->get_num_stored_elements();
-        auto nnz_per_row = total_nnz / m;
-        select_spgeam(
-            spgeam_kernels(),
-            [&](int compiled_subwarp_size) {
-                return compiled_subwarp_size >= nnz_per_row ||
-                       compiled_subwarp_size == config::warp_size;
-            },
-            syn::value_list<int>(), syn::type_list<>(), exec,
-            alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs,
-            c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs,
-            d_vals, c);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Csr<ValueType, IndexType>* orig,
-               matrix::Csr<ValueType, IndexType>* trans)
-{
-    if (orig->get_size()[0] == 0) {
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
-        hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
-
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
-    } else {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* orig,
-                    matrix::Csr<ValueType, IndexType>* trans)
-{
-    if (orig->get_size()[0] == 0) {
-        return;
-    }
-    const auto block_size = default_block_size;
-    const auto grid_size =
-        ceildiv(trans->get_num_stored_elements(), block_size);
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
-        hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
-
-        sparselib::transpose(
-            exec->get_sparselib_handle(), orig->get_size()[0],
-            orig->get_size()[1], orig->get_num_stored_elements(),
-            orig->get_const_values(), orig->get_const_row_ptrs(),
-            orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
-    } else {
-        fallback_transpose(exec, orig, trans);
-    }
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::conjugate<<<grid_size, block_size, 0, exec->get_stream()>>>(
-            trans->get_num_stored_elements(),
-            as_device_type(trans->get_values()));
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::Csr<ValueType, IndexType>* to_sort)
-{
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        auto m = IndexType(to_sort->get_size()[0]);
-        auto n = IndexType(to_sort->get_size()[1]);
-        auto nnz = IndexType(to_sort->get_num_stored_elements());
-        auto row_ptrs = to_sort->get_const_row_ptrs();
-        auto col_idxs = to_sort->get_col_idxs();
-        auto vals = to_sort->get_values();
-
-        // copy values
-        array<ValueType> tmp_vals_array(exec, nnz);
-        exec->copy(nnz, vals, tmp_vals_array.get_data());
-        auto tmp_vals = tmp_vals_array.get_const_data();
-
-        // init identity permutation
-        array<IndexType> permutation_array(exec, nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, nnz);
-
-        // allocate buffer
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
-                                       buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-
-        // sort column indices
-        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
-                           permutation, buffer);
-
-        // sort values
-        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
-
-        sparselib::destroy(descr);
-    } else {
-        fallback_sort(exec, to_sort);
-    }
-}
-
-
-}  // namespace csr
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko

From c063bfa1c4a095508541e775cfa0e3f5e045e96f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 15 Jul 2024 22:30:46 +0200
Subject: [PATCH 093/448] remove solver_progress reliance on uninitialized
 values

---
 core/test/log/solver_progress.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp
index f2433779864..fe8a4537f66 100644
--- a/core/test/log/solver_progress.cpp
+++ b/core/test/log/solver_progress.cpp
@@ -86,7 +86,7 @@ TYPED_TEST(SolverProgress, TableWorks)
            << std::setw(default_column_width) << "implicit_sq_residual_norm"
            << '\n';
     ref_ss << std::setw(default_column_width) << 0
-           << std::setw(default_column_width) << T{0.0}
+           << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{1.0}
            << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{4.0} << '\n'
@@ -95,18 +95,16 @@ TYPED_TEST(SolverProgress, TableWorks)
            << std::setw(default_column_width) << T{0.0}
            << std::setw(default_column_width) << T{4.0}
            << std::setw(default_column_width) << T{0.0} << '\n';
+    // run the solve once so the internal vectors are initialized before
+    // attaching the logger
+    this->solver->apply(this->in, this->out->clone());
     std::stringstream ss;
     this->solver->add_logger(
         gko::log::SolverProgress::create_scalar_table_writer(ss));
 
     this->solver->apply(this->in, this->out);
 
-    // the first value of beta is uninitialized, so we need to remove it
-    std::regex first_beta("\n           0 *[()0-9.e,+-]*");
-    auto clean_str = std::regex_replace(ss.str(), first_beta, "\n           0");
-    auto clean_ref =
-        std::regex_replace(ref_ss.str(), first_beta, "\n           0");
-    ASSERT_EQ(clean_str, clean_ref);
+    ASSERT_EQ(ss.str(), ref_ss.str());
 }
 
 
@@ -119,21 +117,20 @@ TYPED_TEST(SolverProgress, CsvWorks)
            << this->out.get() << ") of dimensions " << this->solver->get_size()
            << " and " << this->in->get_size()[1] << " rhs\n";
     ref_ss << "Iteration;beta;prev_rho;rho;implicit_sq_residual_norm" << '\n';
-    ref_ss << 0 << ';' << T{0.0} << ';' << T{1.0} << ';' << T{4.0} << ';'
+    ref_ss << 0 << ';' << T{4.0} << ';' << T{1.0} << ';' << T{4.0} << ';'
            << T{4.0} << '\n'
            << 1 << ';' << T{4.0} << ';' << T{0.0} << ';' << T{4.0} << ';'
            << T{0.0} << '\n';
+    // run the solve once so the internal vectors are initialized before
+    // attaching the logger
+    this->solver->apply(this->in, this->out->clone());
     std::stringstream ss;
     this->solver->add_logger(
         gko::log::SolverProgress::create_scalar_csv_writer(ss, 6, ';'));
 
     this->solver->apply(this->in, this->out);
 
-    // the first value of beta is uninitialized, so we need to remove it
-    std::regex first_beta("\n0;[^;]*");
-    auto clean_str = std::regex_replace(ss.str(), first_beta, "\n0;");
-    auto clean_ref = std::regex_replace(ref_ss.str(), first_beta, "\n0;");
-    ASSERT_EQ(clean_str, clean_ref);
+    ASSERT_EQ(ss.str(), ref_ss.str());
 }
 
 
@@ -171,6 +168,9 @@ TYPED_TEST(SolverProgress, StorageWorks)
         {"solver_progress_test_initial_guess", orig_out.get()},
         {"solver_progress_test_rhs", this->in.get()},
         {"solver_progress_test_system_matrix", this->mtx.get()}};
+    // run the solve once so the internal vectors are initialized before
+    // attaching the logger
+    this->solver->apply(this->in, this->out->clone());
     this->solver->add_logger(gko::log::SolverProgress::create_vector_storage(
         "solver_progress_test", false));
     this->solver->add_logger(gko::log::SolverProgress::create_vector_storage(

From c2511808ac7b8ef4fba149452b939bb6bfa9f114 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 15 Jul 2024 22:31:35 +0200
Subject: [PATCH 094/448] formatting

---
 common/cuda_hip/matrix/csr_kernels.template.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index eda0e856b07..f17cf1548fe 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -84,6 +84,8 @@ using spgeam_kernels =
 
 
 #include "common/cuda_hip/matrix/csr_common.hpp.inc"
+
+
 namespace kernel {
 
 

From e810e036fd578ef53be3fa4ebe7cb326867aa8ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Wed, 10 Jul 2024 17:52:20 +0200
Subject: [PATCH 095/448] Add additional tests for the communicator group

---
 cuda/test/components/cooperative_groups.cu    | 37 ++++++++++++++++
 .../test/components/cooperative_groups.dp.cpp | 42 +++++++++++++++++++
 .../components/cooperative_groups.hip.cpp     | 37 ++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index df3cef86bb8..077b0121fbd 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -223,4 +223,41 @@ TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); }
 TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); }
 
 
+__global__ void cg_communicator_categorization(bool*)
+{
+    auto this_block = group::this_thread_block();
+    auto tiled_partition =
+        group::tiled_partition<config::warp_size>(this_block);
+    auto subwarp_partition = group::tiled_partition<subwarp_size>(this_block);
+
+    using not_group = int;
+    using this_block_t = decltype(this_block);
+    using tiled_partition_t = decltype(tiled_partition);
+    using subwarp_partition_t = decltype(subwarp_partition);
+
+    static_assert(!group::is_group<not_group>::value &&
+                      group::is_group<this_block_t>::value &&
+                      group::is_group<tiled_partition_t>::value &&
+                      group::is_group<subwarp_partition_t>::value,
+                  "Group check doesn't work.");
+    static_assert(
+        !group::is_synchronizable_group<not_group>::value &&
+            group::is_synchronizable_group<this_block_t>::value &&
+            group::is_synchronizable_group<tiled_partition_t>::value &&
+            group::is_synchronizable_group<subwarp_partition_t>::value,
+        "Synchronizable group check doesn't work.");
+    static_assert(
+        !group::is_communicator_group<not_group>::value &&
+            !group::is_communicator_group<this_block_t>::value &&
+            group::is_communicator_group<tiled_partition_t>::value &&
+            group::is_communicator_group<subwarp_partition_t>::value,
+        "Communicator group check doesn't work.");
+}
+
+TEST_F(CooperativeGroups, CorrectCategorization)
+{
+    test(cg_communicator_categorization);
+}
+
+
 }  // namespace
diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp
index 27e14b62d2d..8667a85713e 100644
--- a/dpcpp/test/components/cooperative_groups.dp.cpp
+++ b/dpcpp/test/components/cooperative_groups.dp.cpp
@@ -198,6 +198,48 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_ballot_call, cg_ballot, default_config_list)
 TEST_P(CooperativeGroups, Ballot) { test_all_subgroup(cg_ballot_call<bool*>); }
 
 
+template <typename cfg>
+void cg_communicator_categorization(bool* s, sycl::nd_item<3> item_ct1)
+{
+    auto this_block = group::this_thread_block(item_ct1);
+    auto tiled_partition =
+        group::tiled_partition<cfg::subgroup_size>(this_block);
+
+    using not_group = int;
+    using this_block_t = decltype(this_block);
+    using tiled_partition_t = decltype(tiled_partition);
+
+    static_assert(!group::is_group<not_group>::value &&
+                      group::is_group<this_block_t>::value &&
+                      group::is_group<tiled_partition_t>::value,
+                  "Group check doesn't work.");
+    static_assert(
+        !group::is_synchronizable_group<not_group>::value &&
+            group::is_synchronizable_group<this_block_t>::value &&
+            group::is_synchronizable_group<tiled_partition_t>::value,
+        "Synchronizable group check doesn't work.");
+    static_assert(
+        !group::is_communicator_group<not_group>::value &&
+            !group::is_communicator_group<this_block_t>::value &&
+            group::is_communicator_group<tiled_partition_t>::value,
+        "Communicator group check doesn't work.");
+    s[this_block.thread_rank()] = true;
+}
+
+GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(cg_communicator_categorization,
+                                    cg_communicator_categorization)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(
+    cg_communicator_categorization, cg_communicator_categorization, DCFG_1D)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_communicator_categorization_call,
+                               cg_communicator_categorization,
+                               default_config_list)
+
+TEST_P(CooperativeGroups, CorrectCategorization)
+{
+    test_all_subgroup(cg_communicator_categorization_call<bool*>);
+}
+
+
 INSTANTIATE_TEST_SUITE_P(DifferentSubgroup, CooperativeGroups,
                          testing::Values(4, 8, 16, 32, 64),
                          testing::PrintToStringParamName());
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index 06a104a8879..0f71550139c 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -242,6 +242,43 @@ TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); }
 TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); }
 
 
+__global__ void cg_communicator_categorization(bool*)
+{
+    auto this_block = group::this_thread_block();
+    auto tiled_partition =
+        group::tiled_partition<config::warp_size>(this_block);
+    auto subwarp_partition = group::tiled_partition<subwarp_size>(this_block);
+
+    using not_group = int;
+    using this_block_t = decltype(this_block);
+    using tiled_partition_t = decltype(tiled_partition);
+    using subwarp_partition_t = decltype(subwarp_partition);
+
+    static_assert(!group::is_group<not_group>::value &&
+                      group::is_group<this_block_t>::value &&
+                      group::is_group<tiled_partition_t>::value &&
+                      group::is_group<subwarp_partition_t>::value,
+                  "Group check doesn't work.");
+    static_assert(
+        !group::is_synchronizable_group<not_group>::value &&
+            group::is_synchronizable_group<this_block_t>::value &&
+            group::is_synchronizable_group<tiled_partition_t>::value &&
+            group::is_synchronizable_group<subwarp_partition_t>::value,
+        "Synchronizable group check doesn't work.");
+    static_assert(
+        !group::is_communicator_group<not_group>::value &&
+            !group::is_communicator_group<this_block_t>::value &&
+            group::is_communicator_group<tiled_partition_t>::value &&
+            group::is_communicator_group<subwarp_partition_t>::value,
+        "Communicator group check doesn't work.");
+}
+
+TEST_F(CooperativeGroups, CorrectCategorization)
+{
+    test(cg_communicator_categorization);
+}
+
+
 template <typename ValueType>
 __global__ void cg_shuffle_sum(const int num, ValueType* __restrict__ value)
 {

From 24461b0433de34333237fab5c56b63a79be8e32b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Thu, 11 Jul 2024 14:13:54 +0200
Subject: [PATCH 096/448] Fix visibility of HIP specialization

---
 .../test/components/cooperative_groups.dp.cpp | 21 ++++++++++---------
 hip/components/cooperative_groups.hip.hpp     |  6 +++---
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp
index 8667a85713e..eadd99a6ac5 100644
--- a/dpcpp/test/components/cooperative_groups.dp.cpp
+++ b/dpcpp/test/components/cooperative_groups.dp.cpp
@@ -213,17 +213,18 @@ void cg_communicator_categorization(bool* s, sycl::nd_item<3> item_ct1)
                       group::is_group<this_block_t>::value &&
                       group::is_group<tiled_partition_t>::value,
                   "Group check doesn't work.");
-    static_assert(
-        !group::is_synchronizable_group<not_group>::value &&
-            group::is_synchronizable_group<this_block_t>::value &&
-            group::is_synchronizable_group<tiled_partition_t>::value,
-        "Synchronizable group check doesn't work.");
-    static_assert(
-        !group::is_communicator_group<not_group>::value &&
-            !group::is_communicator_group<this_block_t>::value &&
-            group::is_communicator_group<tiled_partition_t>::value,
-        "Communicator group check doesn't work.");
+    static_assert(!group::is_synchronizable_group<not_group>::value &&
+                      group::is_synchronizable_group<this_block_t>::value &&
+                      group::is_synchronizable_group<tiled_partition_t>::value,
+                  "Synchronizable group check doesn't work.");
+    static_assert(!group::is_communicator_group<not_group>::value &&
+                      !group::is_communicator_group<this_block_t>::value &&
+                      group::is_communicator_group<tiled_partition_t>::value,
+                  "Communicator group check doesn't work.");
+    // Make it work with the test framework, which performs 3 tests
     s[this_block.thread_rank()] = true;
+    s[this_block.thread_rank() + cfg::subgroup_size] = true;
+    s[this_block.thread_rank() + 2 * cfg::subgroup_size] = true;
 }
 
 GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(cg_communicator_categorization,
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index d3dbc44a5c8..11581db0b0c 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -370,12 +370,12 @@ namespace detail {
 
 
 template <unsigned Size>
-struct is_group_impl<thread_block_tile<Size>> : std::true_type {};
+struct is_group_impl<group::thread_block_tile<Size>> : std::true_type {};
 template <unsigned Size>
-struct is_synchronizable_group_impl<thread_block_tile<Size>> : std::true_type {
+struct is_synchronizable_group_impl<group::thread_block_tile<Size>> : std::true_type {
 };
 template <unsigned Size>
-struct is_communicator_group_impl<thread_block_tile<Size>> : std::true_type {};
+struct is_communicator_group_impl<group::thread_block_tile<Size>> : std::true_type {};
 
 
 }  // namespace detail

From 53a81188071e2280231fa12fdbd4d1c4ea27d8a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Wed, 10 Jul 2024 17:54:53 +0200
Subject: [PATCH 097/448] Fix the communicator group categorization

---
 cuda/components/cooperative_groups.cuh     | 2 +-
 dpcpp/components/cooperative_groups.dp.hpp | 2 +-
 hip/components/cooperative_groups.hip.hpp  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index c4ceca9e409..983ec32f9ac 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -113,7 +113,7 @@ struct is_synchronizable_group_impl : std::false_type {};
 
 
 template <typename T>
-struct is_communicator_group_impl : std::true_type {};
+struct is_communicator_group_impl : std::false_type {};
 
 }  // namespace detail
 
diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp
index c758cf42710..33a107ef3f5 100644
--- a/dpcpp/components/cooperative_groups.dp.hpp
+++ b/dpcpp/components/cooperative_groups.dp.hpp
@@ -101,7 +101,7 @@ struct is_synchronizable_group_impl : std::false_type {};
 
 
 template <typename T>
-struct is_communicator_group_impl : std::true_type {};
+struct is_communicator_group_impl : std::false_type {};
 
 
 }  // namespace detail
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 11581db0b0c..2e5d7c0abff 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -101,7 +101,7 @@ struct is_synchronizable_group_impl : std::false_type {};
 
 
 template <typename T>
-struct is_communicator_group_impl : std::true_type {};
+struct is_communicator_group_impl : std::false_type {};
 
 }  // namespace detail
 

From 605889a539ab21b7b2a41ff465f06e63d3da2df9 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Thu, 11 Jul 2024 12:28:28 +0000
Subject: [PATCH 098/448] Format files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Thomas Grützmacher <thoasm@users.noreply.github.com>
---
 cuda/test/components/cooperative_groups.cu     | 11 +++++------
 hip/components/cooperative_groups.hip.hpp      |  7 ++++---
 hip/test/components/cooperative_groups.hip.cpp | 11 +++++------
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index 077b0121fbd..0b384cd704e 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -246,12 +246,11 @@ __global__ void cg_communicator_categorization(bool*)
             group::is_synchronizable_group<tiled_partition_t>::value &&
             group::is_synchronizable_group<subwarp_partition_t>::value,
         "Synchronizable group check doesn't work.");
-    static_assert(
-        !group::is_communicator_group<not_group>::value &&
-            !group::is_communicator_group<this_block_t>::value &&
-            group::is_communicator_group<tiled_partition_t>::value &&
-            group::is_communicator_group<subwarp_partition_t>::value,
-        "Communicator group check doesn't work.");
+    static_assert(!group::is_communicator_group<not_group>::value &&
+                      !group::is_communicator_group<this_block_t>::value &&
+                      group::is_communicator_group<tiled_partition_t>::value &&
+                      group::is_communicator_group<subwarp_partition_t>::value,
+                  "Communicator group check doesn't work.");
 }
 
 TEST_F(CooperativeGroups, CorrectCategorization)
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 2e5d7c0abff..36618bb7f3e 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -372,10 +372,11 @@ namespace detail {
 template <unsigned Size>
 struct is_group_impl<group::thread_block_tile<Size>> : std::true_type {};
 template <unsigned Size>
-struct is_synchronizable_group_impl<group::thread_block_tile<Size>> : std::true_type {
-};
+struct is_synchronizable_group_impl<group::thread_block_tile<Size>>
+    : std::true_type {};
 template <unsigned Size>
-struct is_communicator_group_impl<group::thread_block_tile<Size>> : std::true_type {};
+struct is_communicator_group_impl<group::thread_block_tile<Size>>
+    : std::true_type {};
 
 
 }  // namespace detail
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index 0f71550139c..bd8c79b9849 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -265,12 +265,11 @@ __global__ void cg_communicator_categorization(bool*)
             group::is_synchronizable_group<tiled_partition_t>::value &&
             group::is_synchronizable_group<subwarp_partition_t>::value,
         "Synchronizable group check doesn't work.");
-    static_assert(
-        !group::is_communicator_group<not_group>::value &&
-            !group::is_communicator_group<this_block_t>::value &&
-            group::is_communicator_group<tiled_partition_t>::value &&
-            group::is_communicator_group<subwarp_partition_t>::value,
-        "Communicator group check doesn't work.");
+    static_assert(!group::is_communicator_group<not_group>::value &&
+                      !group::is_communicator_group<this_block_t>::value &&
+                      group::is_communicator_group<tiled_partition_t>::value &&
+                      group::is_communicator_group<subwarp_partition_t>::value,
+                  "Communicator group check doesn't work.");
 }
 
 TEST_F(CooperativeGroups, CorrectCategorization)

From ffbe84f3500cc39409a1b4398f85e3bd2b511866 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Wed, 3 Jul 2024 11:46:14 +0000
Subject: [PATCH 099/448] Add row and col scaling functions to distributed
 matrix

---
 core/distributed/matrix.cpp                | 50 ++++++++++++
 include/ginkgo/core/distributed/matrix.hpp | 18 +++++
 test/mpi/matrix.cpp                        | 93 ++++++++++++++++++++++
 3 files changed, 161 insertions(+)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 8eee020a3e6..0b9c06f761d 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -8,6 +8,7 @@
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/distributed/matrix_kernels.hpp"
 
@@ -504,6 +505,55 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
 }
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::col_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    GKO_ASSERT_CONFORMANT(this, scaling_factors.get());
+    auto exec = this->get_executor();
+    auto comm = this->get_communicator();
+    size_type n_local_cols = local_mtx_->get_size()[1];
+    size_type n_non_local_cols = non_local_mtx_->get_size()[1];
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_cols,
+        make_const_array_view(exec, n_local_cols,
+                              scaling_factors->get_const_local_values()));
+
+    auto req = this->communicate(scaling_factors->get_local_vector());
+    scale_diag->rapply(local_mtx_, local_mtx_);
+    req.wait();
+    if (n_non_local_cols > 0) {
+        auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+        if (use_host_buffer) {
+            recv_buffer_->copy_from(host_recv_buffer_.get());
+        }
+        const auto non_local_scale_diag =
+            gko::matrix::Diagonal<ValueType>::create_const(
+                exec, n_non_local_cols,
+                make_const_array_view(exec, n_non_local_cols,
+                                      recv_buffer_->get_const_values()));
+        non_local_scale_diag->rapply(non_local_mtx_, non_local_mtx_);
+    }
+}
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::row_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    GKO_ASSERT_EQUAL_ROWS(this, scaling_factors.get());
+    auto exec = this->get_executor();
+    size_type n_local_rows = local_mtx_->get_size()[0];
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_rows,
+        make_const_array_view(exec, n_local_rows,
+                              scaling_factors->get_const_local_values()));
+
+    scale_diag->apply(local_mtx_, local_mtx_);
+    scale_diag->apply(non_local_mtx_, non_local_mtx_);
+}
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 Matrix<ValueType, LocalIndexType, GlobalIndexType>::Matrix(const Matrix& other)
     : EnableDistributedLinOp<Matrix<value_type, local_index_type,
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 9e3d45443b1..1e5e33581a9 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -575,6 +575,24 @@ class Matrix
         std::vector<comm_index_type> recv_offsets,
         array<local_index_type> recv_gather_idxs);
 
+    /**
+     * Scales the columns of the matrix by the respective entries of the vector.
+     * The vector's row partition has to be the same as the matrix's column
+     * partition. The scaling is done in-place.
+     *
+     * @param scaling_factors  The vector containing the scaling factors.
+     */
+    void col_scale(ptr_param<const global_vector_type> scaling_factors);
+
+    /**
+     * Scales the rows of the matrix by the respective entries of the vector.
+     * The vector and the matrix have to have the same row partition.
+     * The scaling is done in-place.
+     *
+     * @param scaling_factors  The vector containing the scaling factors.
+     */
+    void row_scale(ptr_param<const global_vector_type> scaling_factors);
+
 protected:
     explicit Matrix(std::shared_ptr<const Executor> exec,
                     mpi::communicator comm);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index cc9ec219a88..7af6f537fb3 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -516,6 +516,99 @@ TYPED_TEST(Matrix, CanAdvancedApplyToMultipleVectorsLarge)
 }
 
 
+TYPED_TEST(Matrix, CanColScale)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using csr = typename TestFixture::local_matrix_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}, {5}}};
+    I<I<value_type>> res_col_scale_local[] = {
+        {{8, 0}, {0, 0}}, {{0, 10}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_col_scale_non_local[] = {
+        {{2, 0}, {6, 12}}, {{0, 0, 18}, {32, 35, 0}}, {{50, 9}}};
+    auto rank = this->comm.rank();
+    auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm);
+    col_scaling_factors->read_distributed(vec_md, this->col_part);
+
+    this->dist_mat->col_scale(col_scaling_factors);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_col_scale_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_col_scale_non_local[rank], 0);
+}
+
+
+TYPED_TEST(Matrix, CanRowScale)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using csr = typename TestFixture::local_matrix_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}, {5}}};
+    I<I<value_type>> res_row_scale_local[] = {
+        {{2, 0}, {0, 0}}, {{0, 15}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_row_scale_non_local[] = {
+        {{1, 0}, {6, 8}}, {{0, 0, 18}, {32, 28, 0}}, {{50, 45}}};
+    auto rank = this->comm.rank();
+    auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm);
+    row_scaling_factors->read_distributed(vec_md, this->row_part);
+
+    this->dist_mat->row_scale(row_scaling_factors);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_row_scale_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_row_scale_non_local[rank], 0);
+}
+
+
+TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    using part_type = typename TestFixture::part_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}}};
+    auto rank = this->comm.rank();
+    auto col_part = part_type::build_from_mapping(
+        this->exec,
+        gko::array<gko::experimental::distributed::comm_index_type>(
+            this->exec,
+            I<gko::experimental::distributed::comm_index_type>{1, 2, 0, 0}),
+        3);
+    auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm);
+    col_scaling_factors->read_distributed(vec_md, col_part);
+
+    ASSERT_THROW(this->dist_mat->col_scale(col_scaling_factors),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Matrix, RowScaleThrowsOnWrongDimension)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    using part_type = typename TestFixture::part_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}}};
+    auto rank = this->comm.rank();
+    auto row_part = part_type::build_from_contiguous(
+        this->exec,
+        gko::array<index_type>(this->exec, I<index_type>{0, 2, 3, 4}));
+    auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm);
+    row_scaling_factors->read_distributed(vec_md, row_part);
+
+    ASSERT_THROW(this->dist_mat->row_scale(row_scaling_factors),
+                 gko::DimensionMismatch);
+}
+
+
 TYPED_TEST(Matrix, CanConvertToNextPrecision)
 {
     using T = typename TestFixture::value_type;

From 25fa5be961a471354214355f85a773260511df38 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Tue, 9 Jul 2024 08:13:47 +0000
Subject: [PATCH 100/448] Enable row / column scaling with strided vectors

---
 core/distributed/matrix.cpp | 31 +++++++++++++---
 test/mpi/matrix.cpp         | 71 +++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 0b9c06f761d..63f359cc40a 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -510,16 +510,27 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::col_scale(
     ptr_param<const global_vector_type> scaling_factors)
 {
     GKO_ASSERT_CONFORMANT(this, scaling_factors.get());
+    GKO_ASSERT_EQ(scaling_factors->get_size()[1], 1);
     auto exec = this->get_executor();
     auto comm = this->get_communicator();
     size_type n_local_cols = local_mtx_->get_size()[1];
     size_type n_non_local_cols = non_local_mtx_->get_size()[1];
+    std::unique_ptr<global_vector_type> scaling_factors_single_stride;
+    auto stride = scaling_factors->get_stride();
+    if (stride != 1) {
+        scaling_factors_single_stride = global_vector_type::create(exec, comm);
+        scaling_factors_single_stride->copy_from(scaling_factors.get());
+    }
+    const auto scale_values =
+        stride == 1 ? scaling_factors->get_const_local_values()
+                    : scaling_factors_single_stride->get_const_local_values();
     const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
         exec, n_local_cols,
-        make_const_array_view(exec, n_local_cols,
-                              scaling_factors->get_const_local_values()));
+        make_const_array_view(exec, n_local_cols, scale_values));
 
-    auto req = this->communicate(scaling_factors->get_local_vector());
+    auto req = this->communicate(
+        stride == 1 ? scaling_factors->get_local_vector()
+                    : scaling_factors_single_stride->get_local_vector());
     scale_diag->rapply(local_mtx_, local_mtx_);
     req.wait();
     if (n_non_local_cols > 0) {
@@ -542,12 +553,22 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::row_scale(
     ptr_param<const global_vector_type> scaling_factors)
 {
     GKO_ASSERT_EQUAL_ROWS(this, scaling_factors.get());
+    GKO_ASSERT_EQ(scaling_factors->get_size()[1], 1);
     auto exec = this->get_executor();
+    auto comm = this->get_communicator();
     size_type n_local_rows = local_mtx_->get_size()[0];
+    std::unique_ptr<global_vector_type> scaling_factors_single_stride;
+    auto stride = scaling_factors->get_stride();
+    if (stride != 1) {
+        scaling_factors_single_stride = global_vector_type::create(exec, comm);
+        scaling_factors_single_stride->copy_from(scaling_factors.get());
+    }
+    const auto scale_values =
+        stride == 1 ? scaling_factors->get_const_local_values()
+                    : scaling_factors_single_stride->get_const_local_values();
     const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
         exec, n_local_rows,
-        make_const_array_view(exec, n_local_rows,
-                              scaling_factors->get_const_local_values()));
+        make_const_array_view(exec, n_local_rows, scale_values));
 
     scale_diag->apply(local_mtx_, local_mtx_);
     scale_diag->apply(non_local_mtx_, non_local_mtx_);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 7af6f537fb3..454197ccfd9 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -20,6 +20,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
+#include "ginkgo/core/base/exception.hpp"
 #include "test/utils/mpi/common_fixture.hpp"
 
 
@@ -566,6 +567,62 @@ TYPED_TEST(Matrix, CanRowScale)
 }
 
 
+TYPED_TEST(Matrix, CanColScaleWithStride)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using csr = typename TestFixture::local_matrix_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}, {5}}};
+    I<I<value_type>> res_col_scale_local[] = {
+        {{8, 0}, {0, 0}}, {{0, 10}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_col_scale_non_local[] = {
+        {{2, 0}, {6, 12}}, {{0, 0, 18}, {32, 35, 0}}, {{50, 9}}};
+    gko::dim<2> local_sizes[] = {{2, 1}, {2, 1}, {1, 1}};
+    auto rank = this->comm.rank();
+    auto col_scaling_factors = dist_vec_type::create(
+        this->exec, this->comm, gko::dim<2>{5, 1}, local_sizes[rank], 2);
+    col_scaling_factors->read_distributed(vec_md, this->col_part);
+
+    this->dist_mat->col_scale(col_scaling_factors);
+
+    GKO_ASSERT_EQ(col_scaling_factors->get_stride(), 2);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_col_scale_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_col_scale_non_local[rank], 0);
+}
+
+
+TYPED_TEST(Matrix, CanRowScaleWithStride)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::global_index_type;
+    using csr = typename TestFixture::local_matrix_type;
+    using dist_vec_type = typename TestFixture::dist_vec_type;
+    auto vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1}, {2}, {3}, {4}, {5}}};
+    I<I<value_type>> res_row_scale_local[] = {
+        {{2, 0}, {0, 0}}, {{0, 15}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_row_scale_non_local[] = {
+        {{1, 0}, {6, 8}}, {{0, 0, 18}, {32, 28, 0}}, {{50, 45}}};
+    gko::dim<2> local_sizes[] = {{2, 1}, {2, 1}, {1, 1}};
+    auto rank = this->comm.rank();
+    auto row_scaling_factors = dist_vec_type::create(
+        this->exec, this->comm, gko::dim<2>{5, 1}, local_sizes[rank], 2);
+    row_scaling_factors->read_distributed(vec_md, this->row_part);
+
+    this->dist_mat->row_scale(row_scaling_factors);
+
+    GKO_ASSERT_EQ(row_scaling_factors->get_stride(), 2);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_row_scale_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_row_scale_non_local[rank], 0);
+}
+
+
 TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension)
 {
     using value_type = typename TestFixture::value_type;
@@ -574,6 +631,8 @@ TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension)
     using part_type = typename TestFixture::part_type;
     auto vec_md = gko::matrix_data<value_type, index_type>{
         I<I<value_type>>{{1}, {2}, {3}, {4}}};
+    auto two_vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}}};
     auto rank = this->comm.rank();
     auto col_part = part_type::build_from_mapping(
         this->exec,
@@ -583,9 +642,14 @@ TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension)
         3);
     auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm);
     col_scaling_factors->read_distributed(vec_md, col_part);
+    auto two_col_scaling_factors =
+        dist_vec_type::create(this->exec, this->comm);
+    two_col_scaling_factors->read_distributed(two_vec_md, this->col_part);
 
     ASSERT_THROW(this->dist_mat->col_scale(col_scaling_factors),
                  gko::DimensionMismatch);
+    ASSERT_THROW(this->dist_mat->col_scale(two_col_scaling_factors),
+                 gko::ValueMismatch);
 }
 
 
@@ -597,15 +661,22 @@ TYPED_TEST(Matrix, RowScaleThrowsOnWrongDimension)
     using part_type = typename TestFixture::part_type;
     auto vec_md = gko::matrix_data<value_type, index_type>{
         I<I<value_type>>{{1}, {2}, {3}, {4}}};
+    auto two_vec_md = gko::matrix_data<value_type, index_type>{
+        I<I<value_type>>{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}}};
     auto rank = this->comm.rank();
     auto row_part = part_type::build_from_contiguous(
         this->exec,
         gko::array<index_type>(this->exec, I<index_type>{0, 2, 3, 4}));
     auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm);
     row_scaling_factors->read_distributed(vec_md, row_part);
+    auto two_row_scaling_factors =
+        dist_vec_type::create(this->exec, this->comm);
+    two_row_scaling_factors->read_distributed(two_vec_md, this->row_part);
 
     ASSERT_THROW(this->dist_mat->row_scale(row_scaling_factors),
                  gko::DimensionMismatch);
+    ASSERT_THROW(this->dist_mat->row_scale(two_row_scaling_factors),
+                 gko::ValueMismatch);
 }
 
 

From f91b4271de9138da0f7fc469eef1a86af5cc35c3 Mon Sep 17 00:00:00 2001
From: fritzgoebel <goebel.fritz@googlemail.com>
Date: Thu, 11 Jul 2024 15:17:34 +0200
Subject: [PATCH 101/448] Apply suggestions from code review

Co-authored-by: Tobias Ribizel <mail@ribizel.de>
---
 test/mpi/matrix.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 454197ccfd9..1c090b6c43f 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -587,7 +587,7 @@ TYPED_TEST(Matrix, CanColScaleWithStride)
 
     this->dist_mat->col_scale(col_scaling_factors);
 
-    GKO_ASSERT_EQ(col_scaling_factors->get_stride(), 2);
+    ASSERT_EQ(col_scaling_factors->get_stride(), 2);
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_col_scale_local[rank], 0);
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
@@ -615,7 +615,7 @@ TYPED_TEST(Matrix, CanRowScaleWithStride)
 
     this->dist_mat->row_scale(row_scaling_factors);
 
-    GKO_ASSERT_EQ(row_scaling_factors->get_stride(), 2);
+    ASSERT_EQ(row_scaling_factors->get_stride(), 2);
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_row_scale_local[rank], 0);
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),

From 7daba3684a5baad3a3f2603487246f8443637d90 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 30 Jul 2024 09:53:38 +0200
Subject: [PATCH 102/448] fix stride for GCR initialization

---
 common/unified/solver/gcr_kernels.cpp |  5 ++---
 test/solver/gcr_kernels.cpp           | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp
index 0c9e825228a..7adef77dfb1 100644
--- a/common/unified/solver/gcr_kernels.cpp
+++ b/common/unified/solver/gcr_kernels.cpp
@@ -27,7 +27,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 stopping_status* stop_status)
 {
     if (b->get_size()) {
-        run_kernel_solver(
+        run_kernel(
             exec,
             [] GKO_KERNEL(auto row, auto col, auto b, auto residual,
                           auto stop) {
@@ -36,8 +36,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 }
                 residual(row, col) = b(row, col);
             },
-            b->get_size(), b->get_stride(), default_stride(b),
-            default_stride(residual), stop_status);
+            b->get_size(), b, residual, stop_status);
     } else {
         run_kernel(
             exec, [] GKO_KERNEL(auto col, auto stop) { stop[col].reset(); },
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index 5a46bbbb940..eb3f5c6df93 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -157,6 +157,22 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef)
 }
 
 
+TEST_F(Gcr, GcrKernelInitializeWithStrideIsEquivalentToRef)
+{
+    initialize_data();
+    auto d_b_strided = Mtx::create(exec, b->get_size(), b->get_stride() + 2);
+    d_b_strided->copy_from(d_b);
+
+    gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(),
+                                             stop_status.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize(
+        exec, d_b_strided.get(), d_residual.get(), d_stop_status.get_data());
+
+    GKO_ASSERT_MTX_NEAR(d_residual, residual, r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status);
+}
+
+
 TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef)
 {
     initialize_data();

From c589751872cd906457250fb1d208babf7057d694 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?=
 <1289205+lahwaacz@users.noreply.github.com>
Date: Tue, 6 Aug 2024 18:57:04 +0200
Subject: [PATCH 103/448] Include missing iomanip header in solver_progress.cpp
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, compiling with g++ 14.2.1 fails with the following error:

```
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp: In member function ‘void SolverProgress_TableWorks_Test<gtest_TypeParam_>::TestBody()’:
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:82:20: error: ‘setw’ is not a member of ‘std’
   82 |     ref_ss << std::setw(default_column_width) << "Iteration"
      |                    ^~~~
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:15:1: note: ‘std::setw’ is defined in header ‘<iomanip>’; this is probably fixable by adding ‘#include <iomanip>’
   14 | #include "core/test/utils.hpp"
  +++ |+#include <iomanip>
   15 | #include "core/test/utils/assertions.hpp"
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:83:20: error: ‘setw’ is not a member of ‘std’
   83 |            << std::setw(default_column_width) << "beta"
      |                    ^~~~
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:83:20: note: ‘std::setw’ is defined in header ‘<iomanip>’; this is probably fixable by adding ‘#include <iomanip>’
/build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:84:20: error: ‘setw’ is not a member of ‘std’
   84 |            << std::setw(default_column_width) << "prev_rho"
      |                    ^~~~
...
```
---
 core/test/log/solver_progress.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp
index fe8a4537f66..e00044a908d 100644
--- a/core/test/log/solver_progress.cpp
+++ b/core/test/log/solver_progress.cpp
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <iomanip>
 #include <regex>
 
 #include <gtest/gtest.h>

From f3e68711e6d8cbd1e9985e79a9978ef0f56cdaea Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 29 Apr 2024 11:49:17 +0200
Subject: [PATCH 104/448] add simplified segmented range feature

---
 core/base/segmented_range.hpp      | 348 +++++++++++++++++++++++++++++
 core/test/base/CMakeLists.txt      |   1 +
 core/test/base/segmented_range.cpp |  84 +++++++
 3 files changed, 433 insertions(+)
 create mode 100644 core/base/segmented_range.hpp
 create mode 100644 core/test/base/segmented_range.cpp

diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
new file mode 100644
index 00000000000..afe04b60b69
--- /dev/null
+++ b/core/base/segmented_range.hpp
@@ -0,0 +1,348 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_BASE_SEGMENTED_RANGE_HPP_
+#define GKO_CORE_BASE_SEGMENTED_RANGE_HPP_
+
+
+#include <iterator>
+#include <type_traits>
+
+
+#include "core/base/index_range.hpp"
+#include "core/base/iterator_factory.hpp"
+
+
+namespace gko {
+
+
+/**
+ * Represents a range of indices that is segmented into contiguous segments.
+ * Each segment has the shape `[begin, end)`, i.e. it is a half-open interval.
+ *
+ * @tparam IndexType  the type of indices used to represent the segments.
+ */
+template <typename IndexType>
+class segmented_range {
+public:
+    using index_type = IndexType;
+    using index_iterator = index_iterator<index_type>;
+    using segment = irange<index_type>;
+
+    /**
+     * An iterator pointing to (or past) a single segment in the range.
+     */
+    class iterator {
+    public:
+        constexpr explicit iterator(segmented_range range, index_type segment)
+            : range_{range}, segment_{segment}
+        {}
+
+        constexpr segment operator*() const
+        {
+            assert(segment_ >= 0);
+            assert(segment_ < range_.num_segments());
+            return segment{range_.begin_index(segment_),
+                           range_.end_index(segment_)};
+        }
+
+        constexpr iterator& operator++()
+        {
+            ++segment_;
+            return *this;
+        }
+
+        constexpr friend bool operator==(iterator lhs, iterator rhs)
+        {
+            assert(lhs.range_ == rhs.range_);
+            return lhs.segment_ == rhs.segment_;
+        }
+
+        constexpr friend bool operator!=(iterator lhs, iterator rhs)
+        {
+            return !(lhs == rhs);
+        }
+
+    private:
+        segmented_range range_;
+        index_type segment_;
+    };
+
+    /**
+     * Constructs a segmented range from separate begin and end pointers.
+     * The `i`th range is given by `[begins[i], ends[i])`.
+     *
+     * @param begins  a pointer to the array of beginning indices
+     * @param ends  a pointer to the array of end indices
+     * @param num_segments  the number of segments, i.e. the size of the
+     *                      beginning and end index arrays.
+     */
+    constexpr explicit segmented_range(const index_type* begins,
+                                       const index_type* ends,
+                                       index_type num_segments)
+        : begins_{begins}, ends_{ends}, num_segments_{num_segments}
+    {
+        assert(num_segments_ >= 0);
+    }
+
+    /**
+     * Constructs a segmented range from combined begin and end pointers.
+     * The `i`th range is given by `[ptrs[i], ptrs[i + 1])`.
+     *
+     * @param ptrs  a pointer to the array of beginning and end indices
+     * @param num_segments  the number of segments, i.e. the size of the
+     *                      ptrs index arrays.
+     */
+    constexpr explicit segmented_range(const index_type* ptrs,
+                                       index_type num_segments)
+        : segmented_range{ptrs, ptrs + 1, num_segments}
+    {}
+
+    /**
+     * Returns the segment at a given index.
+     *
+     * @param segment  the index to access. It must be in `[0, num_segments())`.
+     * @return  the segment at this index.
+     */
+    constexpr segment operator[](index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return *iterator{*this, segment};
+    }
+
+    /** @return the number of segments in this range. */
+    constexpr index_type num_segments() const { return num_segments_; }
+
+    /** @return iterator pointing to the first segment. */
+    constexpr iterator begin() const { return iterator{*this, 0}; }
+
+    /** @return iterator pointing one past the last segment. */
+    constexpr iterator end() const { return iterator{*this, num_segments()}; }
+
+    /** @return pointer to the array of segment beginning indices. */
+    constexpr const index_type* begin_indices() const { return begins_; }
+
+    /** @return pointer to the array of segment end indices. */
+    constexpr const index_type* end_indices() const { return ends_; }
+
+    /** @return the beginning index of the given segment. */
+    constexpr index_type begin_index(index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return begin_indices()[segment];
+    }
+
+    /** @return the end index of the given segment. */
+    constexpr index_type end_index(index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return end_indices()[segment];
+    }
+
+    /** Compares two ranges for equality. */
+    constexpr friend bool operator==(segmented_range lhs, segmented_range rhs)
+    {
+        return lhs.begin_indices() == rhs.begin_indices() &&
+               lhs.end_indices() == rhs.end_indices() &&
+               lhs.num_segments() == rhs.num_segments();
+    }
+
+    /** Compares two ranges for inequality. */
+    constexpr friend bool operator!=(segmented_range lhs, segmented_range rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+private:
+    const index_type* begins_;
+    const index_type* ends_;
+    index_type num_segments_;
+};
+
+
+/**
+ * Represents a range of indices that is segmented into contiguous segments,
+ * mapped into a value array. Each segment has the shape `[begin, end)`, i.e. it
+ * is a half-open interval and points to corresponding entries of the value
+ * array.
+ *
+ * @tparam IndexType  the type of indices used to represent the segments.
+ * @tparam ValueIterator  the iterator type pointing to the values.
+ */
+template <typename IndexType, typename ValueIterator>
+class segmented_value_range {
+public:
+    using index_type = IndexType;
+    using index_iterator = index_iterator<index_type>;
+    using value_iterator = ValueIterator;
+    using segment = iterator_range<ValueIterator>;
+    using enumerated_range = segmented_value_range<
+        index_type, detail::zip_iterator<index_iterator, value_iterator>>;
+
+    /**
+     * An iterator pointing to (or past) a single segment in the range.
+     */
+    class iterator {
+    public:
+        constexpr explicit iterator(segmented_value_range range,
+                                    index_type segment)
+            : range_{range}, segment_{segment}
+        {}
+
+        constexpr segment operator*() const
+        {
+            assert(segment_ >= 0);
+            assert(segment_ < range_.num_segments());
+            return segment{range_.values() + range_.begin_index(segment_),
+                           range_.values() + range_.end_index(segment_)};
+        }
+
+        constexpr iterator& operator++()
+        {
+            ++segment_;
+            return *this;
+        }
+
+        constexpr friend bool operator==(iterator lhs, iterator rhs)
+        {
+            assert(lhs.range_ == rhs.range_);
+            return lhs.segment_ == rhs.segment_;
+        }
+
+        constexpr friend bool operator!=(iterator lhs, iterator rhs)
+        {
+            return !(lhs == rhs);
+        }
+
+    private:
+        segmented_value_range range_;
+        index_type segment_;
+    };
+
+    /**
+     * Constructs a segmented values range from separate begin and end pointers.
+     * The `i`th range is given by `[begins[i], ends[i])`.
+     *
+     * @param begins  a pointer to the array of beginning indices
+     * @param ends  a pointer to the array of end indices
+     * @param values  an iterator pointing to the values into which the
+     *                beginning/end indices point.
+     * @param num_segments  the number of segments, i.e. the size of the
+     *                      beginning and end index arrays.
+     */
+    constexpr explicit segmented_value_range(const index_type* begins,
+                                             const index_type* ends,
+                                             value_iterator values,
+                                             index_type num_segments)
+        : begins_{begins},
+          ends_{ends},
+          values_{values},
+          num_segments_{num_segments}
+    {
+        assert(num_segments_ >= 0);
+    }
+
+    /**
+     * Constructs a segmented value range from combined begin and end pointers.
+     * The `i`th range is given by `[ptrs[i], ptrs[i + 1])`.
+     *
+     * @param ptrs  a pointer to the array of beginning and end indices
+     * @param values  an iterator pointing to the values into which the
+     *                beginning/end indices point.
+     * @param num_segments  the number of segments; `ptrs` must point to
+     *                      `num_segments + 1` entries.
+     */
+    constexpr explicit segmented_value_range(const index_type* ptrs,
+                                             value_iterator values,
+                                             index_type num_segments)
+        : segmented_value_range{ptrs, ptrs + 1, values, num_segments}
+    {}
+
+    /**
+     * Returns the segment at a given index.
+     *
+     * @param segment  the index to access. It must be in `[0, num_segments())`.
+     * @return  the segment at this index.
+     */
+    constexpr segment operator[](index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return *iterator{*this, segment};
+    }
+
+    /** @return the number of segments in this range. */
+    constexpr index_type num_segments() const { return num_segments_; }
+
+    constexpr enumerated_range enumerated() const
+    {
+        return enumerated_range{
+            begin_indices(), end_indices(),
+            detail::make_zip_iterator(index_iterator{0}, values()),
+            num_segments()};
+    }
+
+    /** @return iterator pointing to the first segment. */
+    constexpr iterator begin() const { return iterator{*this, 0}; }
+
+    /** @return iterator pointing one past the last segment. */
+    constexpr iterator end() const { return iterator{*this, num_segments()}; }
+
+    /** @return pointer to the array of segment beginning indices. */
+    constexpr const index_type* begin_indices() const { return begins_; }
+
+    /** @return pointer to the array of segment end indices. */
+    constexpr const index_type* end_indices() const { return ends_; }
+
+    /** @return the beginning index of the given segment. */
+    constexpr index_type begin_index(index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return begin_indices()[segment];
+    }
+
+    /** @return the end index of the given segment. */
+    constexpr index_type end_index(index_type segment) const
+    {
+        assert(segment >= 0);
+        assert(segment < num_segments());
+        return end_indices()[segment];
+    }
+
+    /** @return the value iterator. */
+    constexpr value_iterator values() const { return values_; }
+
+    /** Compares two ranges for equality. */
+    constexpr friend bool operator==(segmented_value_range lhs,
+                                     segmented_value_range rhs)
+    {
+        return lhs.begin_indices() == rhs.begin_indices() &&
+               lhs.end_indices() == rhs.end_indices() &&
+               lhs.values() == rhs.values() &&
+               lhs.num_segments() == rhs.num_segments();
+    }
+
+    /** Compares two ranges for inequality. */
+    constexpr friend bool operator!=(segmented_value_range lhs,
+                                     segmented_value_range rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+private:
+    const index_type* begins_;
+    const index_type* ends_;
+    value_iterator values_;
+    index_type num_segments_;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_SEGMENTED_RANGE_HPP_
diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index c608acd9a8a..d7deeec6fb7 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -27,6 +27,7 @@ ginkgo_create_test(range)
 ginkgo_create_test(range_accessors)
 ginkgo_create_test(sanitizers ADDITIONAL_LIBRARIES Threads::Threads)
 ginkgo_create_test(segmented_array)
+ginkgo_create_test(segmented_range)
 ginkgo_create_test(types)
 ginkgo_create_test(utils)
 ginkgo_create_test(version EXECUTABLE_NAME version_test) # version collides with C++ stdlib header
diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
new file mode 100644
index 00000000000..33c2941d4dd
--- /dev/null
+++ b/core/test/base/segmented_range.cpp
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/base/segmented_range.hpp"
+
+
+TEST(SegmentedRange, Works)
+{
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<std::vector<int>> result_indices(begins.size());
+    gko::segmented_range<int> range{begins.data(), ends.data(),
+                                    static_cast<int>(begins.size())};
+
+    for (auto row : gko::irange<int>(begins.size())) {
+        for (auto nz : range[row]) {
+            result_indices[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_indices,
+              std::vector<std::vector<int>>(
+                  {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}}));
+}
+
+
+TEST(SegmentedValueRange, Works)
+{
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<int> values(ends.back());
+    std::iota(values.begin(), values.end(), 1);
+    std::vector<std::vector<int>> result_values(begins.size());
+    gko::segmented_value_range<int, std::vector<int>::iterator> range{
+        begins.data(), ends.data(), values.begin(),
+        static_cast<int>(begins.size())};
+
+    for (auto row : gko::irange<int>(begins.size())) {
+        for (auto nz : range[row]) {
+            result_values[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_values,
+              std::vector<std::vector<int>>(
+                  {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}}));
+}
+
+
+TEST(SegmentedEnumeratedValueRange, Works)
+{
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<int> values(ends.back());
+    std::iota(values.begin(), values.end(), 1);
+    std::vector<std::vector<int>> result_values(begins.size());
+    std::vector<std::vector<int>> result_indices(begins.size());
+    gko::segmented_value_range<int, std::vector<int>::iterator> range{
+        begins.data(), ends.data(), values.begin(),
+        static_cast<int>(begins.size())};
+
+    for (auto row : gko::irange<int>(begins.size())) {
+        for (auto tuple : range.enumerated()[row]) {
+            result_indices[row].push_back(std::get<0>(tuple));
+            result_values[row].push_back(std::get<1>(tuple));
+        }
+    }
+
+    ASSERT_EQ(result_indices,
+              std::vector<std::vector<int>>(
+                  {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}}));
+    ASSERT_EQ(result_values,
+              std::vector<std::vector<int>>(
+                  {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}}));
+}

From 11eded47944d476256157e7b6fb5fdfbb668e6fb Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 29 Apr 2024 11:49:31 +0200
Subject: [PATCH 105/448] add structured binding support

---
 core/base/segmented_range.hpp      | 29 ++++++++++----
 core/test/base/segmented_range.cpp | 61 ++++++++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
index afe04b60b69..344de0aa623 100644
--- a/core/base/segmented_range.hpp
+++ b/core/base/segmented_range.hpp
@@ -39,12 +39,18 @@ class segmented_range {
             : range_{range}, segment_{segment}
         {}
 
-        constexpr segment operator*() const
+        struct enumerated_segment {
+            index_type index;
+            segment segment;
+        };
+
+        constexpr enumerated_segment operator*() const
         {
             assert(segment_ >= 0);
             assert(segment_ < range_.num_segments());
-            return segment{range_.begin_index(segment_),
-                           range_.end_index(segment_)};
+            return enumerated_segment{segment_,
+                                      segment{range_.begin_index(segment_),
+                                              range_.end_index(segment_)}};
         }
 
         constexpr iterator& operator++()
@@ -109,7 +115,7 @@ class segmented_range {
     {
         assert(segment >= 0);
         assert(segment < num_segments());
-        return *iterator{*this, segment};
+        return (*iterator{*this, segment}).segment;
     }
 
     /** @return the number of segments in this range. */
@@ -193,12 +199,19 @@ class segmented_value_range {
             : range_{range}, segment_{segment}
         {}
 
-        constexpr segment operator*() const
+        struct enumerated_segment {
+            index_type index;
+            segment segment;
+        };
+
+        constexpr enumerated_segment operator*() const
         {
             assert(segment_ >= 0);
             assert(segment_ < range_.num_segments());
-            return segment{range_.values() + range_.begin_index(segment_),
-                           range_.values() + range_.end_index(segment_)};
+            return enumerated_segment{
+                segment_,
+                segment{range_.values() + range_.begin_index(segment_),
+                        range_.values() + range_.end_index(segment_)}};
         }
 
         constexpr iterator& operator++()
@@ -272,7 +285,7 @@ class segmented_value_range {
     {
         assert(segment >= 0);
         assert(segment < num_segments());
-        return *iterator{*this, segment};
+        return (*iterator{*this, segment}).segment;
     }
 
     /** @return the number of segments in this range. */
diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
index 33c2941d4dd..5de04c3035a 100644
--- a/core/test/base/segmented_range.cpp
+++ b/core/test/base/segmented_range.cpp
@@ -33,7 +33,7 @@ TEST(SegmentedRange, Works)
 }
 
 
-TEST(SegmentedValueRange, Works)
+TEST(SegmentedValueRange, WorksByIndex)
 {
     std::vector<int> begins{3, 1, 4, 9};
     std::vector<int> ends{3, 10, 6, 10};
@@ -56,8 +56,32 @@ TEST(SegmentedValueRange, Works)
 }
 
 
-TEST(SegmentedEnumeratedValueRange, Works)
+TEST(SegmentedValueRange, WorksByRangeFor)
 {
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<int> values(ends.back());
+    std::iota(values.begin(), values.end(), 1);
+    std::vector<std::vector<int>> result_values(begins.size());
+    gko::segmented_value_range<int, std::vector<int>::iterator> range{
+        begins.data(), ends.data(), values.begin(),
+        static_cast<int>(begins.size())};
+
+    for (auto [row, segment] : range) {
+        for (auto nz : segment) {
+            result_values[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_values,
+              std::vector<std::vector<int>>(
+                  {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}}));
+}
+
+
+TEST(SegmentedEnumeratedValueRange, WorksByIndex)
+{
+    using gko::get;
     std::vector<int> begins{3, 1, 4, 9};
     std::vector<int> ends{3, 10, 6, 10};
     std::vector<int> values(ends.back());
@@ -70,8 +94,37 @@ TEST(SegmentedEnumeratedValueRange, Works)
 
     for (auto row : gko::irange<int>(begins.size())) {
         for (auto tuple : range.enumerated()[row]) {
-            result_indices[row].push_back(std::get<0>(tuple));
-            result_values[row].push_back(std::get<1>(tuple));
+            result_indices[row].push_back(get<0>(tuple));
+            result_values[row].push_back(get<1>(tuple));
+        }
+    }
+
+    ASSERT_EQ(result_indices,
+              std::vector<std::vector<int>>(
+                  {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}}));
+    ASSERT_EQ(result_values,
+              std::vector<std::vector<int>>(
+                  {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}}));
+}
+
+
+TEST(SegmentedEnumeratedValueRange, WorksByRangeFor)
+{
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<int> values(ends.back());
+    std::iota(values.begin(), values.end(), 1);
+    std::vector<std::vector<int>> result_values(begins.size());
+    std::vector<std::vector<int>> result_indices(begins.size());
+    gko::segmented_value_range<int, std::vector<int>::iterator> range{
+        begins.data(), ends.data(), values.begin(),
+        static_cast<int>(begins.size())};
+    auto enumerated_range = range.enumerated();
+
+    for (auto [row, segment] : enumerated_range) {
+        for (auto [index, value] : segment) {
+            result_indices[row].push_back(index);
+            result_values[row].push_back(value);
         }
     }
 

From 4b1be60faccb68c78f2cf226414486a7bdeb2f8a Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 13:28:36 +0200
Subject: [PATCH 106/448] review updates

- fix name hiding in classes
- add tests for ptrs constructors

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/base/segmented_range.hpp      | 38 ++++++++++++------
 core/test/base/segmented_range.cpp | 64 ++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 16 deletions(-)

diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
index 344de0aa623..3820dbfb06f 100644
--- a/core/base/segmented_range.hpp
+++ b/core/base/segmented_range.hpp
@@ -27,8 +27,8 @@ template <typename IndexType>
 class segmented_range {
 public:
     using index_type = IndexType;
-    using index_iterator = index_iterator<index_type>;
-    using segment = irange<index_type>;
+    using index_iterator_type = index_iterator<index_type>;
+    using segment_type = irange<index_type>;
 
     /**
      * An iterator pointing to (or past) a single segment in the range.
@@ -41,7 +41,7 @@ class segmented_range {
 
         struct enumerated_segment {
             index_type index;
-            segment segment;
+            segment_type segment;
         };
 
         constexpr enumerated_segment operator*() const
@@ -49,8 +49,8 @@ class segmented_range {
             assert(segment_ >= 0);
             assert(segment_ < range_.num_segments());
             return enumerated_segment{segment_,
-                                      segment{range_.begin_index(segment_),
-                                              range_.end_index(segment_)}};
+                                      segment_type{range_.begin_index(segment_),
+                                                   range_.end_index(segment_)}};
         }
 
         constexpr iterator& operator++()
@@ -111,7 +111,7 @@ class segmented_range {
      * @param segment  the index to access. It must be in `[0, num_segments())`.
      * @return  the segment at this index.
      */
-    constexpr segment operator[](index_type segment) const
+    constexpr segment_type operator[](index_type segment) const
     {
         assert(segment >= 0);
         assert(segment < num_segments());
@@ -121,6 +121,12 @@ class segmented_range {
     /** @return the number of segments in this range. */
     constexpr index_type num_segments() const { return num_segments_; }
 
+    /** @return an index range representing all segment indices. */
+    constexpr irange<index_type> segment_indices() const
+    {
+        return irange<index_type>{num_segments()};
+    }
+
     /** @return iterator pointing to the first segment. */
     constexpr iterator begin() const { return iterator{*this, 0}; }
 
@@ -183,11 +189,11 @@ template <typename IndexType, typename ValueIterator>
 class segmented_value_range {
 public:
     using index_type = IndexType;
-    using index_iterator = index_iterator<index_type>;
+    using index_iterator_type = index_iterator<index_type>;
     using value_iterator = ValueIterator;
-    using segment = iterator_range<ValueIterator>;
+    using segment_type = iterator_range<ValueIterator>;
     using enumerated_range = segmented_value_range<
-        index_type, detail::zip_iterator<index_iterator, value_iterator>>;
+        index_type, detail::zip_iterator<index_iterator_type, value_iterator>>;
 
     /**
      * An iterator pointing to (or past) a single segment in the range.
@@ -201,7 +207,7 @@ class segmented_value_range {
 
         struct enumerated_segment {
             index_type index;
-            segment segment;
+            segment_type segment;
         };
 
         constexpr enumerated_segment operator*() const
@@ -210,8 +216,8 @@ class segmented_value_range {
             assert(segment_ < range_.num_segments());
             return enumerated_segment{
                 segment_,
-                segment{range_.values() + range_.begin_index(segment_),
-                        range_.values() + range_.end_index(segment_)}};
+                segment_type{range_.values() + range_.begin_index(segment_),
+                             range_.values() + range_.end_index(segment_)}};
         }
 
         constexpr iterator& operator++()
@@ -281,7 +287,7 @@ class segmented_value_range {
      * @param segment  the index to access. It must be in `[0, num_segments())`.
      * @return  the segment at this index.
      */
-    constexpr segment operator[](index_type segment) const
+    constexpr segment_type operator[](index_type segment) const
     {
         assert(segment >= 0);
         assert(segment < num_segments());
@@ -291,6 +297,12 @@ class segmented_value_range {
     /** @return the number of segments in this range. */
     constexpr index_type num_segments() const { return num_segments_; }
 
+    /** @return an index range representing all segment indices. */
+    constexpr irange<index_type> segment_indices() const
+    {
+        return irange<index_type>{num_segments()};
+    }
+
     constexpr enumerated_range enumerated() const
     {
         return enumerated_range{
diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
index 5de04c3035a..76f5ae8ffcc 100644
--- a/core/test/base/segmented_range.cpp
+++ b/core/test/base/segmented_range.cpp
@@ -13,7 +13,7 @@
 #include "core/base/segmented_range.hpp"
 
 
-TEST(SegmentedRange, Works)
+TEST(SegmentedRange, WorksByIndex)
 {
     std::vector<int> begins{3, 1, 4, 9};
     std::vector<int> ends{3, 10, 6, 10};
@@ -21,7 +21,7 @@ TEST(SegmentedRange, Works)
     gko::segmented_range<int> range{begins.data(), ends.data(),
                                     static_cast<int>(begins.size())};
 
-    for (auto row : gko::irange<int>(begins.size())) {
+    for (auto row : range.segment_indices()) {
         for (auto nz : range[row]) {
             result_indices[row].push_back(nz);
         }
@@ -33,6 +33,44 @@ TEST(SegmentedRange, Works)
 }
 
 
+TEST(SegmentedRange, WorksByRangeFor)
+{
+    std::vector<int> begins{3, 1, 4, 9};
+    std::vector<int> ends{3, 10, 6, 10};
+    std::vector<std::vector<int>> result_indices(begins.size());
+    gko::segmented_range<int> range{begins.data(), ends.data(),
+                                    static_cast<int>(begins.size())};
+
+    for (auto [row, segment] : range) {
+        for (auto nz : segment) {
+            result_indices[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_indices,
+              std::vector<std::vector<int>>(
+                  {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}}));
+}
+
+
+TEST(SegmentedRange, WorksWithPtrsConstructor)
+{
+    std::vector<int> ptrs{0, 2, 4, 5, 9};
+    std::vector<std::vector<int>> result_indices(ptrs.size() - 1);
+    gko::segmented_range<int> range{ptrs.data(),
+                                    static_cast<int>(ptrs.size() - 1)};
+
+    for (auto row : range.segment_indices()) {
+        for (auto nz : range[row]) {
+            result_indices[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_indices, std::vector<std::vector<int>>(
+                                  {{0, 1}, {2, 3}, {4}, {5, 6, 7, 8}}));
+}
+
+
 TEST(SegmentedValueRange, WorksByIndex)
 {
     std::vector<int> begins{3, 1, 4, 9};
@@ -79,6 +117,26 @@ TEST(SegmentedValueRange, WorksByRangeFor)
 }
 
 
+TEST(SegmentedValueRange, WorksWithPtrsConstructor)
+{
+    std::vector<int> ptrs{0, 2, 4, 5, 9};
+    std::vector<int> values(ptrs.back());
+    std::iota(values.begin(), values.end(), 1);
+    std::vector<std::vector<int>> result_values(ptrs.size() - 1);
+    gko::segmented_value_range<int, std::vector<int>::iterator> range{
+        ptrs.data(), values.begin(), static_cast<int>(ptrs.size() - 1)};
+
+    for (auto row : range.segment_indices()) {
+        for (auto nz : range[row]) {
+            result_values[row].push_back(nz);
+        }
+    }
+
+    ASSERT_EQ(result_values, std::vector<std::vector<int>>(
+                                 {{1, 2}, {3, 4}, {5}, {6, 7, 8, 9}}));
+}
+
+
 TEST(SegmentedEnumeratedValueRange, WorksByIndex)
 {
     using gko::get;
@@ -92,7 +150,7 @@ TEST(SegmentedEnumeratedValueRange, WorksByIndex)
         begins.data(), ends.data(), values.begin(),
         static_cast<int>(begins.size())};
 
-    for (auto row : gko::irange<int>(begins.size())) {
+    for (auto row : range.segment_indices()) {
         for (auto tuple : range.enumerated()[row]) {
             result_indices[row].push_back(get<0>(tuple));
             result_values[row].push_back(get<1>(tuple));

From 6b2f01997d3ee7eace6bd149f4e651f37f5a307b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 13:33:22 +0200
Subject: [PATCH 107/448] add documentation

---
 core/base/iterator_factory.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index de5af49e24f..54e7fecb94e 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -710,6 +710,7 @@ permute_iterator<IteratorType, PermutationFn> make_permute_iterator(
 }  // namespace detail
 
 
+/** std::get reimplementation for device_tuple. */
 template <std::size_t index, typename... Ts>
 constexpr typename std::tuple_element<index, detail::device_tuple<Ts...>>::type&
 get(detail::device_tuple<Ts...>& tuple)
@@ -718,6 +719,7 @@ get(detail::device_tuple<Ts...>& tuple)
 }
 
 
+/** std::get reimplementation for const device_tuple. */
 template <std::size_t index, typename... Ts>
 constexpr const typename std::tuple_element<index,
                                             detail::device_tuple<Ts...>>::type&

From e1de865f01c4f7f0194049d8a6c4cdd90c61dcc6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 13:51:01 +0200
Subject: [PATCH 108/448] add assertion tests for segmented ranges

---
 core/test/base/segmented_range.cpp | 62 ++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
index 76f5ae8ffcc..63079bc2e3f 100644
--- a/core/test/base/segmented_range.cpp
+++ b/core/test/base/segmented_range.cpp
@@ -193,3 +193,65 @@ TEST(SegmentedEnumeratedValueRange, WorksByRangeFor)
               std::vector<std::vector<int>>(
                   {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}}));
 }
+
+
+#ifndef NDEBUG
+
+
+bool check_assertion_exit_code(int exit_code)
+{
+#ifdef _MSC_VER
+    // MSVC picks up the exit code incorrectly,
+    // so we can only check that it exits
+    return true;
+#else
+    return exit_code != 0;
+#endif
+}
+
+
+TEST(DeathTest, Assertions)
+{
+    using range_t = gko::segmented_range<int>;
+    using vrange_t = gko::segmented_value_range<int, int*>;
+    using range_it_t = range_t::iterator;
+    using vrange_it_t = vrange_t::iterator;
+    std::vector<int> ptrs{0, 1};
+    std::vector<int> values{0, 1};
+    range_t range{ptrs.data(), static_cast<int>(ptrs.size() - 1)};
+    range_t range2{ptrs.data(), 0};
+    vrange_t vrange{ptrs.data(), values.data(),
+                    static_cast<int>(ptrs.size() - 1)};
+    vrange_t vrange2{ptrs.data(), values.data(), 0};
+    // gko::segmented_range::iterator
+    EXPECT_EXIT((void)*(range_it_t{range, -1}), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)*(range_it_t{range, 1}), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(range_it_t{range, 0} == range_it_t{range2, 0}),
+                check_assertion_exit_code, "");
+    // gko::segmented_range
+    EXPECT_EXIT((void)(range_t{nullptr, -1}), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range[-1], check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range[1], check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range.begin_index(-1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range.begin_index(1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range.end_index(-1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)range.end_index(1), check_assertion_exit_code, "");
+    // gko::segmented_value_range::iterator
+    EXPECT_EXIT((void)*(vrange_it_t{vrange, -1}), check_assertion_exit_code,
+                "");
+    EXPECT_EXIT((void)*(vrange_it_t{vrange, 1}), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(vrange_it_t{vrange, 0} == vrange_it_t{vrange2, 0}),
+                check_assertion_exit_code, "");
+    // gko::segmented_value_range
+    EXPECT_EXIT((void)(vrange_t{nullptr, nullptr, -1}),
+                check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange[-1], check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange[1], check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange.begin_index(-1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange.begin_index(1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange.end_index(-1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)vrange.end_index(1), check_assertion_exit_code, "");
+}
+
+
+#endif

From bc09d636a67e6634b220ccd74cfc3f86ec16a1dc Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 14:17:58 +0200
Subject: [PATCH 109/448] test segmented ranges on the device

---
 test/base/CMakeLists.txt      |  1 +
 test/base/segmented_range.cpp | 69 +++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 test/base/segmented_range.cpp

diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt
index 5f31c25db19..bc2ea73620f 100644
--- a/test/base/CMakeLists.txt
+++ b/test/base/CMakeLists.txt
@@ -4,4 +4,5 @@ ginkgo_create_common_device_test(index_range)
 ginkgo_create_common_device_test(iterator_factory)
 ginkgo_create_common_device_test(kernel_launch_generic)
 ginkgo_create_common_and_reference_test(executor)
+ginkgo_create_common_device_test(segmented_range)
 ginkgo_create_common_and_reference_test(timer)
diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp
new file mode 100644
index 00000000000..54c491a8493
--- /dev/null
+++ b/test/base/segmented_range.cpp
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/base/segmented_range.hpp"
+#include "core/test/utils.hpp"
+#include "test/utils/executor.hpp"
+
+
+class SegmentedRange : public CommonTestFixture {
+public:
+    SegmentedRange()
+        : ptrs{exec, {0, 0, 1, 3, 4, 9}},
+          values{exec, {1, 2, 3, 4, 5, 6, 7, 8, 9}},
+          output{exec, 2 * values.get_size()}
+    {}
+
+    gko::array<int> ptrs;
+    gko::array<int> values;
+    gko::array<int> output;
+};
+
+
+// nvcc doesn't like device lambdas declared in complex classes, move it out
+void run_segmented_range(std::shared_ptr<gko::EXEC_TYPE> exec,
+                         const gko::array<int>& ptrs,
+                         const gko::array<int>& values, gko::array<int>& output)
+{
+    gko::kernels::EXEC_NAMESPACE::run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto ptrs, auto values, auto output, auto size) {
+            gko::segmented_range<int> range{ptrs, size};
+            for (auto [row, segment] : range) {
+                for (auto nz : segment) {
+                    output[nz] = row;
+                }
+            }
+            auto num_values = ptrs[size];
+            gko::segmented_value_range<int, const int*> vrange{ptrs, values,
+                                                               size};
+            for (auto [row, segment] : vrange.enumerated()) {
+                for (auto [nz, value] : segment) {
+                    output[nz + num_values] = row * 10 + value;
+                }
+            }
+        },
+        1, ptrs, values, output, static_cast<int>(ptrs.get_size() - 1));
+}
+
+
+TEST_F(SegmentedRange, KernelRunsSegmentedRange)
+{
+    gko::array<int> expected{
+        ref, {1, 2, 2, 3, 4, 4, 4, 4, 4, 11, 22, 23, 34, 45, 46, 47, 48, 49}};
+
+    run_segmented_range(exec, ptrs, values, output);
+
+    GKO_ASSERT_ARRAY_EQ(output, expected);
+}

From cf29d4bec39e2f0c4046357a7c88d879c1ea4ff6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 1 May 2024 18:29:45 +0200
Subject: [PATCH 110/448] work around nvcc issue

---
 core/base/iterator_factory.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index 54e7fecb94e..dbd921d0762 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -288,10 +288,7 @@ class zip_iterator_reference
     template <std::size_t... idxs>
     constexpr value_type cast_impl(std::index_sequence<idxs...>) const
     {
-        // gcc 5 throws error as using uninitialized array
-        // std::tuple<int, char> t = { 1, '2' }; is not allowed.
-        // converting to 'std::tuple<...>' from initializer list would use
-        // explicit constructor
+        // need to use fully qualified name for nvcc 11.x to not call this->get
         return value_type(gko::get<idxs>(*this)...);
     }
 
@@ -299,6 +296,7 @@ class zip_iterator_reference
     constexpr void assign_impl(std::index_sequence<idxs...>,
                                const value_type& other)
     {
+        // need to use fully qualified name for nvcc 11.x to not call this->get
         (void)std::initializer_list<int>{
             (gko::get<idxs>(*this) = gko::get<idxs>(other), 0)...};
     }

From c641cd2cc6908980316bb423ae34e06cbdc2dbab Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 9 Jul 2024 00:13:49 +0200
Subject: [PATCH 111/448] formatting

---
 core/base/segmented_range.hpp      | 1 -
 core/test/base/segmented_range.cpp | 6 ++----
 test/base/segmented_range.cpp      | 6 ++----
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
index 3820dbfb06f..9bab7e457d6 100644
--- a/core/base/segmented_range.hpp
+++ b/core/base/segmented_range.hpp
@@ -9,7 +9,6 @@
 #include <iterator>
 #include <type_traits>
 
-
 #include "core/base/index_range.hpp"
 #include "core/base/iterator_factory.hpp"
 
diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
index 63079bc2e3f..6067ab13ca2 100644
--- a/core/test/base/segmented_range.cpp
+++ b/core/test/base/segmented_range.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/base/segmented_range.hpp"
+
 #include <iterator>
 #include <numeric>
 #include <vector>
 
-
 #include <gtest/gtest.h>
 
 
-#include "core/base/segmented_range.hpp"
-
-
 TEST(SegmentedRange, WorksByIndex)
 {
     std::vector<int> begins{3, 1, 4, 9};
diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp
index 54c491a8493..436a6fb8a55 100644
--- a/test/base/segmented_range.cpp
+++ b/test/base/segmented_range.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <memory>
+#include "core/base/segmented_range.hpp"
 
+#include <memory>
 
 #include <gtest/gtest.h>
 
-
 #include <ginkgo/core/base/array.hpp>
 
-
 #include "common/unified/base/kernel_launch.hpp"
-#include "core/base/segmented_range.hpp"
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
 

From bca42c2a8011becc2a3f14e692eccfc8cb0ddc91 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 1 Aug 2024 10:43:05 +0200
Subject: [PATCH 112/448] rename segmented_range to segmented_index_range

---
 core/base/segmented_range.hpp      | 25 ++++++++++++++-----------
 core/test/base/segmented_range.cpp | 18 +++++++++---------
 test/base/segmented_range.cpp      | 15 ++++++++-------
 3 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
index 9bab7e457d6..d3ec8e1da73 100644
--- a/core/base/segmented_range.hpp
+++ b/core/base/segmented_range.hpp
@@ -23,7 +23,7 @@ namespace gko {
  * @tparam IndexType  the type of indices used to represent the segments.
  */
 template <typename IndexType>
-class segmented_range {
+class segmented_index_range {
 public:
     using index_type = IndexType;
     using index_iterator_type = index_iterator<index_type>;
@@ -34,7 +34,8 @@ class segmented_range {
      */
     class iterator {
     public:
-        constexpr explicit iterator(segmented_range range, index_type segment)
+        constexpr explicit iterator(segmented_index_range range,
+                                    index_type segment)
             : range_{range}, segment_{segment}
         {}
 
@@ -70,7 +71,7 @@ class segmented_range {
         }
 
     private:
-        segmented_range range_;
+        segmented_index_range range_;
         index_type segment_;
     };
 
@@ -83,9 +84,9 @@ class segmented_range {
      * @param num_segments  the number of segments, i.e. the size of the
      *                      beginning and end index arrays.
      */
-    constexpr explicit segmented_range(const index_type* begins,
-                                       const index_type* ends,
-                                       index_type num_segments)
+    constexpr explicit segmented_index_range(const index_type* begins,
+                                             const index_type* ends,
+                                             index_type num_segments)
         : begins_{begins}, ends_{ends}, num_segments_{num_segments}
     {
         assert(num_segments_ >= 0);
@@ -99,9 +100,9 @@ class segmented_range {
      * @param num_segments  the number of segments, i.e. the size of the
      *                      ptrs index arrays.
      */
-    constexpr explicit segmented_range(const index_type* ptrs,
-                                       index_type num_segments)
-        : segmented_range{ptrs, ptrs + 1, num_segments}
+    constexpr explicit segmented_index_range(const index_type* ptrs,
+                                             index_type num_segments)
+        : segmented_index_range{ptrs, ptrs + 1, num_segments}
     {}
 
     /**
@@ -155,7 +156,8 @@ class segmented_range {
     }
 
     /** Compares two ranges for equality. */
-    constexpr friend bool operator==(segmented_range lhs, segmented_range rhs)
+    constexpr friend bool operator==(segmented_index_range lhs,
+                                     segmented_index_range rhs)
     {
         return lhs.begin_indices() == rhs.begin_indices() &&
                lhs.end_indices() == rhs.end_indices() &&
@@ -163,7 +165,8 @@ class segmented_range {
     }
 
     /** Compares two ranges for inequality. */
-    constexpr friend bool operator!=(segmented_range lhs, segmented_range rhs)
+    constexpr friend bool operator!=(segmented_index_range lhs,
+                                     segmented_index_range rhs)
     {
         return !(lhs == rhs);
     }
diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp
index 6067ab13ca2..b10b17f7e1b 100644
--- a/core/test/base/segmented_range.cpp
+++ b/core/test/base/segmented_range.cpp
@@ -16,8 +16,8 @@ TEST(SegmentedRange, WorksByIndex)
     std::vector<int> begins{3, 1, 4, 9};
     std::vector<int> ends{3, 10, 6, 10};
     std::vector<std::vector<int>> result_indices(begins.size());
-    gko::segmented_range<int> range{begins.data(), ends.data(),
-                                    static_cast<int>(begins.size())};
+    gko::segmented_index_range<int> range{begins.data(), ends.data(),
+                                          static_cast<int>(begins.size())};
 
     for (auto row : range.segment_indices()) {
         for (auto nz : range[row]) {
@@ -36,8 +36,8 @@ TEST(SegmentedRange, WorksByRangeFor)
     std::vector<int> begins{3, 1, 4, 9};
     std::vector<int> ends{3, 10, 6, 10};
     std::vector<std::vector<int>> result_indices(begins.size());
-    gko::segmented_range<int> range{begins.data(), ends.data(),
-                                    static_cast<int>(begins.size())};
+    gko::segmented_index_range<int> range{begins.data(), ends.data(),
+                                          static_cast<int>(begins.size())};
 
     for (auto [row, segment] : range) {
         for (auto nz : segment) {
@@ -55,8 +55,8 @@ TEST(SegmentedRange, WorksWithPtrsConstructor)
 {
     std::vector<int> ptrs{0, 2, 4, 5, 9};
     std::vector<std::vector<int>> result_indices(ptrs.size() - 1);
-    gko::segmented_range<int> range{ptrs.data(),
-                                    static_cast<int>(ptrs.size() - 1)};
+    gko::segmented_index_range<int> range{ptrs.data(),
+                                          static_cast<int>(ptrs.size() - 1)};
 
     for (auto row : range.segment_indices()) {
         for (auto nz : range[row]) {
@@ -210,7 +210,7 @@ bool check_assertion_exit_code(int exit_code)
 
 TEST(DeathTest, Assertions)
 {
-    using range_t = gko::segmented_range<int>;
+    using range_t = gko::segmented_index_range<int>;
     using vrange_t = gko::segmented_value_range<int, int*>;
     using range_it_t = range_t::iterator;
     using vrange_it_t = vrange_t::iterator;
@@ -221,12 +221,12 @@ TEST(DeathTest, Assertions)
     vrange_t vrange{ptrs.data(), values.data(),
                     static_cast<int>(ptrs.size() - 1)};
     vrange_t vrange2{ptrs.data(), values.data(), 0};
-    // gko::segmented_range::iterator
+    // gko::segmented_index_range::iterator
     EXPECT_EXIT((void)*(range_it_t{range, -1}), check_assertion_exit_code, "");
     EXPECT_EXIT((void)*(range_it_t{range, 1}), check_assertion_exit_code, "");
     EXPECT_EXIT((void)(range_it_t{range, 0} == range_it_t{range2, 0}),
                 check_assertion_exit_code, "");
-    // gko::segmented_range
+    // gko::segmented_index_range
     EXPECT_EXIT((void)(range_t{nullptr, -1}), check_assertion_exit_code, "");
     EXPECT_EXIT((void)range[-1], check_assertion_exit_code, "");
     EXPECT_EXIT((void)range[1], check_assertion_exit_code, "");
diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp
index 436a6fb8a55..86dfc21eaa6 100644
--- a/test/base/segmented_range.cpp
+++ b/test/base/segmented_range.cpp
@@ -12,7 +12,7 @@
 
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/test/utils.hpp"
-#include "test/utils/executor.hpp"
+#include "test/utils/common_fixture.hpp"
 
 
 class SegmentedRange : public CommonTestFixture {
@@ -30,14 +30,15 @@ class SegmentedRange : public CommonTestFixture {
 
 
 // nvcc doesn't like device lambdas declared in complex classes, move it out
-void run_segmented_range(std::shared_ptr<gko::EXEC_TYPE> exec,
-                         const gko::array<int>& ptrs,
-                         const gko::array<int>& values, gko::array<int>& output)
+void run_segmented_index_range(std::shared_ptr<gko::EXEC_TYPE> exec,
+                               const gko::array<int>& ptrs,
+                               const gko::array<int>& values,
+                               gko::array<int>& output)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto ptrs, auto values, auto output, auto size) {
-            gko::segmented_range<int> range{ptrs, size};
+            gko::segmented_index_range<int> range{ptrs, size};
             for (auto [row, segment] : range) {
                 for (auto nz : segment) {
                     output[nz] = row;
@@ -61,7 +62,7 @@ TEST_F(SegmentedRange, KernelRunsSegmentedRange)
     gko::array<int> expected{
         ref, {1, 2, 2, 3, 4, 4, 4, 4, 4, 11, 22, 23, 34, 45, 46, 47, 48, 49}};
 
-    run_segmented_range(exec, ptrs, values, output);
+    run_segmented_index_range(exec, ptrs, values, output);
 
     GKO_ASSERT_ARRAY_EQ(output, expected);
 }

From 9cca4f7931db8e05ff8109edda687266f008a529 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 1 Aug 2024 14:30:42 +0200
Subject: [PATCH 113/448] use GKO_ASSERT in device code

---
 core/base/index_range.hpp     |  4 +++-
 core/base/segmented_array.hpp |  2 +-
 core/base/segmented_range.hpp | 40 +++++++++++++++++------------------
 3 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/core/base/index_range.hpp b/core/base/index_range.hpp
index ca972363b4a..0a9d1e109c6 100644
--- a/core/base/index_range.hpp
+++ b/core/base/index_range.hpp
@@ -10,6 +10,8 @@
 #include <iterator>
 #include <type_traits>
 
+#include <ginkgo/core/base/types.hpp>
+
 #include "core/base/iterator_range.hpp"
 
 
@@ -188,7 +190,7 @@ class irange : public iterator_range<index_iterator<IndexType>> {
     constexpr explicit irange(index_type begin, index_type end)
         : iterator_range<iterator>{iterator{begin}, iterator{end}}
     {
-        assert(begin <= end);
+        GKO_ASSERT(begin <= end);
     }
 
     /**
diff --git a/core/base/segmented_array.hpp b/core/base/segmented_array.hpp
index 8999feddd01..ffa4d62e74a 100644
--- a/core/base/segmented_array.hpp
+++ b/core/base/segmented_array.hpp
@@ -31,7 +31,7 @@ struct device_segmented_array {
 
     constexpr segment get_segment(size_type segment_id)
     {
-        assert(segment_id < (offsets_end - offsets_begin));
+        GKO_ASSERT(segment_id < (offsets_end - offsets_begin));
         return {flat_begin + offsets_begin[segment_id],
                 flat_begin + offsets_begin[segment_id + 1]};
     }
diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp
index d3ec8e1da73..546f7d62e18 100644
--- a/core/base/segmented_range.hpp
+++ b/core/base/segmented_range.hpp
@@ -46,8 +46,8 @@ class segmented_index_range {
 
         constexpr enumerated_segment operator*() const
         {
-            assert(segment_ >= 0);
-            assert(segment_ < range_.num_segments());
+            GKO_ASSERT(segment_ >= 0);
+            GKO_ASSERT(segment_ < range_.num_segments());
             return enumerated_segment{segment_,
                                       segment_type{range_.begin_index(segment_),
                                                    range_.end_index(segment_)}};
@@ -61,7 +61,7 @@ class segmented_index_range {
 
         constexpr friend bool operator==(iterator lhs, iterator rhs)
         {
-            assert(lhs.range_ == rhs.range_);
+            GKO_ASSERT(lhs.range_ == rhs.range_);
             return lhs.segment_ == rhs.segment_;
         }
 
@@ -89,7 +89,7 @@ class segmented_index_range {
                                              index_type num_segments)
         : begins_{begins}, ends_{ends}, num_segments_{num_segments}
     {
-        assert(num_segments_ >= 0);
+        GKO_ASSERT(num_segments_ >= 0);
     }
 
     /**
@@ -113,8 +113,8 @@ class segmented_index_range {
      */
     constexpr segment_type operator[](index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return (*iterator{*this, segment}).segment;
     }
 
@@ -142,16 +142,16 @@ class segmented_index_range {
     /** @return the beginning index of the given segment. */
     constexpr index_type begin_index(index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return begin_indices()[segment];
     }
 
     /** @return the end index of the given segment. */
     constexpr index_type end_index(index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return end_indices()[segment];
     }
 
@@ -214,8 +214,8 @@ class segmented_value_range {
 
         constexpr enumerated_segment operator*() const
         {
-            assert(segment_ >= 0);
-            assert(segment_ < range_.num_segments());
+            GKO_ASSERT(segment_ >= 0);
+            GKO_ASSERT(segment_ < range_.num_segments());
             return enumerated_segment{
                 segment_,
                 segment_type{range_.values() + range_.begin_index(segment_),
@@ -230,7 +230,7 @@ class segmented_value_range {
 
         constexpr friend bool operator==(iterator lhs, iterator rhs)
         {
-            assert(lhs.range_ == rhs.range_);
+            GKO_ASSERT(lhs.range_ == rhs.range_);
             return lhs.segment_ == rhs.segment_;
         }
 
@@ -264,7 +264,7 @@ class segmented_value_range {
           values_{values},
           num_segments_{num_segments}
     {
-        assert(num_segments_ >= 0);
+        GKO_ASSERT(num_segments_ >= 0);
     }
 
     /**
@@ -291,8 +291,8 @@ class segmented_value_range {
      */
     constexpr segment_type operator[](index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return (*iterator{*this, segment}).segment;
     }
 
@@ -328,16 +328,16 @@ class segmented_value_range {
     /** @return the beginning index of the given segment. */
     constexpr index_type begin_index(index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return begin_indices()[segment];
     }
 
     /** @return the end index of the given segment. */
     constexpr index_type end_index(index_type segment) const
     {
-        assert(segment >= 0);
-        assert(segment < num_segments());
+        GKO_ASSERT(segment >= 0);
+        GKO_ASSERT(segment < num_segments());
         return end_indices()[segment];
     }
 

From ad108c0d8e7b5627fa3c259dbb549ac2ce784fd2 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Thu, 11 Jul 2024 02:48:55 +0000
Subject: [PATCH 114/448] Add CGS and CGS2 orthogonalization options to GMRES.
 Change Hessenberg data layout to facilitate CGS communication in distributed
 solver.

---
 .../unified/solver/common_gmres_kernels.cpp   |  32 ++-
 common/unified/solver/gmres_kernels.cpp       |  25 ++
 core/device_hooks/common_kernels.inc.cpp      |   1 +
 core/solver/gmres.cpp                         | 253 ++++++++++++++++--
 core/solver/gmres_kernels.hpp                 |  18 +-
 core/test/config/solver.cpp                   |   3 +
 include/ginkgo/core/solver/gmres.hpp          |  51 +++-
 reference/solver/common_gmres_kernels.cpp     |  46 ++--
 reference/solver/gmres_kernels.cpp            |  21 ++
 reference/test/solver/gmres_kernels.cpp       | 100 +++++--
 test/mpi/solver/solver.cpp                    |   8 +-
 test/solver/gmres_kernels.cpp                 |  86 ++++--
 12 files changed, 520 insertions(+), 124 deletions(-)

diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp
index 0e6ba18bb64..15637fe701e 100644
--- a/common/unified/solver/common_gmres_kernels.cpp
+++ b/common/unified/solver/common_gmres_kernels.cpp
@@ -69,28 +69,30 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
         exec,
         [] GKO_KERNEL(auto rhs, auto givens_sin, auto givens_cos,
                       auto residual_norm, auto residual_norm_collection,
-                      auto hessenberg_iter, auto iter, auto final_iter_nums,
-                      auto stop_status) {
+                      auto hessenberg_iter, auto iter, auto num_rhs,
+                      auto final_iter_nums, auto stop_status) {
             using value_type = std::decay_t<decltype(givens_sin(0, 0))>;
             if (stop_status[rhs].has_stopped()) {
                 return;
             }
             // increment iteration count
             final_iter_nums[rhs]++;
-            auto hess_this = hessenberg_iter(0, rhs);
-            auto hess_next = hessenberg_iter(1, rhs);
+            auto hess_this =
+                hessenberg_iter(0, rhs);  // hessenberg_iter(0, rhs);
+            auto hess_next =
+                hessenberg_iter(0, num_rhs + rhs);  // hessenberg_iter(1, rhs);
             // apply previous Givens rotations to column
             for (decltype(iter) j = 0; j < iter; ++j) {
                 // in here: hess_this = hessenberg_iter(j, rhs);
                 //          hess_next = hessenberg_iter(j+1, rhs);
-                hess_next = hessenberg_iter(j + 1, rhs);
+                hess_next = hessenberg_iter(0, (j + 1) * num_rhs + rhs);
                 const auto gc = givens_cos(j, rhs);
                 const auto gs = givens_sin(j, rhs);
                 const auto out1 = gc * hess_this + gs * hess_next;
                 const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next;
-                hessenberg_iter(j, rhs) = out1;
-                hessenberg_iter(j + 1, rhs) = hess_this = out2;
-                hess_next = hessenberg_iter(j + 2, rhs);
+                hessenberg_iter(0, j * num_rhs + rhs) = out1;
+                hessenberg_iter(0, (j + 1) * num_rhs + rhs) = hess_this = out2;
+                hess_next = hessenberg_iter(0, (j + 2) * num_rhs + rhs);
             }
             // hess_this is hessenberg_iter(iter, rhs) and
             // hess_next is hessenberg_iter(iter + 1, rhs)
@@ -110,8 +112,9 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
                 givens_sin(iter, rhs) = gs = conj(hess_next) / hypotenuse;
             }
             // apply new Givens rotation to column
-            hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next;
-            hessenberg_iter(iter + 1, rhs) = zero<value_type>();
+            hessenberg_iter(0, iter * num_rhs + rhs) =
+                gc * hess_this + gs * hess_next;
+            hessenberg_iter(0, (iter + 1) * num_rhs + rhs) = zero<value_type>();
             // apply new Givens rotation to RHS of least-squares problem
             const auto rnc_new =
                 -conj(gs) * residual_norm_collection(iter, rhs);
@@ -120,9 +123,9 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
                 gc * residual_norm_collection(iter, rhs);
             residual_norm(0, rhs) = abs(rnc_new);
         },
-        hessenberg_iter->get_size()[1], givens_sin, givens_cos, residual_norm,
-        residual_norm_collection, hessenberg_iter, iter, final_iter_nums,
-        stop_status);
+        residual_norm->get_size()[1], givens_sin, givens_cos, residual_norm,
+        residual_norm_collection, hessenberg_iter, iter,
+        residual_norm->get_size()[1], final_iter_nums, stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -146,7 +149,8 @@ void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
             for (int64 i = sizes[col] - 1; i >= 0; i--) {
                 auto value = rhs(i, col);
                 for (int64 j = i + 1; j < sizes[col]; j++) {
-                    value -= mtx(i, j * num_cols + col) * y(j, col);
+                    // i is the Krylov vector, j is Arnoldi iter
+                    value -= mtx(j, i * num_cols + col) * y(j, col);
                 }
                 // y(i) = (rhs(i) - U(i,i+1:) * y(i+1:)) / U(i, i)
                 y(i, col) = value / mtx(i, i * num_cols + col);
diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp
index 3997963f8d7..c10dc2562e5 100644
--- a/common/unified/solver/gmres_kernels.cpp
+++ b/common/unified/solver/gmres_kernels.cpp
@@ -8,6 +8,7 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 #include "common/unified/base/kernel_launch.hpp"
+#include "common/unified/base/kernel_launch_reduction.hpp"
 
 
 namespace gko {
@@ -94,6 +95,30 @@ void multi_axpy(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
 
+template <typename ValueType>
+void multi_dot(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Dense<ValueType>* krylov_bases,
+               const matrix::Dense<ValueType>* next_krylov,
+               matrix::Dense<ValueType>* hessenberg_col)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto row, auto col, auto bases, auto next_krylov,
+                      auto num_rhs, auto num_rows) {
+            auto irhs = col % num_rhs;  // which rhs
+            auto ivec = col / num_rhs;  // which Krylov vector
+            return conj(bases(ivec * num_rows + row, irhs)) *
+                   next_krylov(row, irhs);
+        },
+        GKO_KERNEL_REDUCE_SUM(ValueType), hessenberg_col->get_values(),
+        gko::dim<2>{next_krylov->get_size()[0],
+                    hessenberg_col->get_size()[1] - next_krylov->get_size()[1]},
+        krylov_bases, next_krylov, next_krylov->get_size()[1],
+        next_krylov->get_size()[0]);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+
 }  // namespace gmres
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index f5dc92ce16e..1ba925e94e3 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -553,6 +553,7 @@ namespace gmres {
 
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 
 }  // namespace gmres
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index cd3d88a5c02..f6fb254cf94 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -33,9 +33,25 @@ GKO_REGISTER_OPERATION(restart, gmres::restart);
 GKO_REGISTER_OPERATION(hessenberg_qr, common_gmres::hessenberg_qr);
 GKO_REGISTER_OPERATION(solve_krylov, common_gmres::solve_krylov);
 GKO_REGISTER_OPERATION(multi_axpy, gmres::multi_axpy);
+GKO_REGISTER_OPERATION(multi_dot, gmres::multi_dot);
 
 
 }  // anonymous namespace
+
+
+std::ostream& operator<<(std::ostream& stream, orthog_method orthog)
+{
+    switch (orthog) {
+    case orthog_method::mgs:
+        return stream << "mgs";
+    case orthog_method::cgs:
+        return stream << "cgs";
+    case orthog_method::cgs2:
+        return stream << "cgs2";
+    }
+    return stream;
+}
+
 }  // namespace gmres
 
 
@@ -52,6 +68,20 @@ typename Gmres<ValueType>::parameters_type Gmres<ValueType>::parse(
     if (auto& obj = config.get("flexible")) {
         params.with_flexible(gko::config::get_value<bool>(obj));
     }
+    if (auto& obj = config.get("orthog_method")) {
+        auto str = obj.get_string();
+        gmres::orthog_method orthog;
+        if (str == "mgs") {
+            orthog = gmres::orthog_method::mgs;
+        } else if (str == "cgs") {
+            orthog = gmres::orthog_method::cgs;
+        } else if (str == "cgs2") {
+            orthog = gmres::orthog_method::cgs2;
+        } else {
+            GKO_INVALID_CONFIG_VALUE("orthog_method", str);
+        }
+        params.with_orthog_method(orthog);
+    }
     return params;
 }
 
@@ -112,6 +142,155 @@ struct help_compute_norm {
     }
 };
 
+namespace {
+// Orthogonalization helper functions
+template <typename ValueType, typename VectorType>
+void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
+                       VectorType* krylov_bases, VectorType* next_krylov,
+                       array<char>& reduction_tmp, size_type restart_iter,
+                       size_type num_rows, size_type num_rhs,
+                       size_type local_num_rows)
+{
+    for (size_type i = 0; i <= restart_iter; i++) {
+        // orthogonalize against krylov_bases(:, i):
+        //   hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, i)
+        //   next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i)
+        // (stored in row 0, columns [i * num_rhs, (i + 1) * num_rhs))
+        auto hessenberg_entry = hessenberg_iter->create_submatrix(
+            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto krylov_basis = ::gko::detail::create_submatrix_helper(
+            krylov_bases, dim<2>{num_rows, num_rhs},
+            span{local_num_rows * i, local_num_rows * (i + 1)},
+            span{0, num_rhs});
+        next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry,
+                                      reduction_tmp);
+        next_krylov->sub_scaled(hessenberg_entry, krylov_basis);
+    }
+}
+
+template <typename ValueType>
+void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
+                   matrix::Dense<ValueType>* next_krylov,
+                   const size_type num_rhs, const size_type restart_iter)
+{
+    return;  // no-op: this overload takes a non-distributed next_krylov
+}
+
+#if GINKGO_BUILD_MPI
+template <typename ValueType>
+void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
+                   experimental::distributed::Vector<ValueType>* next_krylov,
+                   const size_type num_rhs, const size_type restart_iter)
+{
+    auto exec = hessenberg_iter->get_executor();
+    const auto comm = next_krylov->get_communicator();
+    exec->synchronize();
+    // hessenberg_iter is sized for all non-zeros of this iteration, but the
+    // last values for each rhs (the ones that would be below the diagonal in
+    // the "full" matrix) are not set here, so they are left out of the sum.
+    auto hessenberg_reduce = hessenberg_iter->create_submatrix(
+        span{0, 1}, span{0, num_rhs * (restart_iter + 1)});
+    if (experimental::mpi::requires_host_buffer(exec, comm)) {
+        ::gko::detail::DenseCache<ValueType> host_reduction_buffer;
+        host_reduction_buffer.init(exec->get_master(),
+                                   hessenberg_reduce->get_size());
+        host_reduction_buffer->copy_from(hessenberg_reduce);
+        comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(),
+                        static_cast<int>(hessenberg_reduce->get_size()[1]),
+                        MPI_SUM);
+        hessenberg_reduce->copy_from(host_reduction_buffer.get());
+    } else {
+        comm.all_reduce(exec, hessenberg_reduce->get_values(),
+                        static_cast<int>(hessenberg_reduce->get_size()[1]),
+                        MPI_SUM);
+    }
+}
+#endif
+
+template <typename ValueType, typename VectorType>
+void orthogonalize_cgs(matrix::Dense<ValueType>* hessenberg_iter,
+                       VectorType* krylov_bases, VectorType* next_krylov,
+                       size_type restart_iter, size_type num_rows,
+                       size_type num_rhs, size_type local_num_rows)
+{
+    auto exec = hessenberg_iter->get_executor();
+    // compute all projections at once, before any update of next_krylov:
+    // hessenberg(0:restart_iter, restart_iter) = krylov_bases' * next_krylov
+    auto krylov_basis_small = ::gko::detail::create_submatrix_helper(
+        krylov_bases, dim<2>{num_rows, num_rhs},
+        span{0, local_num_rows * (restart_iter + 1)}, span{0, num_rhs});
+    exec->run(gmres::make_multi_dot(
+        gko::detail::get_local(krylov_basis_small.get()),
+        gko::detail::get_local(next_krylov), hessenberg_iter));
+    finish_reduce(hessenberg_iter, next_krylov, num_rhs, restart_iter);
+    for (size_type i = 0; i <= restart_iter; i++) {
+        // subtract the component along the i-th Krylov basis vector:
+        // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i)
+        auto hessenberg_entry = hessenberg_iter->create_submatrix(
+            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto krylov_col = ::gko::detail::create_submatrix_helper(
+            krylov_bases, dim<2>{num_rows, num_rhs},
+            span{local_num_rows * i, local_num_rows * (i + 1)},
+            span{0, num_rhs});
+        next_krylov->sub_scaled(hessenberg_entry, krylov_col);
+    }
+}
+
+
+template <typename ValueType, typename VectorType>
+void orthogonalize_cgs2(matrix::Dense<ValueType>* hessenberg_iter,
+                        VectorType* krylov_bases, VectorType* next_krylov,
+                        matrix::Dense<ValueType>* hessenberg_aux,
+                        matrix::Dense<ValueType>* one_op,
+                        size_type restart_iter, size_type num_rows,
+                        size_type num_rhs, size_type local_num_rows)
+{
+    auto exec = hessenberg_iter->get_executor();
+    // first pass, all projections at once:
+    // hessenberg(0:restart_iter, restart_iter) = krylov_bases' * next_krylov
+    auto krylov_basis_small = ::gko::detail::create_submatrix_helper(
+        krylov_bases, dim<2>{num_rows, num_rhs},
+        span{0, local_num_rows * (restart_iter + 1)}, span{0, num_rhs});
+    exec->run(gmres::make_multi_dot(
+        gko::detail::get_local(krylov_basis_small.get()),
+        gko::detail::get_local(next_krylov), hessenberg_iter));
+    finish_reduce(hessenberg_iter, next_krylov, num_rhs, restart_iter);
+    for (size_type i = 0; i <= restart_iter; i++) {
+        // subtract the component along the i-th Krylov basis vector:
+        // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i)
+        auto hessenberg_entry = hessenberg_iter->create_submatrix(
+            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto krylov_col = ::gko::detail::create_submatrix_helper(
+            krylov_bases, dim<2>{num_rows, num_rhs},
+            span{local_num_rows * i, local_num_rows * (i + 1)},
+            span{0, num_rhs});
+        next_krylov->sub_scaled(hessenberg_entry, krylov_col);
+    }
+    // Re-orthogonalize: repeat the projection with the updated next_krylov
+    auto hessenberg_aux_iter = hessenberg_aux->create_submatrix(
+        span{0, 1}, span{0, (restart_iter + 2) * num_rhs});
+    exec->run(gmres::make_multi_dot(
+        gko::detail::get_local(krylov_basis_small.get()),
+        gko::detail::get_local(next_krylov), hessenberg_aux_iter.get()));
+    finish_reduce(hessenberg_aux_iter.get(), next_krylov, num_rhs,
+                  restart_iter);
+
+    for (size_type i = 0; i <= restart_iter; i++) {
+        // subtract the second-pass correction held in hessenberg_aux:
+        // next_krylov -= hessenberg_aux(i) * krylov_bases(:, i)
+        auto hessenberg_entry = hessenberg_aux->create_submatrix(
+            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto krylov_col = ::gko::detail::create_submatrix_helper(
+            krylov_bases, dim<2>{num_rows, num_rhs},
+            span{local_num_rows * i, local_num_rows * (i + 1)},
+            span{0, num_rhs});
+        next_krylov->sub_scaled(hessenberg_entry, krylov_col);
+    }
+    // Add both Hessenberg columns (first pass + re-orthogonalization pass)
+    hessenberg_iter->add_scaled(one_op, hessenberg_aux_iter);
+}
+}  // anonymous namespace
+
 template <typename ValueType>
 struct help_compute_norm<ValueType,
                          std::enable_if_t<is_complex_s<ValueType>::value>> {
@@ -127,7 +306,6 @@ struct help_compute_norm<ValueType,
     }
 };
 
-
 template <typename ValueType>
 template <typename VectorType>
 void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
@@ -161,9 +339,23 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
             dim<2>{num_rows * (krylov_dim + 1), num_rhs},
             dim<2>{local_num_rows * (krylov_dim + 1), num_rhs});
     }
-    // rows: rows of Hessenberg matrix, columns: block for each entry
+    // The Hessenberg matrix formed by the Arnoldi process is of shape
+    // (krylov_dim + 1) x (krylov_dim) for a single RHS. The (i,j)th
+    // entry is associated with the ith Krylov basis vector and the jth
+    // iteration of Arnoldi.
+    // For ease of using the reduction kernels locally and for having
+    // contiguous memory for communicating in the distributed case, we
+    // will store the Hessenberg matrix in the shape
+    // (krylov_dim) x ((krylov_dim + 1) * num_rhs), where the (i,j)th
+    // entry is associated with the ith iteration and the (j/num_rhs)th
+    // Krylov basis vector, for the (j % num_rhs)th RHS vector.
     auto hessenberg = this->template create_workspace_op<LocalVector>(
-        ws::hessenberg, dim<2>{krylov_dim + 1, krylov_dim * num_rhs});
+        ws::hessenberg, dim<2>{krylov_dim, (krylov_dim + 1) * num_rhs});
+    LocalVector* hessenberg_aux = nullptr;
+    if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) {
+        hessenberg_aux = this->template create_workspace_op<LocalVector>(
+            ws::hessenberg_aux, dim<2>{1, (krylov_dim + 1) * num_rhs});
+    }
     auto givens_sin = this->template create_workspace_op<LocalVector>(
         ws::givens_sin, dim<2>{krylov_dim, num_rhs});
     auto givens_cos = this->template create_workspace_op<LocalVector>(
@@ -312,36 +504,39 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
         this->get_preconditioner()->apply(this_krylov,
                                           preconditioned_krylov_vector);
 
-        // Create view of current column in the hessenberg matrix:
-        // hessenberg_iter = hessenberg(:, restart_iter);
-        auto hessenberg_iter = hessenberg->create_submatrix(
-            span{0, restart_iter + 2},
-            span{num_rhs * restart_iter, num_rhs * (restart_iter + 1)});
+        // Create view of current "column" in the hessenberg matrix:
+        // hessenberg_iter = hessenberg(:, restart_iter), which
+        // is actually stored as a row, hessenberg(restart_iter, :)
+        auto hessenberg_iter =
+            hessenberg->create_submatrix(span{restart_iter, restart_iter + 1},
+                                         span{0, num_rhs * (restart_iter + 2)});
 
         // Start of Arnoldi
         // next_krylov = A * preconditioned_krylov_vector
         this->get_system_matrix()->apply(preconditioned_krylov_vector,
                                          next_krylov);
-
-        for (size_type i = 0; i <= restart_iter; i++) {
-            // orthogonalize against krylov_bases(:, i):
-            // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, i)
-            // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i)
-            auto hessenberg_entry = hessenberg_iter->create_submatrix(
-                span{i, i + 1}, span{0, num_rhs});
-            auto krylov_basis = ::gko::detail::create_submatrix_helper(
-                krylov_bases, dim<2>{num_rows, num_rhs},
-                span{local_num_rows * i, local_num_rows * (i + 1)},
-                span{0, num_rhs});
-            next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry,
-                                          reduction_tmp);
-            next_krylov->sub_scaled(hessenberg_entry, krylov_basis);
+        if (this->parameters_.orthog_method == gmres::orthog_method::mgs) {
+            orthogonalize_mgs(hessenberg_iter.get(), krylov_bases,
+                              next_krylov.get(), reduction_tmp, restart_iter,
+                              num_rows, num_rhs, local_num_rows);
+        } else if (this->parameters_.orthog_method ==
+                   gmres::orthog_method::cgs) {
+            orthogonalize_cgs(hessenberg_iter.get(), krylov_bases,
+                              next_krylov.get(), restart_iter, num_rows,
+                              num_rhs, local_num_rows);
+        } else if (this->parameters_.orthog_method ==
+                   gmres::orthog_method::cgs2) {
+            orthogonalize_cgs2(hessenberg_iter.get(), krylov_bases,
+                               next_krylov.get(), hessenberg_aux, one_op,
+                               restart_iter, num_rows, num_rhs, local_num_rows);
         }
         // normalize next_krylov:
         // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov)
+        // (stored in hessenberg(restart_iter, (restart_iter + 1) * num_rhs))
         // next_krylov /= hessenberg(restart_iter+1, restart_iter)
         auto hessenberg_norm_entry = hessenberg_iter->create_submatrix(
-            span{restart_iter + 1, restart_iter + 2}, span{0, num_rhs});
+            span{0, 1},
+            span{(restart_iter + 1) * num_rhs, (restart_iter + 2) * num_rhs});
         help_compute_norm<ValueType>::compute_next_krylov_norm_into_hessenberg(
             next_krylov.get(), hessenberg_norm_entry.get(),
             next_krylov_norm_tmp, reduction_tmp);
@@ -379,7 +574,7 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
     }
 
     auto hessenberg_small = hessenberg->create_submatrix(
-        span{0, restart_iter}, span{0, num_rhs * (restart_iter)});
+        span{0, restart_iter}, span{0, num_rhs * restart_iter});
 
     // Solve upper triangular.
     // y = hessenberg \ residual_norm_collection
@@ -443,7 +638,7 @@ int workspace_traits<Gmres<ValueType>>::num_arrays(const Solver&)
 template <typename ValueType>
 int workspace_traits<Gmres<ValueType>>::num_vectors(const Solver&)
 {
-    return 15;
+    return 16;
 }
 
 
@@ -455,6 +650,7 @@ std::vector<std::string> workspace_traits<Gmres<ValueType>>::op_names(
             "preconditioned_vector",
             "krylov_bases",
             "hessenberg",
+            "hessenberg_aux",
             "givens_sin",
             "givens_cos",
             "residual_norm_collection",
@@ -480,10 +676,9 @@ std::vector<std::string> workspace_traits<Gmres<ValueType>>::array_names(
 template <typename ValueType>
 std::vector<int> workspace_traits<Gmres<ValueType>>::scalars(const Solver&)
 {
-    return {hessenberg,          givens_sin,
-            givens_cos,          residual_norm_collection,
-            residual_norm,       y,
-            next_krylov_norm_tmp};
+    return {hessenberg, hessenberg_aux,           givens_sin,
+            givens_cos, residual_norm_collection, residual_norm,
+            y,          next_krylov_norm_tmp};
 }
 
 
diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp
index 196b0de3ab0..f9fbe76279b 100644
--- a/core/solver/gmres_kernels.hpp
+++ b/core/solver/gmres_kernels.hpp
@@ -38,11 +38,19 @@ namespace gmres {
                     stopping_status* stop_status)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES             \
-    template <typename ValueType>                \
-    GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType); \
-    template <typename ValueType>                \
-    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(ValueType)
+#define GKO_DECLARE_GMRES_MULTI_DOT_KERNEL(_type)               \
+    void multi_dot(std::shared_ptr<const DefaultExecutor> exec, \
+                   const matrix::Dense<_type>* krylov_bases,    \
+                   const matrix::Dense<_type>* next_krylov,     \
+                   matrix::Dense<_type>* hessenberg_col)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                \
+    template <typename ValueType>                   \
+    GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType);    \
+    template <typename ValueType>                   \
+    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(ValueType); \
+    template <typename ValueType>                   \
+    GKO_DECLARE_GMRES_MULTI_DOT_KERNEL(ValueType)
 
 
 }  // namespace gmres
diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp
index 8a2f025d00a..78f1f7351f8 100644
--- a/core/test/config/solver.cpp
+++ b/core/test/config/solver.cpp
@@ -289,6 +289,8 @@ struct Gmres
         param.with_krylov_dim(3u);
         config_map["flexible"] = pnode{true};
         param.with_flexible(true);
+        config_map["orthog_method"] = pnode{"cgs"};
+        param.with_orthog_method(gko::solver::gmres::orthog_method::cgs);
     }
 
     template <bool from_reg, typename AnswerType>
@@ -300,6 +302,7 @@ struct Gmres
         solver_config_test::template validate<from_reg>(result, answer);
         ASSERT_EQ(res_param.krylov_dim, ans_param.krylov_dim);
         ASSERT_EQ(res_param.flexible, ans_param.flexible);
+        ASSERT_EQ(res_param.orthog_method, ans_param.orthog_method);
     }
 };
 
diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp
index 57bbca0b529..308dadf5218 100644
--- a/include/ginkgo/core/solver/gmres.hpp
+++ b/include/ginkgo/core/solver/gmres.hpp
@@ -31,6 +31,29 @@ namespace solver {
 
 constexpr size_type gmres_default_krylov_dim = 100u;
 
+namespace gmres {
+/**
+ * Set the orthogonalization method for the Krylov subspace.
+ */
+enum class orthog_method {
+    /**
+     * Modified Gram-Schmidt (default)
+     */
+    mgs,
+    /**
+     * Classical Gram-Schmidt
+     */
+    cgs,
+    /**
+     * Classical Gram-Schmidt with re-orthogonalization
+     */
+    cgs2
+};
+
+/** Prints an orthogonalization method. */
+std::ostream& operator<<(std::ostream& stream, orthog_method orthog);
+
+}  // namespace gmres
 
 /**
  * GMRES or the generalized minimal residual method is an iterative type Krylov
@@ -93,6 +116,10 @@ class Gmres
 
         /** Flexible GMRES */
         bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false);
+
+        /** Orthogonalization method */
+        gmres::orthog_method GKO_FACTORY_PARAMETER_SCALAR(
+            orthog_method, gmres::orthog_method::mgs);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
@@ -167,28 +194,30 @@ struct workspace_traits<Gmres<ValueType>> {
     constexpr static int krylov_bases = 2;
     // hessenberg matrix
     constexpr static int hessenberg = 3;
+    // auxiliary space for CGS2
+    constexpr static int hessenberg_aux = 4;
     // givens sin parameters
-    constexpr static int givens_sin = 4;
+    constexpr static int givens_sin = 5;
     // givens cos parameters
-    constexpr static int givens_cos = 5;
+    constexpr static int givens_cos = 6;
     // coefficients of the residual in Krylov space
-    constexpr static int residual_norm_collection = 6;
+    constexpr static int residual_norm_collection = 7;
     // residual norm scalar
-    constexpr static int residual_norm = 7;
+    constexpr static int residual_norm = 8;
     // solution of the least-squares problem in Krylov space
-    constexpr static int y = 8;
+    constexpr static int y = 9;
     // solution of the least-squares problem mapped to the full space
-    constexpr static int before_preconditioner = 9;
+    constexpr static int before_preconditioner = 10;
     // preconditioned solution of the least-squares problem
-    constexpr static int after_preconditioner = 10;
+    constexpr static int after_preconditioner = 11;
     // constant 1.0 scalar
-    constexpr static int one = 11;
+    constexpr static int one = 12;
     // constant -1.0 scalar
-    constexpr static int minus_one = 12;
+    constexpr static int minus_one = 13;
     // temporary norm vector of next_krylov to copy into hessenberg matrix
-    constexpr static int next_krylov_norm_tmp = 13;
+    constexpr static int next_krylov_norm_tmp = 14;
     // preconditioned krylov basis multivector
-    constexpr static int preconditioned_krylov_bases = 14;
+    constexpr static int preconditioned_krylov_bases = 15;
 
     // stopping status array
     constexpr static int stop = 0;
diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp
index 643c164b828..122c224d5c1 100644
--- a/reference/solver/common_gmres_kernels.cpp
+++ b/reference/solver/common_gmres_kernels.cpp
@@ -30,14 +30,15 @@ template <typename ValueType>
 void calculate_sin_and_cos(matrix::Dense<ValueType>* givens_sin,
                            matrix::Dense<ValueType>* givens_cos,
                            matrix::Dense<ValueType>* hessenberg_iter,
-                           size_type iter, const size_type rhs)
+                           size_type iter, const size_type num_rhs,
+                           const size_type rhs)
 {
-    if (is_zero(hessenberg_iter->at(iter, rhs))) {
+    if (is_zero(hessenberg_iter->at(0, iter * num_rhs + rhs))) {
         givens_cos->at(iter, rhs) = zero<ValueType>();
         givens_sin->at(iter, rhs) = one<ValueType>();
     } else {
-        auto this_hess = hessenberg_iter->at(iter, rhs);
-        auto next_hess = hessenberg_iter->at(iter + 1, rhs);
+        auto this_hess = hessenberg_iter->at(0, iter * num_rhs + rhs);
+        auto next_hess = hessenberg_iter->at(0, (iter + 1) * num_rhs + rhs);
         const auto scale = abs(this_hess) + abs(next_hess);
         const auto hypotenuse =
             scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) +
@@ -52,19 +53,24 @@ template <typename ValueType>
 void givens_rotation(matrix::Dense<ValueType>* givens_sin,
                      matrix::Dense<ValueType>* givens_cos,
                      matrix::Dense<ValueType>* hessenberg_iter, size_type iter,
+                     const size_type num_rhs,
                      const stopping_status* stop_status)
 {
-    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
+    for (size_type i = 0; i < num_rhs; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
         for (size_type j = 0; j < iter; ++j) {
-            auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) +
-                        givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i);
-            hessenberg_iter->at(j + 1, i) =
-                -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) +
-                conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i);
-            hessenberg_iter->at(j, i) = temp;
+            auto temp =
+                givens_cos->at(j, i) * hessenberg_iter->at(0, j * num_rhs + i) +
+                givens_sin->at(j, i) *
+                    hessenberg_iter->at(0, (j + 1) * num_rhs + i);
+            hessenberg_iter->at(0, (j + 1) * num_rhs + i) =
+                -conj(givens_sin->at(j, i)) *
+                    hessenberg_iter->at(0, j * num_rhs + i) +
+                conj(givens_cos->at(j, i)) *
+                    hessenberg_iter->at(0, (j + 1) * num_rhs + i);
+            hessenberg_iter->at(0, j * num_rhs + i) = temp;
             // temp             =  cos(j)*hessenberg(j) +
             //                     sin(j)*hessenberg(j+1)
             // hessenberg(j+1)  = -conj(sin(j))*hessenberg(j) +
@@ -72,12 +78,15 @@ void givens_rotation(matrix::Dense<ValueType>* givens_sin,
             // hessenberg(j)    =  temp;
         }
 
-        calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i);
+        calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter,
+                              num_rhs, i);
 
-        hessenberg_iter->at(iter, i) =
-            givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) +
-            givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i);
-        hessenberg_iter->at(iter + 1, i) = zero<ValueType>();
+        hessenberg_iter->at(0, iter * num_rhs + i) =
+            givens_cos->at(iter, i) *
+                hessenberg_iter->at(0, iter * num_rhs + i) +
+            givens_sin->at(iter, i) *
+                hessenberg_iter->at(0, (iter + 1) * num_rhs + i);
+        hessenberg_iter->at(0, (iter + 1) * num_rhs + i) = zero<ValueType>();
         // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
         //                      sin(iter)*hessenberg(iter + 1)
         // hessenberg(iter+1) = 0
@@ -151,7 +160,8 @@ void hessenberg_qr(std::shared_ptr<const ReferenceExecutor> exec,
         }
     }
 
-    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, stop_status);
+    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter,
+                    residual_norm->get_size()[1], stop_status);
     calculate_next_residual_norm(givens_sin, givens_cos, residual_norm,
                                  residual_norm_collection, iter, stop_status);
 }
@@ -176,7 +186,7 @@ void solve_krylov(std::shared_ptr<const ReferenceExecutor> exec,
             for (size_type j = i + 1; j < final_iter_nums[k]; ++j) {
                 temp -=
                     hessenberg->at(
-                        i, j * residual_norm_collection->get_size()[1] + k) *
+                        j, i * residual_norm_collection->get_size()[1] + k) *
                     y->at(j, k);
             }
             y->at(i, k) =
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index a0b22862998..4c482632353 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -71,6 +71,27 @@ void multi_axpy(std::shared_ptr<const ReferenceExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
+template <typename ValueType>
+void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
+               const matrix::Dense<ValueType>* krylov_bases,
+               const matrix::Dense<ValueType>* next_krylov,
+               matrix::Dense<ValueType>* hessenberg_col)
+{
+    auto num_rhs = next_krylov->get_size()[1];
+    auto krylov_bases_rowoffset = next_krylov->get_size()[0];
+    for (size_type i = 0; i < hessenberg_col->get_size()[1]; ++i) {
+        auto ivec = i / num_rhs;
+        auto irhs = i % num_rhs;
+        hessenberg_col->at(0, i) = zero<ValueType>();
+        for (size_type j = 0; j < krylov_bases_rowoffset; ++j) {
+            hessenberg_col->at(0, i) +=
+                krylov_bases->at(ivec * krylov_bases_rowoffset + j, irhs) *
+                next_krylov->at(j, irhs);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 }  // namespace gmres
 }  // namespace reference
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 00f7766179f..bc877e0ed76 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -102,7 +102,7 @@ class Gmres : public ::testing::Test {
         small_y = Mtx::create(exec, gko::dim<2>{small_restart, small_size[1]});
         small_hessenberg = Mtx::create(
             exec,
-            gko::dim<2>{small_restart + 1, small_restart * small_size[1]});
+            gko::dim<2>{small_restart, (small_restart + 1) * small_size[1]});
         small_hessenberg->fill(gko::zero<value_type>());
 
         stopped.converge(1, true);
@@ -222,8 +222,8 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0)
     this->small_residual_norm->fill(nan);
     this->small_residual_norm_collection = gko::initialize<Mtx>(
         {I<T>{1.25, 1.5}, I<T>{nan, nan}, I<T>{95., 94.}}, this->exec);
-    this->small_hessenberg = gko::initialize<Mtx>(
-        {I<T>{0.5, -0.75}, I<T>{-0.5, 1}, I<T>{97., 96.}}, this->exec);
+    this->small_hessenberg =
+        gko::initialize<Mtx>({I<T>{0.5, -0.75, -0.5, 1, 97., 96.}}, this->exec);
     this->small_final_iter_nums.get_data()[0] = 0;
     this->small_final_iter_nums.get_data()[1] = 0;
 
@@ -242,7 +242,7 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0)
     GKO_EXPECT_MTX_NEAR(this->small_givens_sin,
                         l({{-0.5 * sqrt(2.), 0.8}, {-72., 73.}}), r<T>::value);
     GKO_EXPECT_MTX_NEAR(this->small_hessenberg,
-                        l({{0.5 * sqrt(2.), 1.25}, {0., 0.}, {97., 96.}}),
+                        l({{0.5 * sqrt(2.), 1.25, 0., 0., 97., 96.}}),
                         r<T>::value);
     GKO_EXPECT_MTX_NEAR(
         this->small_residual_norm_collection,
@@ -267,8 +267,8 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1)
     this->small_residual_norm->fill(nan);
     this->small_residual_norm_collection = gko::initialize<Mtx>(
         {I<T>{95., 94.}, I<T>{1.25, 1.5}, I<T>{nan, nan}}, this->exec);
-    this->small_hessenberg = gko::initialize<Mtx>(
-        {I<T>{-0.5, 4}, I<T>{0.25, 0.5}, I<T>{-0.5, 1}}, this->exec);
+    this->small_hessenberg =
+        gko::initialize<Mtx>({I<T>{-0.5, 4, 0.25, 0.5, -0.5, 1}}, this->exec);
     this->small_final_iter_nums.get_data()[0] = 1;
     this->small_final_iter_nums.get_data()[1] = 1;
 
@@ -287,7 +287,7 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1)
     GKO_EXPECT_MTX_NEAR(this->small_givens_sin,
                         l({{0.5, 0.25}, {-0.5 * sqrt(2.), 0.8}}), r<T>::value);
     GKO_EXPECT_MTX_NEAR(this->small_hessenberg,
-                        l({{-0.375, 2.125}, {0.5 * sqrt(2.), 1.25}, {0., 0.}}),
+                        l({{-0.375, 2.125, 0.5 * sqrt(2.), 1.25, 0., 0.}}),
                         r<T>::value);
     GKO_EXPECT_MTX_NEAR(
         this->small_residual_norm_collection,
@@ -309,9 +309,8 @@ TYPED_TEST(Gmres, KernelSolveKrylov)
     this->small_final_iter_nums.get_data()[1] = restart;
     this->small_hessenberg = gko::initialize<Mtx>(
         // clang-format off
-        {{-1, 3, 2, -4},
-         {0, 0, 1, 5},
-         {nan, nan, nan, nan}},
+        {{-1, 3, 0, 0, nan, nan},
+         {2, -4, 1, 5, nan, nan}},
         // clang-format on
         this->exec);
     this->small_residual_norm_collection =
@@ -366,6 +365,40 @@ TYPED_TEST(Gmres, KernelMultiAxpy)
                         r<T>::value);
 }
 
+TYPED_TEST(Gmres, KernelMultiDot)
+{
+    using T = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    const T nan = std::numeric_limits<gko::remove_complex<T>>::quiet_NaN();
+    const auto restart = this->small_givens_sin->get_size()[0];
+    this->small_hessenberg->fill(gko::zero<T>());
+    auto hessenberg_iter = this->small_hessenberg->create_submatrix(
+        gko::span{0, 1},
+        gko::span{0, (restart + 1) * this->small_x->get_size()[1]});
+    this->small_x = gko::initialize<Mtx>(  // next_krylov
+        {I<T>{-1.0, 2.3}, I<T>{-14.0, -22.0}, I<T>{8.4, 14.2}}, this->exec);
+
+    this->small_krylov_bases = gko::initialize<Mtx>(  // restart+1 x rows x #rhs
+        {
+            I<T>{1, 10},  // 0, 0, x
+            I<T>{2, 11},  // 0, 1, x
+            I<T>{3, 12},  // 0, 2, x
+            I<T>{4, 13},  // 1, 0, x
+            I<T>{5, 14},  // 1, 1, x
+            I<T>{6, 15},  // 1, 2, x
+            I<T>{7, 16},  // 2, 0, x
+            I<T>{8, 17},  // 2, 1, x
+            I<T>{9, 18},  // 2, 2, x
+        },
+        this->exec);
+    gko::kernels::reference::gmres::multi_dot(
+        this->exec, this->small_krylov_bases.get(), this->small_x.get(),
+        hessenberg_iter.get());
+
+    GKO_ASSERT_MTX_NEAR(hessenberg_iter,
+                        l({{-3.8, -48.6, -23.6, -65.1, -43.4, -81.6}}),
+                        r<T>::value);
+}
 
 TYPED_TEST(Gmres, SolvesStencilSystem)
 {
@@ -703,28 +736,37 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart)
 
 TYPED_TEST(Gmres, SolvesWithPreconditioner)
 {
+    using gko::solver::gmres::orthog_method;
+
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
-    auto gmres_factory_preconditioner =
-        Solver::build()
-            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
-                           gko::stop::ResidualNorm<value_type>::build()
-                               .with_reduction_factor(r<value_type>::value))
-            .with_preconditioner(
-                gko::preconditioner::Jacobi<value_type>::build()
-                    .with_max_block_size(3u))
-            .on(this->exec);
-    auto solver = gmres_factory_preconditioner->generate(this->mtx_big);
-    auto b = gko::initialize<Mtx>(
-        {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
-        this->exec);
-    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
-
-    solver->apply(b, x);
-
-    GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
-                        r<value_type>::value * 1e3);
+    for (auto orthog :
+         {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) {
+        SCOPED_TRACE(orthog);
+        auto gmres_factory_preconditioner =
+            Solver::build()
+                .with_orthog_method(orthog)
+                .with_criteria(
+                    gko::stop::Iteration::build().with_max_iters(100u),
+                    gko::stop::ResidualNorm<value_type>::build()
+                        .with_reduction_factor(r<value_type>::value))
+                .with_preconditioner(
+                    gko::preconditioner::Jacobi<value_type>::build()
+                        .with_max_block_size(3u))
+                .on(this->exec);
+        auto solver = gmres_factory_preconditioner->generate(this->mtx_big);
+        auto b = gko::initialize<Mtx>(
+            {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
+            this->exec);
+        auto x =
+            gko::initialize<Mtx>({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec);
+
+        solver->apply(b, x);
+
+        GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}),
+                            r<value_type>::value * 1e3);
+    }
 }
 
 
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index 589be91bcba..aaf61cb47ea 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -195,13 +195,14 @@ struct Ir : SimpleSolverTest<gko::solver::Ir<solver_value_type>> {
 };
 
 
-template <unsigned dimension>
+template <unsigned dimension, gko::solver::gmres::orthog_method orthog>
 struct Gmres : SimpleSolverTest<gko::solver::Gmres<solver_value_type>> {
     static typename solver_type::parameters_type build(
         std::shared_ptr<const gko::Executor> exec)
     {
         return SimpleSolverTest<gko::solver::Gmres<solver_value_type>>::build(
                    std::move(exec))
+            .with_orthog_method(orthog)
             .with_krylov_dim(dimension);
     }
 };
@@ -531,7 +532,10 @@ class Solver : public CommonMpiTestFixture {
 
 using SolverTypes =
     ::testing::Types<Cg, CgWithMg, Cgs, Fcg, Bicgstab, Ir, Gcr<10u>, Gcr<100u>,
-                     Gmres<10u>, Gmres<100u>>;
+                     Gmres<10u, gko::solver::gmres::orthog_method::mgs>,
+                     Gmres<10u, gko::solver::gmres::orthog_method::cgs>,
+                     Gmres<10u, gko::solver::gmres::orthog_method::cgs2>,
+                     Gmres<100u, gko::solver::gmres::orthog_method::mgs>>;
 
 TYPED_TEST_SUITE(Solver, SolverTypes, TypenameNameGenerator);
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index a6c74bd45c0..fb2eab5c040 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -74,10 +74,11 @@ class Gmres : public CommonTestFixture {
         b = gen_mtx(m, nrhs);
         krylov_bases =
             gen_mtx(m * (gko::solver::gmres_default_krylov_dim + 1), nrhs);
-        hessenberg = gen_mtx(gko::solver::gmres_default_krylov_dim + 1,
-                             gko::solver::gmres_default_krylov_dim * nrhs);
+        hessenberg =
+            gen_mtx(gko::solver::gmres_default_krylov_dim,
+                    (gko::solver::gmres_default_krylov_dim + 1) * nrhs);
         hessenberg_iter =
-            gen_mtx(gko::solver::gmres_default_krylov_dim + 1, nrhs);
+            gen_mtx(1, (gko::solver::gmres_default_krylov_dim + 1) * nrhs);
         residual = gen_mtx(m, nrhs);
         residual_norm = gen_mtx<norm_type>(1, nrhs);
         residual_norm_collection =
@@ -272,6 +273,50 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef)
     GKO_ASSERT_ARRAY_EQ(stop_status, d_stop_status);
 }
 
+TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef)
+{
+    initialize_data();
+
+    auto krylov_basis = krylov_bases->create_submatrix(
+        gko::span{
+            0, x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1)},
+        gko::span{0, x->get_size()[1]});
+    auto d_krylov_basis = d_krylov_bases->create_submatrix(
+        gko::span{0, d_x->get_size()[0] *
+                         (gko::solver::gmres_default_krylov_dim - 1)},
+        gko::span{0, d_x->get_size()[1]});
+    auto next_krylov = krylov_bases->create_submatrix(
+        gko::span{
+            x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1),
+            x->get_size()[0] * gko::solver::gmres_default_krylov_dim},
+        gko::span{0, x->get_size()[1]});
+    auto d_next_krylov = d_krylov_bases->create_submatrix(
+        gko::span{
+            d_x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1),
+            d_x->get_size()[0] * gko::solver::gmres_default_krylov_dim},
+        gko::span{0, d_x->get_size()[1]});
+    gko::kernels::reference::gmres::multi_dot(
+        ref, krylov_basis.get(), next_krylov.get(), hessenberg_iter.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_dot(
+        exec, d_krylov_basis.get(), d_next_krylov.get(),
+        d_hessenberg_iter.get());
+
+    // The multidot computation does not set the value below the diagonal
+    // in the Hessenberg matrix column(s), as that is done after the
+    // orthogonalization of the next basis vector. In this test, we
+    // are checking the column(s) created on the last iteration before the
+    // solver's restart would be triggered, so it is only the final row of
+    // the Hessenberg column(s) that we ignore.
+    auto hessenberg_iter_small = hessenberg_iter->create_submatrix(
+        gko::span{0, 1},
+        gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]});
+    auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix(
+        gko::span{0, 1},
+        gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]});
+    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small,
+                        r<value_type>::value);
+}
+
 
 TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef)
 {
@@ -294,18 +339,27 @@ TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef)
 
 TEST_F(Gmres, GmresApplyMultipleRHSIsEquivalentToRef)
 {
-    int m = 123;
-    int n = 5;
-    auto ref_solver = ref_gmres_factory->generate(mtx);
-    auto exec_solver = exec_gmres_factory->generate(d_mtx);
-    auto b = gen_mtx(m, n);
-    auto x = gen_mtx(m, n);
-    auto d_b = gko::clone(exec, b);
-    auto d_x = gko::clone(exec, x);
-
-    ref_solver->apply(b, x);
-    exec_solver->apply(d_b, d_x);
+    using gko::solver::gmres::orthog_method;
+    auto base_params = gko::clone(ref, ref_gmres_factory)->get_parameters();
 
-    GKO_ASSERT_MTX_NEAR(d_b, b, 0);
-    GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
+    for (auto orthog :
+         {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) {
+        SCOPED_TRACE(orthog);
+        int m = 123;
+        int n = 5;
+        auto ref_solver =
+            base_params.with_orthog_method(orthog).on(ref)->generate(mtx);
+        auto exec_solver =
+            base_params.with_orthog_method(orthog).on(exec)->generate(d_mtx);
+        auto b = gen_mtx(m, n);
+        auto x = gen_mtx(m, n);
+        auto d_b = gko::clone(exec, b);
+        auto d_x = gko::clone(exec, x);
+
+        ref_solver->apply(b, x);
+        exec_solver->apply(d_b, d_x);
+
+        GKO_ASSERT_MTX_NEAR(d_b, b, 0);
+        GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
+    }
 }

From c8ffffc1cf62b35528a244a6218966eb5b7f83b8 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Thu, 18 Jul 2024 19:09:30 +0000
Subject: [PATCH 115/448] Minor: formatting and comment clarity

---
 core/solver/gmres.cpp         | 12 +++++++-----
 core/solver/gmres_kernels.hpp |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index f6fb254cf94..7a77988be98 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -154,8 +154,9 @@ void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
     for (size_type i = 0; i <= restart_iter; i++) {
         // orthogonalize against krylov_bases(:, i):
         // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:,
-        // i) next_krylov -= hessenberg(i, restart_iter) *
-        // krylov_bases(:, i)
+        // i)
+        // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:,
+        // i)
         auto hessenberg_entry = hessenberg_iter->create_submatrix(
             span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
         auto krylov_basis = ::gko::detail::create_submatrix_helper(
@@ -185,9 +186,10 @@ void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
     auto exec = hessenberg_iter->get_executor();
     const auto comm = next_krylov->get_communicator();
     exec->synchronize();
-    // hessenberg_iter is the size of all non-zeros for this iteration -- but we
-    // are not setting the last values for each rhs (values that would be below
-    // the diagonal in the "full" matrix.
+    // hessenberg_iter is the size of all non-zeros for this iteration, but we
+    // are not setting the last values for each rhs here. Values that would be
+    // below the diagonal in the "full" matrix are skipped, because they will
+    // be used to hold the norm of next_krylov for each rhs.
     auto hessenberg_reduce = hessenberg_iter->create_submatrix(
         span{0, 1}, span{0, num_rhs * (restart_iter + 1)});
     if (experimental::mpi::requires_host_buffer(exec, comm)) {
diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp
index f9fbe76279b..21bb5854816 100644
--- a/core/solver/gmres_kernels.hpp
+++ b/core/solver/gmres_kernels.hpp
@@ -44,6 +44,7 @@ namespace gmres {
                    const matrix::Dense<_type>* next_krylov,     \
                    matrix::Dense<_type>* hessenberg_col)
 
+
 #define GKO_DECLARE_ALL_AS_TEMPLATES                \
     template <typename ValueType>                   \
     GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType);    \

From 4c50f84e2bff4347ef6fb4b86dd85d34208e3d90 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Fri, 19 Jul 2024 20:47:28 +0000
Subject: [PATCH 116/448] Reshape hessenberg_iter view to 'logical layout' (one
 column per rhs) for kernels that do not use the full Hessenberg matrix

---
 .../unified/solver/common_gmres_kernels.cpp   | 29 +++++------
 common/unified/solver/gmres_kernels.cpp       |  6 ++-
 core/solver/gmres.cpp                         | 49 ++++++++++---------
 reference/solver/common_gmres_kernels.cpp     | 44 +++++++----------
 reference/solver/gmres_kernels.cpp            | 16 +++---
 reference/test/solver/gmres_kernels.cpp       | 38 ++++++++++----
 test/solver/gmres_kernels.cpp                 | 10 ++--
 7 files changed, 102 insertions(+), 90 deletions(-)

diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp
index 15637fe701e..679aebcfaa2 100644
--- a/common/unified/solver/common_gmres_kernels.cpp
+++ b/common/unified/solver/common_gmres_kernels.cpp
@@ -69,30 +69,28 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
         exec,
         [] GKO_KERNEL(auto rhs, auto givens_sin, auto givens_cos,
                       auto residual_norm, auto residual_norm_collection,
-                      auto hessenberg_iter, auto iter, auto num_rhs,
-                      auto final_iter_nums, auto stop_status) {
+                      auto hessenberg_iter, auto iter, auto final_iter_nums,
+                      auto stop_status) {
             using value_type = std::decay_t<decltype(givens_sin(0, 0))>;
             if (stop_status[rhs].has_stopped()) {
                 return;
             }
             // increment iteration count
             final_iter_nums[rhs]++;
-            auto hess_this =
-                hessenberg_iter(0, rhs);  // hessenberg_iter(0, rhs);
-            auto hess_next =
-                hessenberg_iter(0, num_rhs + rhs);  // hessenberg_iter(1, rhs);
+            auto hess_this = hessenberg_iter(0, rhs);
+            auto hess_next = hessenberg_iter(1, rhs);
             // apply previous Givens rotations to column
             for (decltype(iter) j = 0; j < iter; ++j) {
                 // in here: hess_this = hessenberg_iter(j, rhs);
                 //          hess_next = hessenberg_iter(j+1, rhs);
-                hess_next = hessenberg_iter(0, (j + 1) * num_rhs + rhs);
+                hess_next = hessenberg_iter(j + 1, rhs);
                 const auto gc = givens_cos(j, rhs);
                 const auto gs = givens_sin(j, rhs);
                 const auto out1 = gc * hess_this + gs * hess_next;
                 const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next;
-                hessenberg_iter(0, j * num_rhs + rhs) = out1;
-                hessenberg_iter(0, (j + 1) * num_rhs + rhs) = hess_this = out2;
-                hess_next = hessenberg_iter(0, (j + 2) * num_rhs + rhs);
+                hessenberg_iter(j, rhs) = out1;
+                hessenberg_iter(j + 1, rhs) = hess_this = out2;
+                hess_next = hessenberg_iter(j + 2, rhs);
             }
             // hess_this is hessenberg_iter(iter, rhs) and
             // hess_next is hessenberg_iter(iter + 1, rhs)
@@ -112,9 +110,8 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
                 givens_sin(iter, rhs) = gs = conj(hess_next) / hypotenuse;
             }
             // apply new Givens rotation to column
-            hessenberg_iter(0, iter * num_rhs + rhs) =
-                gc * hess_this + gs * hess_next;
-            hessenberg_iter(0, (iter + 1) * num_rhs + rhs) = zero<value_type>();
+            hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next;
+            hessenberg_iter(iter + 1, rhs) = zero<value_type>();
             // apply new Givens rotation to RHS of least-squares problem
             const auto rnc_new =
                 -conj(gs) * residual_norm_collection(iter, rhs);
@@ -123,9 +120,9 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
                 gc * residual_norm_collection(iter, rhs);
             residual_norm(0, rhs) = abs(rnc_new);
         },
-        residual_norm->get_size()[1], givens_sin, givens_cos, residual_norm,
-        residual_norm_collection, hessenberg_iter, iter,
-        residual_norm->get_size()[1], final_iter_nums, stop_status);
+        hessenberg_iter->get_size()[1], givens_sin, givens_cos, residual_norm,
+        residual_norm_collection, hessenberg_iter, iter, final_iter_nums,
+        stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp
index c10dc2562e5..f24ae445edb 100644
--- a/common/unified/solver/gmres_kernels.cpp
+++ b/common/unified/solver/gmres_kernels.cpp
@@ -111,8 +111,10 @@ void multi_dot(std::shared_ptr<const DefaultExecutor> exec,
                    next_krylov(row, irhs);
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), hessenberg_col->get_values(),
-        gko::dim<2>{next_krylov->get_size()[0],
-                    hessenberg_col->get_size()[1] - next_krylov->get_size()[1]},
+        gko::dim<2>{
+            next_krylov->get_size()[0],
+            hessenberg_col->get_size()[0] * hessenberg_col->get_size()[1] -
+                next_krylov->get_size()[1]},
         krylov_bases, next_krylov, next_krylov->get_size()[1],
         next_krylov->get_size()[0]);
 }
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index 7a77988be98..d47eb4428ea 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -157,8 +157,8 @@ void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
         // i)
         // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:,
         // i)
-        auto hessenberg_entry = hessenberg_iter->create_submatrix(
-            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto hessenberg_entry =
+            hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs});
         auto krylov_basis = ::gko::detail::create_submatrix_helper(
             krylov_bases, dim<2>{num_rows, num_rhs},
             span{local_num_rows * i, local_num_rows * (i + 1)},
@@ -191,19 +191,18 @@ void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
     // below the diagonal in the "full" matrix are skipped, because they will
     // be used to hold the norm of next_krylov for each rhs.
     auto hessenberg_reduce = hessenberg_iter->create_submatrix(
-        span{0, 1}, span{0, num_rhs * (restart_iter + 1)});
+        span{0, restart_iter + 1}, span{0, num_rhs});
+    int message_size = static_cast<int>((restart_iter + 1) * num_rhs);
     if (experimental::mpi::requires_host_buffer(exec, comm)) {
         ::gko::detail::DenseCache<ValueType> host_reduction_buffer;
         host_reduction_buffer.init(exec->get_master(),
                                    hessenberg_reduce->get_size());
         host_reduction_buffer->copy_from(hessenberg_reduce);
         comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(),
-                        static_cast<int>(hessenberg_reduce->get_size()[1]),
-                        MPI_SUM);
+                        message_size, MPI_SUM);
         hessenberg_reduce->copy_from(host_reduction_buffer.get());
     } else {
-        comm.all_reduce(exec, hessenberg_reduce->get_values(),
-                        static_cast<int>(hessenberg_reduce->get_size()[1]),
+        comm.all_reduce(exec, hessenberg_reduce->get_values(), message_size,
                         MPI_SUM);
     }
 }
@@ -228,8 +227,8 @@ void orthogonalize_cgs(matrix::Dense<ValueType>* hessenberg_iter,
     for (size_type i = 0; i <= restart_iter; i++) {
         // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:,
         // i)
-        auto hessenberg_entry = hessenberg_iter->create_submatrix(
-            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto hessenberg_entry =
+            hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs});
         auto krylov_col = ::gko::detail::create_submatrix_helper(
             krylov_bases, dim<2>{num_rows, num_rhs},
             span{local_num_rows * i, local_num_rows * (i + 1)},
@@ -260,8 +259,8 @@ void orthogonalize_cgs2(matrix::Dense<ValueType>* hessenberg_iter,
     for (size_type i = 0; i <= restart_iter; i++) {
         // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:,
         // i)
-        auto hessenberg_entry = hessenberg_iter->create_submatrix(
-            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto hessenberg_entry =
+            hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs});
         auto krylov_col = ::gko::detail::create_submatrix_helper(
             krylov_bases, dim<2>{num_rows, num_rhs},
             span{local_num_rows * i, local_num_rows * (i + 1)},
@@ -270,7 +269,7 @@ void orthogonalize_cgs2(matrix::Dense<ValueType>* hessenberg_iter,
     }
     // Re-orthogonalize
     auto hessenberg_aux_iter = hessenberg_aux->create_submatrix(
-        span{0, 1}, span{0, (restart_iter + 2) * num_rhs});
+        span{0, restart_iter + 2}, span{0, num_rhs});
     exec->run(gmres::make_multi_dot(
         gko::detail::get_local(krylov_basis_small.get()),
         gko::detail::get_local(next_krylov), hessenberg_aux_iter.get()));
@@ -280,8 +279,8 @@ void orthogonalize_cgs2(matrix::Dense<ValueType>* hessenberg_iter,
     for (size_type i = 0; i <= restart_iter; i++) {
         // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:,
         // i)
-        auto hessenberg_entry = hessenberg_aux->create_submatrix(
-            span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs});
+        auto hessenberg_entry =
+            hessenberg_aux->create_submatrix(span{i, i + 1}, span{0, num_rhs});
         auto krylov_col = ::gko::detail::create_submatrix_helper(
             krylov_bases, dim<2>{num_rows, num_rhs},
             span{local_num_rows * i, local_num_rows * (i + 1)},
@@ -353,10 +352,13 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
     // Krylov basis vector, for the (j % num_rhs)th RHS vector.
     auto hessenberg = this->template create_workspace_op<LocalVector>(
         ws::hessenberg, dim<2>{krylov_dim, (krylov_dim + 1) * num_rhs});
+    // Because the auxiliary Hessenberg workspace only ever stores one
+    // iteration of data at a time, we store it in the "logical" layout
+    // from the start.
     LocalVector* hessenberg_aux = nullptr;
     if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) {
         hessenberg_aux = this->template create_workspace_op<LocalVector>(
-            ws::hessenberg_aux, dim<2>{1, (krylov_dim + 1) * num_rhs});
+            ws::hessenberg_aux, dim<2>{(krylov_dim + 1), num_rhs});
     }
     auto givens_sin = this->template create_workspace_op<LocalVector>(
         ws::givens_sin, dim<2>{krylov_dim, num_rhs});
@@ -506,12 +508,16 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
         this->get_preconditioner()->apply(this_krylov,
                                           preconditioned_krylov_vector);
 
-        // Create view of current "column" in the hessenberg matrix:
+        // Create view of current column in the hessenberg matrix:
         // hessenberg_iter = hessenberg(:, restart_iter), which
-        // is actually stored as a row, hessenberg(restart_iter, :)
-        auto hessenberg_iter =
-            hessenberg->create_submatrix(span{restart_iter, restart_iter + 1},
-                                         span{0, num_rhs * (restart_iter + 2)});
+        // is actually stored as a row, hessenberg(restart_iter, :),
+        // but we will reshape it for viewing in hessenberg_iter.
+        auto hessenberg_iter = LocalVector::create(
+            exec, dim<2>{restart_iter + 2, num_rhs},
+            make_array_view(exec, (restart_iter + 2) * num_rhs,
+                            hessenberg->get_values() +
+                                restart_iter * hessenberg->get_size()[1]),
+            num_rhs);
 
         // Start of Arnoldi
         // next_krylov = A * preconditioned_krylov_vector
@@ -537,8 +543,7 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
         // (stored in hessenberg(restart_iter, (restart_iter + 1) * num_rhs))
         // next_krylov /= hessenberg(restart_iter+1, restart_iter)
         auto hessenberg_norm_entry = hessenberg_iter->create_submatrix(
-            span{0, 1},
-            span{(restart_iter + 1) * num_rhs, (restart_iter + 2) * num_rhs});
+            span{restart_iter + 1, restart_iter + 2}, span{0, num_rhs});
         help_compute_norm<ValueType>::compute_next_krylov_norm_into_hessenberg(
             next_krylov.get(), hessenberg_norm_entry.get(),
             next_krylov_norm_tmp, reduction_tmp);
diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp
index 122c224d5c1..4ba091e03ae 100644
--- a/reference/solver/common_gmres_kernels.cpp
+++ b/reference/solver/common_gmres_kernels.cpp
@@ -30,15 +30,14 @@ template <typename ValueType>
 void calculate_sin_and_cos(matrix::Dense<ValueType>* givens_sin,
                            matrix::Dense<ValueType>* givens_cos,
                            matrix::Dense<ValueType>* hessenberg_iter,
-                           size_type iter, const size_type num_rhs,
-                           const size_type rhs)
+                           size_type iter, const size_type rhs)
 {
-    if (is_zero(hessenberg_iter->at(0, iter * num_rhs + rhs))) {
+    if (is_zero(hessenberg_iter->at(iter, rhs))) {
         givens_cos->at(iter, rhs) = zero<ValueType>();
         givens_sin->at(iter, rhs) = one<ValueType>();
     } else {
-        auto this_hess = hessenberg_iter->at(0, iter * num_rhs + rhs);
-        auto next_hess = hessenberg_iter->at(0, (iter + 1) * num_rhs + rhs);
+        auto this_hess = hessenberg_iter->at(iter, rhs);
+        auto next_hess = hessenberg_iter->at(iter + 1, rhs);
         const auto scale = abs(this_hess) + abs(next_hess);
         const auto hypotenuse =
             scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) +
@@ -53,24 +52,19 @@ template <typename ValueType>
 void givens_rotation(matrix::Dense<ValueType>* givens_sin,
                      matrix::Dense<ValueType>* givens_cos,
                      matrix::Dense<ValueType>* hessenberg_iter, size_type iter,
-                     const size_type num_rhs,
                      const stopping_status* stop_status)
 {
-    for (size_type i = 0; i < num_rhs; ++i) {
+    for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) {
         if (stop_status[i].has_stopped()) {
             continue;
         }
         for (size_type j = 0; j < iter; ++j) {
-            auto temp =
-                givens_cos->at(j, i) * hessenberg_iter->at(0, j * num_rhs + i) +
-                givens_sin->at(j, i) *
-                    hessenberg_iter->at(0, (j + 1) * num_rhs + i);
-            hessenberg_iter->at(0, (j + 1) * num_rhs + i) =
-                -conj(givens_sin->at(j, i)) *
-                    hessenberg_iter->at(0, j * num_rhs + i) +
-                conj(givens_cos->at(j, i)) *
-                    hessenberg_iter->at(0, (j + 1) * num_rhs + i);
-            hessenberg_iter->at(0, j * num_rhs + i) = temp;
+            auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) +
+                        givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i);
+            hessenberg_iter->at(j + 1, i) =
+                -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) +
+                conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i);
+            hessenberg_iter->at(j, i) = temp;
             // temp             =  cos(j)*hessenberg(j) +
             //                     sin(j)*hessenberg(j+1)
             // hessenberg(j+1)  = -conj(sin(j))*hessenberg(j) +
@@ -78,15 +72,12 @@ void givens_rotation(matrix::Dense<ValueType>* givens_sin,
             // hessenberg(j)    =  temp;
         }
 
-        calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter,
-                              num_rhs, i);
+        calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i);
 
-        hessenberg_iter->at(0, iter * num_rhs + i) =
-            givens_cos->at(iter, i) *
-                hessenberg_iter->at(0, iter * num_rhs + i) +
-            givens_sin->at(iter, i) *
-                hessenberg_iter->at(0, (iter + 1) * num_rhs + i);
-        hessenberg_iter->at(0, (iter + 1) * num_rhs + i) = zero<ValueType>();
+        hessenberg_iter->at(iter, i) =
+            givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) +
+            givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i);
+        hessenberg_iter->at(iter + 1, i) = zero<ValueType>();
         // hessenberg(iter)   = cos(iter)*hessenberg(iter) +
         //                      sin(iter)*hessenberg(iter + 1)
         // hessenberg(iter+1) = 0
@@ -160,8 +151,7 @@ void hessenberg_qr(std::shared_ptr<const ReferenceExecutor> exec,
         }
     }
 
-    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter,
-                    residual_norm->get_size()[1], stop_status);
+    givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, stop_status);
     calculate_next_residual_norm(givens_sin, givens_cos, residual_norm,
                                  residual_norm_collection, iter, stop_status);
 }
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index 4c482632353..a7f5a751a3b 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -79,14 +79,14 @@ void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
 {
     auto num_rhs = next_krylov->get_size()[1];
     auto krylov_bases_rowoffset = next_krylov->get_size()[0];
-    for (size_type i = 0; i < hessenberg_col->get_size()[1]; ++i) {
-        auto ivec = i / num_rhs;
-        auto irhs = i % num_rhs;
-        hessenberg_col->at(0, i) = zero<ValueType>();
-        for (size_type j = 0; j < krylov_bases_rowoffset; ++j) {
-            hessenberg_col->at(0, i) +=
-                krylov_bases->at(ivec * krylov_bases_rowoffset + j, irhs) *
-                next_krylov->at(j, irhs);
+    for (size_type i = 0; i < hessenberg_col->get_size()[0] - 1; ++i) {
+        for (size_type k = 0; k < num_rhs; ++k) {
+            hessenberg_col->at(i, k) = zero<ValueType>();
+            for (size_type j = 0; j < krylov_bases_rowoffset; ++j) {
+                hessenberg_col->at(i, k) +=
+                    conj(krylov_bases->at(i * krylov_bases_rowoffset + j, k)) *
+                    next_krylov->at(j, k);
+            }
         }
     }
 }
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index bc877e0ed76..7bbb30fff11 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -227,12 +227,19 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0)
     this->small_final_iter_nums.get_data()[0] = 0;
     this->small_final_iter_nums.get_data()[1] = 0;
 
+    // Reshape into "hessenberg_iter" columns as done in Gmres
+    auto hessenberg_iter_rows = this->small_givens_sin->get_size()[0] + 1;
+    auto hessenberg_iter_cols = this->small_givens_sin->get_size()[1];
+    auto hessenberg_reshape = Mtx::create(
+        this->exec, gko::dim<2>{hessenberg_iter_rows, hessenberg_iter_cols},
+        make_array_view(this->exec, hessenberg_iter_rows * hessenberg_iter_cols,
+                        this->small_hessenberg->get_values()),
+        hessenberg_iter_cols);
     gko::kernels::reference::common_gmres::hessenberg_qr(
         this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(),
         this->small_residual_norm.get(),
-        this->small_residual_norm_collection.get(),
-        this->small_hessenberg.get(), iteration,
-        this->small_final_iter_nums.get_data(),
+        this->small_residual_norm_collection.get(), hessenberg_reshape.get(),
+        iteration, this->small_final_iter_nums.get_data(),
         this->small_stop.get_const_data());
 
     ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 1);
@@ -272,12 +279,19 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1)
     this->small_final_iter_nums.get_data()[0] = 1;
     this->small_final_iter_nums.get_data()[1] = 1;
 
+    // Reshape into "hessenberg_iter" columns as done in Gmres
+    auto hessenberg_iter_rows = this->small_givens_sin->get_size()[0] + 1;
+    auto hessenberg_iter_cols = this->small_givens_sin->get_size()[1];
+    auto hessenberg_reshape = Mtx::create(
+        this->exec, gko::dim<2>{hessenberg_iter_rows, hessenberg_iter_cols},
+        make_array_view(this->exec, hessenberg_iter_rows * hessenberg_iter_cols,
+                        this->small_hessenberg->get_values()),
+        hessenberg_iter_cols);
     gko::kernels::reference::common_gmres::hessenberg_qr(
         this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(),
         this->small_residual_norm.get(),
-        this->small_residual_norm_collection.get(),
-        this->small_hessenberg.get(), iteration,
-        this->small_final_iter_nums.get_data(),
+        this->small_residual_norm_collection.get(), hessenberg_reshape.get(),
+        iteration, this->small_final_iter_nums.get_data(),
         this->small_stop.get_const_data());
 
     ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 2);
@@ -372,9 +386,13 @@ TYPED_TEST(Gmres, KernelMultiDot)
     const T nan = std::numeric_limits<gko::remove_complex<T>>::quiet_NaN();
     const auto restart = this->small_givens_sin->get_size()[0];
     this->small_hessenberg->fill(gko::zero<T>());
-    auto hessenberg_iter = this->small_hessenberg->create_submatrix(
-        gko::span{0, 1},
-        gko::span{0, (restart + 1) * this->small_x->get_size()[1]});
+    // Reshape into "hessenberg_iter" columns as done in Gmres
+    auto hessenberg_iter = Mtx::create(
+        this->exec, gko::dim<2>{restart + 1, this->small_x->get_size()[1]},
+        make_array_view(this->exec,
+                        (restart + 1) * this->small_x->get_size()[1],
+                        this->small_hessenberg->get_values()),
+        this->small_x->get_size()[1]);
     this->small_x = gko::initialize<Mtx>(  // next_krylov
         {I<T>{-1.0, 2.3}, I<T>{-14.0, -22.0}, I<T>{8.4, 14.2}}, this->exec);
 
@@ -396,7 +414,7 @@ TYPED_TEST(Gmres, KernelMultiDot)
         hessenberg_iter.get());
 
     GKO_ASSERT_MTX_NEAR(hessenberg_iter,
-                        l({{-3.8, -48.6, -23.6, -65.1, -43.4, -81.6}}),
+                        l({{-3.8, -48.6}, {-23.6, -65.1}, {0.0, 0.0}}),
                         r<T>::value);
 }
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index fb2eab5c040..d4dcbf19318 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -78,7 +78,7 @@ class Gmres : public CommonTestFixture {
             gen_mtx(gko::solver::gmres_default_krylov_dim,
                     (gko::solver::gmres_default_krylov_dim + 1) * nrhs);
         hessenberg_iter =
-            gen_mtx(1, (gko::solver::gmres_default_krylov_dim + 1) * nrhs);
+            gen_mtx(gko::solver::gmres_default_krylov_dim + 1, nrhs);
         residual = gen_mtx(m, nrhs);
         residual_norm = gen_mtx<norm_type>(1, nrhs);
         residual_norm_collection =
@@ -308,11 +308,11 @@ TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef)
     // solver's restart would be triggered, so it is only the final row of
     // the Hessenberg column(s) that we ignore.
     auto hessenberg_iter_small = hessenberg_iter->create_submatrix(
-        gko::span{0, 1},
-        gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]});
+        gko::span{0, gko::solver::gmres_default_krylov_dim + 1},
+        gko::span{0, x->get_size()[1]});
     auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix(
-        gko::span{0, 1},
-        gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]});
+        gko::span{0, gko::solver::gmres_default_krylov_dim + 1},
+        gko::span{0, x->get_size()[1]});
     GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small,
                         r<value_type>::value);
 }

From 1344522314e8823f83c8995da7a6f53b18d9ffd1 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Mon, 22 Jul 2024 23:24:49 +0000
Subject: [PATCH 117/448] gmres: minor spacing and namespace fixes

---
 core/solver/gmres.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index d47eb4428ea..dbad7d8d1d8 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -52,6 +52,7 @@ std::ostream& operator<<(std::ostream& stream, orthog_method orthog)
     return stream;
 }
 
+
 }  // namespace gmres
 
 
@@ -142,7 +143,7 @@ struct help_compute_norm {
     }
 };
 
-namespace {
+
 // Orthogonalization helper functions
 template <typename ValueType, typename VectorType>
 void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
@@ -169,6 +170,7 @@ void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
     }
 }
 
+
 template <typename ValueType>
 void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
                    matrix::Dense<ValueType>* next_krylov,
@@ -177,6 +179,7 @@ void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
     return;
 }
 
+
 #if GINKGO_BUILD_MPI
 template <typename ValueType>
 void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
@@ -208,6 +211,7 @@ void finish_reduce(matrix::Dense<ValueType>* hessenberg_iter,
 }
 #endif
 
+
 template <typename ValueType, typename VectorType>
 void orthogonalize_cgs(matrix::Dense<ValueType>* hessenberg_iter,
                        VectorType* krylov_bases, VectorType* next_krylov,
@@ -290,7 +294,7 @@ void orthogonalize_cgs2(matrix::Dense<ValueType>* hessenberg_iter,
     // Add both Hessenberg columns
     hessenberg_iter->add_scaled(one_op, hessenberg_aux_iter);
 }
-}  // anonymous namespace
+
 
 template <typename ValueType>
 struct help_compute_norm<ValueType,
@@ -307,6 +311,7 @@ struct help_compute_norm<ValueType,
     }
 };
 
+
 template <typename ValueType>
 template <typename VectorType>
 void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,

From 15995da4ccd746938fd6b889f027ce2cf0aed91e Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Mon, 22 Jul 2024 23:27:00 +0000
Subject: [PATCH 118/448] Simplify multi_dot kernel test

---
 test/solver/gmres_kernels.cpp | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index d4dcbf19318..a084e17fbdc 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -301,19 +301,7 @@ TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef)
         exec, d_krylov_basis.get(), d_next_krylov.get(),
         d_hessenberg_iter.get());
 
-    // The multidot computation does not set the value below the diagonal
-    // in the Hessenberg matrix column(s), as that is done after the
-    // orthogonalization of the next basis vector. In this test, we
-    // are checking the column(s) created on the last iteration before the
-    // solver's restart would be triggered, so it is only the final row of
-    // the Hessenberg column(s) that we ignore.
-    auto hessenberg_iter_small = hessenberg_iter->create_submatrix(
-        gko::span{0, gko::solver::gmres_default_krylov_dim + 1},
-        gko::span{0, x->get_size()[1]});
-    auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix(
-        gko::span{0, gko::solver::gmres_default_krylov_dim + 1},
-        gko::span{0, x->get_size()[1]});
-    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small,
+    GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter,
                         r<value_type>::value);
 }
 

From ec25ea3765c52f030413c1e00b287fe0151f5424 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Mon, 5 Aug 2024 19:23:26 +0000
Subject: [PATCH 119/448] Rename orthog_method to ortho_method

---
 core/solver/gmres.cpp                   | 35 ++++++++++++-------------
 core/test/config/solver.cpp             |  6 ++---
 include/ginkgo/core/solver/gmres.hpp    |  8 +++---
 reference/test/solver/gmres_kernels.cpp | 10 +++----
 test/mpi/solver/solver.cpp              | 12 ++++-----
 test/solver/gmres_kernels.cpp           | 12 ++++-----
 6 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index dbad7d8d1d8..e47714b2186 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -39,14 +39,14 @@ GKO_REGISTER_OPERATION(multi_dot, gmres::multi_dot);
 }  // anonymous namespace
 
 
-std::ostream& operator<<(std::ostream& stream, orthog_method orthog)
+std::ostream& operator<<(std::ostream& stream, ortho_method ortho)
 {
-    switch (orthog) {
-    case orthog_method::mgs:
+    switch (ortho) {
+    case ortho_method::mgs:
         return stream << "mgs";
-    case orthog_method::cgs:
+    case ortho_method::cgs:
         return stream << "cgs";
-    case orthog_method::cgs2:
+    case ortho_method::cgs2:
         return stream << "cgs2";
     }
     return stream;
@@ -69,19 +69,19 @@ typename Gmres<ValueType>::parameters_type Gmres<ValueType>::parse(
     if (auto& obj = config.get("flexible")) {
         params.with_flexible(gko::config::get_value<bool>(obj));
     }
-    if (auto& obj = config.get("orthog_method")) {
+    if (auto& obj = config.get("ortho_method")) {
         auto str = obj.get_string();
-        gmres::orthog_method orthog;
+        gmres::ortho_method ortho;
         if (str == "mgs") {
-            orthog = gmres::orthog_method::mgs;
+            ortho = gmres::ortho_method::mgs;
         } else if (str == "cgs") {
-            orthog = gmres::orthog_method::cgs;
+            ortho = gmres::ortho_method::cgs;
         } else if (str == "cgs2") {
-            orthog = gmres::orthog_method::cgs2;
+            ortho = gmres::ortho_method::cgs2;
         } else {
-            GKO_INVALID_CONFIG_VALUE("orthog_method", str);
+            GKO_INVALID_CONFIG_VALUE("ortho_method", str);
         }
-        params.with_orthog_method(orthog);
+        params.with_ortho_method(ortho);
     }
     return params;
 }
@@ -361,7 +361,7 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
     // iteration of data at a time, we store it in the "logical" layout
     // from the start.
     LocalVector* hessenberg_aux = nullptr;
-    if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) {
+    if (this->parameters_.ortho_method == gmres::ortho_method::cgs2) {
         hessenberg_aux = this->template create_workspace_op<LocalVector>(
             ws::hessenberg_aux, dim<2>{(krylov_dim + 1), num_rhs});
     }
@@ -528,17 +528,16 @@ void Gmres<ValueType>::apply_dense_impl(const VectorType* dense_b,
         // next_krylov = A * preconditioned_krylov_vector
         this->get_system_matrix()->apply(preconditioned_krylov_vector,
                                          next_krylov);
-        if (this->parameters_.orthog_method == gmres::orthog_method::mgs) {
+        if (this->parameters_.ortho_method == gmres::ortho_method::mgs) {
             orthogonalize_mgs(hessenberg_iter.get(), krylov_bases,
                               next_krylov.get(), reduction_tmp, restart_iter,
                               num_rows, num_rhs, local_num_rows);
-        } else if (this->parameters_.orthog_method ==
-                   gmres::orthog_method::cgs) {
+        } else if (this->parameters_.ortho_method == gmres::ortho_method::cgs) {
             orthogonalize_cgs(hessenberg_iter.get(), krylov_bases,
                               next_krylov.get(), restart_iter, num_rows,
                               num_rhs, local_num_rows);
-        } else if (this->parameters_.orthog_method ==
-                   gmres::orthog_method::cgs2) {
+        } else if (this->parameters_.ortho_method ==
+                   gmres::ortho_method::cgs2) {
             orthogonalize_cgs2(hessenberg_iter.get(), krylov_bases,
                                next_krylov.get(), hessenberg_aux, one_op,
                                restart_iter, num_rows, num_rhs, local_num_rows);
diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp
index 78f1f7351f8..a170ebb1e04 100644
--- a/core/test/config/solver.cpp
+++ b/core/test/config/solver.cpp
@@ -289,8 +289,8 @@ struct Gmres
         param.with_krylov_dim(3u);
         config_map["flexible"] = pnode{true};
         param.with_flexible(true);
-        config_map["orthog_method"] = pnode{"cgs"};
-        param.with_orthog_method(gko::solver::gmres::orthog_method::cgs);
+        config_map["ortho_method"] = pnode{"cgs"};
+        param.with_ortho_method(gko::solver::gmres::ortho_method::cgs);
     }
 
     template <bool from_reg, typename AnswerType>
@@ -302,7 +302,7 @@ struct Gmres
         solver_config_test::template validate<from_reg>(result, answer);
         ASSERT_EQ(res_param.krylov_dim, ans_param.krylov_dim);
         ASSERT_EQ(res_param.flexible, ans_param.flexible);
-        ASSERT_EQ(res_param.orthog_method, ans_param.orthog_method);
+        ASSERT_EQ(res_param.ortho_method, ans_param.ortho_method);
     }
 };
 
diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp
index 308dadf5218..3ba3acf94bb 100644
--- a/include/ginkgo/core/solver/gmres.hpp
+++ b/include/ginkgo/core/solver/gmres.hpp
@@ -35,7 +35,7 @@ namespace gmres {
 /**
  * Set the orthogonalization method for the Krylov subspace.
  */
-enum class orthog_method {
+enum class ortho_method {
     /**
      * Modified Gram-Schmidt (default)
      */
@@ -51,7 +51,7 @@ enum class orthog_method {
 };
 
 /** Prints an orthogonalization method. */
-std::ostream& operator<<(std::ostream& stream, orthog_method orthog);
+std::ostream& operator<<(std::ostream& stream, ortho_method ortho);
 
 }  // namespace gmres
 
@@ -118,8 +118,8 @@ class Gmres
         bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false);
 
         /** Orthogonalization method */
-        gmres::orthog_method GKO_FACTORY_PARAMETER_SCALAR(
-            orthog_method, gmres::orthog_method::mgs);
+        gmres::ortho_method GKO_FACTORY_PARAMETER_SCALAR(
+            ortho_method, gmres::ortho_method::mgs);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 7bbb30fff11..3f11b087bb7 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -754,17 +754,17 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart)
 
 TYPED_TEST(Gmres, SolvesWithPreconditioner)
 {
-    using gko::solver::gmres::orthog_method;
+    using gko::solver::gmres::ortho_method;
 
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
-    for (auto orthog :
-         {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) {
-        SCOPED_TRACE(orthog);
+    for (auto ortho :
+         {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) {
+        SCOPED_TRACE(ortho);
         auto gmres_factory_preconditioner =
             Solver::build()
-                .with_orthog_method(orthog)
+                .with_ortho_method(ortho)
                 .with_criteria(
                     gko::stop::Iteration::build().with_max_iters(100u),
                     gko::stop::ResidualNorm<value_type>::build()
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index aaf61cb47ea..be9f6865c86 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -195,14 +195,14 @@ struct Ir : SimpleSolverTest<gko::solver::Ir<solver_value_type>> {
 };
 
 
-template <unsigned dimension, gko::solver::gmres::orthog_method orthog>
+template <unsigned dimension, gko::solver::gmres::ortho_method ortho>
 struct Gmres : SimpleSolverTest<gko::solver::Gmres<solver_value_type>> {
     static typename solver_type::parameters_type build(
         std::shared_ptr<const gko::Executor> exec)
     {
         return SimpleSolverTest<gko::solver::Gmres<solver_value_type>>::build(
                    std::move(exec))
-            .with_orthog_method(orthog)
+            .with_ortho_method(ortho)
             .with_krylov_dim(dimension);
     }
 };
@@ -532,10 +532,10 @@ class Solver : public CommonMpiTestFixture {
 
 using SolverTypes =
     ::testing::Types<Cg, CgWithMg, Cgs, Fcg, Bicgstab, Ir, Gcr<10u>, Gcr<100u>,
-                     Gmres<10u, gko::solver::gmres::orthog_method::mgs>,
-                     Gmres<10u, gko::solver::gmres::orthog_method::cgs>,
-                     Gmres<10u, gko::solver::gmres::orthog_method::cgs2>,
-                     Gmres<100u, gko::solver::gmres::orthog_method::mgs>>;
+                     Gmres<10u, gko::solver::gmres::ortho_method::mgs>,
+                     Gmres<10u, gko::solver::gmres::ortho_method::cgs>,
+                     Gmres<10u, gko::solver::gmres::ortho_method::cgs2>,
+                     Gmres<100u, gko::solver::gmres::ortho_method::mgs>>;
 
 TYPED_TEST_SUITE(Solver, SolverTypes, TypenameNameGenerator);
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index a084e17fbdc..72cbc83b002 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -327,18 +327,18 @@ TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef)
 
 TEST_F(Gmres, GmresApplyMultipleRHSIsEquivalentToRef)
 {
-    using gko::solver::gmres::orthog_method;
+    using gko::solver::gmres::ortho_method;
     auto base_params = gko::clone(ref, ref_gmres_factory)->get_parameters();
 
-    for (auto orthog :
-         {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) {
-        SCOPED_TRACE(orthog);
+    for (auto ortho :
+         {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) {
+        SCOPED_TRACE(ortho);
         int m = 123;
         int n = 5;
         auto ref_solver =
-            base_params.with_orthog_method(orthog).on(ref)->generate(mtx);
+            base_params.with_ortho_method(ortho).on(ref)->generate(mtx);
         auto exec_solver =
-            base_params.with_orthog_method(orthog).on(exec)->generate(d_mtx);
+            base_params.with_ortho_method(ortho).on(exec)->generate(d_mtx);
         auto b = gen_mtx(m, n);
         auto x = gen_mtx(m, n);
         auto d_b = gko::clone(exec, b);

From f47f94ee3ab9d80a79e63f085fb422b68c4d1c6e Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 19 Jun 2024 09:33:05 +0200
Subject: [PATCH 120/448] [core] reading mm-files discards extra characters in
 row

---
 core/base/mtx_io.cpp      |  7 ++++++
 core/test/base/mtx_io.cpp | 50 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index c264a073f31..5851135607e 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -35,6 +35,9 @@ namespace {
     }
 
 
+constexpr auto max_streamsize = std::numeric_limits<std::streamsize>::max();
+
+
 /**
  * The mtx_io class provides the functionality of reading and writing matrix
  * market format files.
@@ -514,6 +517,8 @@ class mtx_io {
                 GKO_CHECK_STREAM(content, "error when reading matrix entry " +
                                               std::to_string(i));
                 modifier->insert_entry(row - 1, col - 1, entry, data);
+                content.ignore(max_streamsize,
+                               '\n');  // discards rest of the line
             }
             return data;
         }
@@ -582,6 +587,8 @@ class mtx_io {
                                          std::to_string(row) + " ," +
                                          std::to_string(col));
                     modifier->insert_entry(row, col, entry, data);
+                    content.ignore(max_streamsize,
+                                   '\n');  // discards rest of the line
                 }
             }
             return data;
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index 66b6766b2d3..8ac1ced0e50 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -231,6 +231,32 @@ TEST(MtxReader, ReadsDenseComplexFloatMtxWith64Index)
 }
 
 
+TEST(MtxReader, ReadsDenseIgnoresExtraCharactersInRow)
+{
+    using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
+    std::istringstream iss(
+        "%%MatrixMarket matrix array real general\n"
+        "2 3 -77\n"
+        "1.0\n"
+        "0.0 58\n"
+        "3.0\n"
+        "5.0\n"
+        "2.0\n"
+        "0.0\n");
+
+    auto data = gko::read_raw<double, gko::int32>(iss);
+
+    ASSERT_EQ(data.size, gko::dim<2>(2, 3));
+    auto& v = data.nonzeros;
+    ASSERT_EQ(v[0], tpl(0, 0, 1.0));
+    ASSERT_EQ(v[1], tpl(0, 1, 3.0));
+    ASSERT_EQ(v[2], tpl(0, 2, 2.0));
+    ASSERT_EQ(v[3], tpl(1, 0, 0.0));
+    ASSERT_EQ(v[4], tpl(1, 1, 5.0));
+    ASSERT_EQ(v[5], tpl(1, 2, 0.0));
+}
+
+
 TEST(MtxReader, ReadsSparseRealMtx)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
@@ -385,7 +411,29 @@ TEST(MtxReader, ReadsSparseComplexHermitianMtx)
 }
 
 
-TEST(MtxReader, ReadIgnoresExtraCharacters)
+TEST(MtxReader, ReadsSparseIgnoresExtraCharactersInRow)
+{
+    using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
+    std::istringstream iss(
+        "%%MatrixMarket matrix coordinate real general\n"
+        "2 3 4 abc\n"
+        "1 1 1.0 some value\n"
+        "2 2 5.0 who knows?\n"
+        "1 2 3.0\n"
+        "1 3 2.0\n");
+
+    auto data = gko::read_raw<double, gko::int32>(iss);
+
+    ASSERT_EQ(data.size, gko::dim<2>(2, 3));
+    auto& v = data.nonzeros;
+    ASSERT_EQ(v[0], tpl(0, 0, 1.0));
+    ASSERT_EQ(v[1], tpl(0, 1, 3.0));
+    ASSERT_EQ(v[2], tpl(0, 2, 2.0));
+    ASSERT_EQ(v[3], tpl(1, 1, 5.0));
+}
+
+
+TEST(MtxReader, ReadHeaderIgnoresExtraCharacters)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
     std::istringstream iss(

From 8d84ccfbeee0aae15c252a5956171e57ce411ad2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 13 Aug 2024 11:57:49 +0200
Subject: [PATCH 121/448] review updates:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- formatting
- fix error message

Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@tum.de>
Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 core/base/mtx_io.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index 5851135607e..33c3b07d487 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -517,8 +517,8 @@ class mtx_io {
                 GKO_CHECK_STREAM(content, "error when reading matrix entry " +
                                               std::to_string(i));
                 modifier->insert_entry(row - 1, col - 1, entry, data);
-                content.ignore(max_streamsize,
-                               '\n');  // discards rest of the line
+                // discards rest of the line
+                content.ignore(max_streamsize, '\n');
             }
             return data;
         }
@@ -574,7 +574,7 @@ class mtx_io {
             size_type num_cols{};
             GKO_CHECK_STREAM(
                 header >> num_rows >> num_cols,
-                "error when determining matrix size, expected: rows cols nnz");
+                "error when determining matrix size, expected: rows cols");
             matrix_data<ValueType, IndexType> data(dim<2>{num_rows, num_cols});
             data.nonzeros.reserve(modifier->get_reservation_size(
                 num_rows, num_cols, num_rows * num_cols));
@@ -587,8 +587,8 @@ class mtx_io {
                                          std::to_string(row) + " ," +
                                          std::to_string(col));
                     modifier->insert_entry(row, col, entry, data);
-                    content.ignore(max_streamsize,
-                                   '\n');  // discards rest of the line
+                    // discards rest of the line
+                    content.ignore(max_streamsize, '\n');
                 }
             }
             return data;

From a328701aef8f5344f5c8efaf1caacda5418f721c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 11 Aug 2024 14:50:39 +0200
Subject: [PATCH 122/448] fix uninitialized array alignment

---
 .../components/uninitialized_array.hpp        | 43 ++++++++++++++++---
 .../cuda_hip/matrix/csr_kernels.template.cpp  |  2 +-
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp
index 44fcbfd0d85..8929476fbd6 100644
--- a/common/cuda_hip/components/uninitialized_array.hpp
+++ b/common/cuda_hip/components/uninitialized_array.hpp
@@ -8,6 +8,8 @@
 
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/thrust.hpp"
+
 
 namespace gko {
 namespace kernels {
@@ -34,7 +36,7 @@ class uninitialized_array {
      */
     constexpr GKO_ATTRIBUTES operator const ValueType*() const noexcept
     {
-        return &(*this)[0];
+        return data_;
     }
 
     /**
@@ -43,7 +45,7 @@ class uninitialized_array {
      *
      * @return the non-const pointer to the first entry of the array.
      */
-    GKO_ATTRIBUTES operator ValueType*() noexcept { return &(*this)[0]; }
+    GKO_ATTRIBUTES operator ValueType*() noexcept { return data_; }
 
     /**
      * constexpr array access operator.
@@ -56,7 +58,7 @@ class uninitialized_array {
     constexpr GKO_ATTRIBUTES const ValueType& operator[](
         size_type pos) const noexcept
     {
-        return reinterpret_cast<const ValueType*>(data_)[pos];
+        return data_[pos];
     }
 
     /**
@@ -69,11 +71,42 @@ class uninitialized_array {
      */
     GKO_ATTRIBUTES ValueType& operator[](size_type pos) noexcept
     {
-        return reinterpret_cast<ValueType*>(data_)[pos];
+        return data_[pos];
+    }
+
+private:
+    ValueType data_[size];
+};
+
+
+template <typename ValueType, size_type size>
+class uninitialized_array<thrust::complex<ValueType>, size> {
+public:
+    constexpr GKO_ATTRIBUTES operator const thrust::complex<ValueType>*()
+        const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    GKO_ATTRIBUTES operator thrust::complex<ValueType>*() noexcept
+    {
+        return &(*this)[0];
+    }
+
+    constexpr GKO_ATTRIBUTES const thrust::complex<ValueType>& operator[](
+        size_type pos) const noexcept
+    {
+        return reinterpret_cast<const thrust::complex<ValueType>*>(data_)[pos];
+    }
+
+    GKO_ATTRIBUTES thrust::complex<ValueType>& operator[](
+        size_type pos) noexcept
+    {
+        return reinterpret_cast<thrust::complex<ValueType>*>(data_)[pos];
     }
 
 private:
-    unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
+    ValueType data_[2 * size];
 };
 
 
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index f17cf1548fe..757e689a777 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -335,7 +335,7 @@ __device__ void merge_path_reduce(const IndexType nwarps,
             }
         }
     }
-    __shared__ uninitialized_array<IndexType, spmv_block_size> tmp_ind;
+    __shared__ IndexType tmp_ind[spmv_block_size];
     __shared__ uninitialized_array<arithmetic_type, spmv_block_size> tmp_val;
     tmp_val[threadIdx.x] = value;
     tmp_ind[threadIdx.x] = row;

From dc49b315d7451f65208104b9b37b4433a5066bca Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 12 Aug 2024 16:38:13 +0200
Subject: [PATCH 123/448] fix NaN handling on Windows

MSVC doesn't handle is_nan properly, so we do a bitwise comparison instead
---
 cuda/solver/common_trs_kernels.cuh | 13 +++---
 include/ginkgo/core/base/math.hpp  | 63 +++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 7cedf2fbd2e..31ba6f0c19f 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -399,11 +399,12 @@ __global__ void sptrsv_naive_caching_kernel(
         ValueType val{};
         if (shmem_possible) {
             const auto dependency_shid = dependency_gid % default_block_size;
-            while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) {
+            while (is_nan_exact(
+                val = load_relaxed_shared(x_s + dependency_shid))) {
             }
         } else {
-            while (
-                is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) {
+            while (is_nan_exact(
+                val = load_relaxed(x + dependency * x_stride + rhs))) {
             }
         }
 
@@ -418,7 +419,7 @@ __global__ void sptrsv_naive_caching_kernel(
     store_relaxed(x + row * x_stride + rhs, r);
 
     // This check to ensure no infinite loops happen.
-    if (is_nan(r)) {
+    if (is_nan_exact(r)) {
         store_relaxed_shared(x_s + self_shid, zero<ValueType>());
         store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
         *nan_produced = true;
@@ -460,7 +461,7 @@ __global__ void sptrsv_naive_legacy_kernel(
     auto col = colidxs[j];
     while (j != row_end) {
         auto x_val = load_relaxed(x + col * x_stride + rhs);
-        while (!is_nan(x_val)) {
+        while (!is_nan_exact(x_val)) {
             sum += vals[j] * x_val;
             j += row_step;
             col = colidxs[j];
@@ -478,7 +479,7 @@ __global__ void sptrsv_naive_legacy_kernel(
             // after we encountered the diagonal, we are done
             // this also skips entries outside the triangle
             j = row_end;
-            if (is_nan(r)) {
+            if (is_nan_exact(r)) {
                 store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
                 *nan_produced = true;
             }
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index f7b3b35c3f6..128712f0974 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -9,6 +9,7 @@
 #include <cmath>
 #include <complex>
 #include <cstdlib>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 #include <utility>
@@ -102,6 +103,21 @@ using std::sqrt;
 namespace detail {
 
 
+/** Returns an unsigned type matching the size of the given float type. */
+template <typename T>
+struct float_to_bytes_impl {};
+
+template <>
+struct float_to_bytes_impl<double> {
+    using type = uint64;
+};
+
+template <>
+struct float_to_bytes_impl<float> {
+    using type = uint32;
+};
+
+
 /**
  * Keep the same data type if it is not complex.
  */
@@ -1223,7 +1239,8 @@ template <typename T>
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
 is_nan(const T& value)
 {
-    return std::isnan(value);
+    using std::isnan;
+    return isnan(value);
 }
 
 
@@ -1240,7 +1257,7 @@ template <typename T>
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
     const T& value)
 {
-    return std::isnan(value.real()) || std::isnan(value.imag());
+    return is_nan(value.real()) || is_nan(value.imag());
 }
 
 
@@ -1274,6 +1291,48 @@ nan()
 }
 
 
+/**
+ * Checks if a floating point number is a quiet NaN (gko::nan()).
+ *
+ * @tparam T  type of the value to check
+ *
+ * @param value  value to check
+ *
+ * @return `true` if the value is bitwise equal to gko::nan<T>().
+ */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    using type = typename detail::float_to_bytes_impl<T>::type;
+    type value_bytes{};
+    type nan_bytes{};
+    auto nan_value = nan<T>();
+    using std::memcpy;
+    memcpy(&value_bytes, &value, sizeof(value));
+    memcpy(&nan_bytes, &nan_value, sizeof(value));
+    return value_bytes == nan_bytes;
+}
+
+
+/**
+ * Checks if any component of a complex value is a quiet NaN (gko::nan).
+ *
+ * @tparam T  complex type of the value to check
+ *
+ * @param value  complex value to check
+ *
+ * @return `true` if any component of the complex number fulfills
+ * is_nan_exact(component).
+ */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    return is_nan_exact(value.real()) || is_nan_exact(value.imag());
+}
+
+
 }  // namespace gko
 
 

From ec41ac3c984dd4e41584872c76e5f52ae06f555f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 12 Aug 2024 16:53:40 +0200
Subject: [PATCH 124/448] deprecate is_nan

---
 include/ginkgo/core/base/math.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 128712f0974..034ca6cbd52 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -1236,8 +1236,9 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b)
  * @return `true` if the value is NaN.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
-is_nan(const T& value)
+GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)")
+GKO_INLINE GKO_ATTRIBUTES
+    std::enable_if_t<!is_complex_s<T>::value, bool> is_nan(const T& value)
 {
     using std::isnan;
     return isnan(value);
@@ -1254,6 +1255,7 @@ is_nan(const T& value)
  * @return `true` if any component of the given value is NaN.
  */
 template <typename T>
+GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)")
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
     const T& value)
 {

From a2c0cef2fda4c3a2b0b833fcc85e7c91509be401 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 12 Aug 2024 16:53:52 +0200
Subject: [PATCH 125/448] run tests for Windows CUDA

---
 .gitlab-ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2f8e3a892a5..226a10f4cea 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -486,8 +486,7 @@ build/windows-cuda/release/shared:
     - mkdir install
     - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
     - cmake --build build --config Release -j16
-# we disable these tests until the triangular solver issues are resolved
-#   - ctest --test-dir build -C Release --no-tests=error --output-on-failure
+    - ctest --test-dir build -C Release --no-tests=error --output-on-failure
     - $env:PATH+=";$pwd/install/bin"
     - cmake --install build --config Release
     - cmake --build build --target test_install --config Release

From f2f8617f66d3e3363564a3172c137fcaef15cac8 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 12 Aug 2024 18:05:38 +0200
Subject: [PATCH 126/448] review updates and fixes

---
 include/ginkgo/core/base/math.hpp | 8 ++++++--
 test/solver/gcr_kernels.cpp       | 2 +-
 test/solver/lower_trs_kernels.cpp | 2 +-
 test/solver/upper_trs_kernels.cpp | 2 +-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 034ca6cbd52..f32f47eda2f 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -1236,7 +1236,9 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b)
  * @return `true` if the value is NaN.
  */
 template <typename T>
-GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)")
+GKO_DEPRECATED(
+    "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
+    "removed in a future release, without replacement")
 GKO_INLINE GKO_ATTRIBUTES
     std::enable_if_t<!is_complex_s<T>::value, bool> is_nan(const T& value)
 {
@@ -1255,7 +1257,9 @@ GKO_INLINE GKO_ATTRIBUTES
  * @return `true` if any component of the given value is NaN.
  */
 template <typename T>
-GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)")
+GKO_DEPRECATED(
+    "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
+    "removed in a future release, without replacement")
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
     const T& value)
 {
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index eb3f5c6df93..3095475538d 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -222,7 +222,7 @@ TEST_F(Gcr, GcrApplyOneRHSIsEquivalentToRef)
     exec_solver->apply(d_b.get(), d_x.get());
 
     GKO_ASSERT_MTX_NEAR(d_b, b, 0);
-    GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e2);
+    GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
 }
 
 
diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp
index b838c1df14b..9bfea0a22d0 100644
--- a/test/solver/lower_trs_kernels.cpp
+++ b/test/solver/lower_trs_kernels.cpp
@@ -152,7 +152,7 @@ TEST_F(LowerTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }
 
 
diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp
index 6825d9f6c3b..c62dfa7c5de 100644
--- a/test/solver/upper_trs_kernels.cpp
+++ b/test/solver/upper_trs_kernels.cpp
@@ -152,7 +152,7 @@ TEST_F(UpperTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }
 
 

From 31a8e692e22c0deccfcba6cf615336bd4a242690 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 12 Aug 2024 22:17:37 +0200
Subject: [PATCH 127/448] review updates

- move is_nan_exact to internal code
- rename to float_to_unsigned_impl
- increase tolerance for additional test
---
 cuda/solver/common_trs_kernels.cuh | 41 +++++++++++++++++++++
 include/ginkgo/core/base/math.hpp  | 58 ------------------------------
 test/solver/lower_trs_kernels.cpp  |  2 +-
 3 files changed, 42 insertions(+), 59 deletions(-)

diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 31ba6f0c19f..7a3712c0390 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -6,6 +6,7 @@
 #define GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_
 
 
+#include <cstring>
 #include <functional>
 #include <iostream>
 #include <memory>
@@ -342,6 +343,46 @@ constexpr int default_block_size = 512;
 constexpr int fallback_block_size = 32;
 
 
+/** Returns an unsigned type matching the size of the given float type. */
+template <typename T>
+struct float_to_unsigned_impl {};
+
+template <>
+struct float_to_unsigned_impl<double> {
+    using type = uint64;
+};
+
+template <>
+struct float_to_unsigned_impl<float> {
+    using type = uint32;
+};
+
+
+/** Checks if a floating point number is a quiet NaN (gko::nan()). */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    using type = typename float_to_unsigned_impl<T>::type;
+    type value_bytes{};
+    type nan_bytes{};
+    auto nan_value = nan<T>();
+    using std::memcpy;
+    memcpy(&value_bytes, &value, sizeof(value));
+    memcpy(&nan_bytes, &nan_value, sizeof(value));
+    return value_bytes == nan_bytes;
+}
+
+
+/** Checks if any component of a complex value is a quiet NaN (gko::nan). */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    return is_nan_exact(value.real()) || is_nan_exact(value.imag());
+}
+
+
 template <bool is_upper, typename ValueType, typename IndexType>
 __global__ void sptrsv_naive_caching_kernel(
     const IndexType* const rowptrs, const IndexType* const colidxs,
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index f32f47eda2f..f6847743717 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -9,7 +9,6 @@
 #include <cmath>
 #include <complex>
 #include <cstdlib>
-#include <cstring>
 #include <limits>
 #include <type_traits>
 #include <utility>
@@ -103,21 +102,6 @@ using std::sqrt;
 namespace detail {
 
 
-/** Returns an unsigned type matching the size of the given float type. */
-template <typename T>
-struct float_to_bytes_impl {};
-
-template <>
-struct float_to_bytes_impl<double> {
-    using type = uint64;
-};
-
-template <>
-struct float_to_bytes_impl<float> {
-    using type = uint32;
-};
-
-
 /**
  * Keep the same data type if it is not complex.
  */
@@ -1297,48 +1281,6 @@ nan()
 }
 
 
-/**
- * Checks if a floating point number is a quiet NaN (gko::nan()).
- *
- * @tparam T  type of the value to check
- *
- * @param value  value to check
- *
- * @return `true` if the value is bitwise equal to gko::nan<T>().
- */
-template <typename T>
-GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
-is_nan_exact(const T& value)
-{
-    using type = typename detail::float_to_bytes_impl<T>::type;
-    type value_bytes{};
-    type nan_bytes{};
-    auto nan_value = nan<T>();
-    using std::memcpy;
-    memcpy(&value_bytes, &value, sizeof(value));
-    memcpy(&nan_bytes, &nan_value, sizeof(value));
-    return value_bytes == nan_bytes;
-}
-
-
-/**
- * Checks if any component of a complex value is a quiet NaN (gko::nan).
- *
- * @tparam T  complex type of the value to check
- *
- * @param value  complex value to check
- *
- * @return `true` if any component of the complex number fulfills
- * is_nan_exact(component).
- */
-template <typename T>
-GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
-is_nan_exact(const T& value)
-{
-    return is_nan_exact(value.real()) || is_nan_exact(value.imag());
-}
-
-
 }  // namespace gko
 
 
diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp
index 9bfea0a22d0..da55f6153cc 100644
--- a/test/solver/lower_trs_kernels.cpp
+++ b/test/solver/lower_trs_kernels.cpp
@@ -417,7 +417,7 @@ TEST_F(LowerTrs, ClassicalApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }
 
 

From a569123968f5bcf08e5de72fae5a2285eedb6106 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 13 Aug 2024 12:54:46 +0200
Subject: [PATCH 128/448] fix library location for sparselib benchmark linops

For things to work in Windows, the shared libraries need to be in the working directory or PATH
---
 benchmark/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index de6e74d464c..e2479e02344 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -22,6 +22,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
     target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
+    ginkgo_compile_features(cusparse_linops_${type})
 endfunction()
 
 function(ginkgo_benchmark_hipsparse_linops type def)
@@ -31,6 +32,7 @@ function(ginkgo_benchmark_hipsparse_linops type def)
     target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
     target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
+    ginkgo_compile_features(hipsparse_linops_${type})
 endfunction()
 
 function(ginkgo_benchmark_onemkl_linops type def)
@@ -38,6 +40,7 @@ function(ginkgo_benchmark_onemkl_linops type def)
     # make the dependency public to catch issues
     target_compile_definitions(onemkl_linops_${type} PUBLIC ${def})
     target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP)
+    ginkgo_compile_features(onemkl_linops_${type})
 endfunction()
 
 
@@ -116,6 +119,7 @@ if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     add_library(cuda_timer utils/cuda_timer.cpp)
     target_link_libraries(cuda_timer ginkgo CUDA::cudart)
+    ginkgo_compile_features(cuda_timer)
 endif()
 if (GINKGO_BUILD_HIP)
     ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
@@ -125,6 +129,7 @@ if (GINKGO_BUILD_HIP)
     set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
     add_library(hip_timer utils/hip_timer.hip.cpp)
     target_link_libraries(hip_timer ginkgo)
+    ginkgo_compile_features(hip_timer)
 endif()
 
 if (GINKGO_BUILD_SYCL)
@@ -136,11 +141,13 @@ if (GINKGO_BUILD_SYCL)
     target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
     gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
     target_link_libraries(dpcpp_timer ginkgo)
+    ginkgo_compile_features(dpcpp_timer)
 endif()
 
 if (GINKGO_BUILD_MPI)
     add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp)
     target_link_libraries(mpi_timer ginkgo)
+    ginkgo_compile_features(mpi_timer)
 endif()
 
 add_subdirectory(blas)

From b9d5224bbdbb54486748dbd3e9dd7ae3289a0601 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 13 Aug 2024 12:55:00 +0200
Subject: [PATCH 129/448] catch invalid JSON in test framework

---
 benchmark/test/test_framework.py.in | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 48f3ca608b2..9294b2f02ec 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -90,9 +90,12 @@ def sanitize_json_text(input: str) -> List[str]:
     and pretty-printed to replace the original JSON input.
     """
 
-    result = json.dumps(sanitize_json(json.loads(input)), indent=4)
-    # json.dumps doesn't add a trailing newline
-    return result.splitlines() + [""]
+    try:
+        result = json.dumps(sanitize_json(json.loads(input)), indent=4)
+        # json.dumps doesn't add a trailing newline
+        return result.splitlines() + [""]
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 
 def sanitize_text(

From 4e92e81b795bf5a106021ddff87634e9a30e207d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 13 Aug 2024 14:41:24 +0200
Subject: [PATCH 130/448] review updates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@tum.de>
---
 cuda/solver/common_trs_kernels.cuh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 7a3712c0390..291c842325f 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -358,7 +358,10 @@ struct float_to_unsigned_impl<float> {
 };
 
 
-/** Checks if a floating point number is a quiet NaN (gko::nan()). */
+/**
+ * Checks if a floating point number representation matches the representation
+ * of the quiet NaN with value gko::nan() exactly.
+ */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
 is_nan_exact(const T& value)
@@ -374,7 +377,10 @@ is_nan_exact(const T& value)
 }
 
 
-/** Checks if any component of a complex value is a quiet NaN (gko::nan). */
+/**
+ * Checks if any component of the complex value matches the quiet NaN with
+ * value gko::nan() exactly.
+ */
 template <typename T>
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
 is_nan_exact(const T& value)

From 495a1ebba0a01057b366da29d99fa4b26097a0f5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 11 Jul 2024 11:20:57 +0200
Subject: [PATCH 131/448] Add cmake flag and instantiate only one by default

---
 CMakeLists.txt                             |  1 +
 core/solver/batch_bicgstab_kernels.hpp     | 10 +++++++++-
 core/solver/batch_cg_kernels.hpp           | 10 +++++++++-
 cuda/solver/batch_bicgstab_kernels.cu      |  8 ++++++++
 cuda/solver/batch_cg_kernels.cu            |  8 ++++++++
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp |  8 ++++++++
 dpcpp/solver/batch_cg_kernels.dp.cpp       |  8 ++++++++
 hip/solver/batch_bicgstab_kernels.hip.cpp  |  8 ++++++++
 hip/solver/batch_cg_kernels.hip.cpp        |  8 ++++++++
 include/ginkgo/config.hpp.in               |  4 ++++
 10 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21832c98592..f60500b4cc9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic
 option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON)
 mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS)
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
+option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF)
 option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON)
 option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF)
 option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND})
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 1eed30aba5a..07ecb1bd834 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -6,6 +6,7 @@
 #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
@@ -15,6 +16,13 @@
 #include "core/base/kernel_declaration.hpp"
 
 
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
+constexpr bool bicgstab_no_shared_vecs = false;  // shared-memory vectors allowed
+#else
+constexpr bool bicgstab_no_shared_vecs = true;  // force all vectors to global memory
+#endif
+
+
 namespace gko {
 namespace kernels {
 namespace batch_bicgstab {
@@ -138,7 +146,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
     storage_config sconf{false, 0, num_main_vecs, 0, num_rows};
     // If available shared mem is zero, set all vecs to global.
-    if (rem_shared <= 0) {
+    if (rem_shared <= 0 || bicgstab_no_shared_vecs) {
         set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);
         return sconf;
     }
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index 6fdb595862e..028223886fe 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -6,6 +6,7 @@
 #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_
 
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
@@ -15,6 +16,13 @@
 #include "core/base/kernel_declaration.hpp"
 
 
+#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS
+constexpr bool cg_no_shared_vecs = false;
+#else
+constexpr bool cg_no_shared_vecs = true;
+#endif
+
+
 namespace gko {
 namespace kernels {
 namespace batch_cg {
@@ -126,7 +134,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
     storage_config sconf{false, 0, num_main_vecs, 0, num_rows};
     // If available shared mem is zero, set all vecs to global.
-    if (rem_shared <= 0) {
+    if (rem_shared <= 0 || cg_no_shared_vecs) {
         set_gmem_stride_bytes<align_bytes>(sconf, vec_bytes, prec_storage);
         return sconf;
     }
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 6b3dca28607..bc12fc7efde 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -167,6 +167,9 @@ public:
 
         value_type* const workspace_data = workspace.get_data();
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
         if (sconf.prec_shared) {
@@ -229,6 +232,11 @@ public:
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 0, false>(
+            sconf, logger, prec, mat, b.values, x.values, workspace_data,
+            block_size, shared_size);
+#endif
     }
 
 private:
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 746be0365e7..f09b6c70487 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -165,6 +165,9 @@ public:
 
         value_type* const workspace_data = workspace.get_data();
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
         if (sconf.prec_shared) {
@@ -207,6 +210,11 @@ public:
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 0, false>(
+            sconf, logger, prec, mat, b.values, x.values, workspace_data,
+            block_size, shared_size);
+#endif
     }
 
 private:
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 344e4af56b9..3b6d5d1c5df 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -159,6 +159,9 @@ class kernel_caller {
         ValueType* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
         if (num_rows <= 32 && n_shared_total == 10) {
@@ -230,6 +233,11 @@ class kernel_caller {
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
+                                             x.values, workspace_data,
+                                             group_size, shared_size);
+#endif
     }
 
 private:
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 0787afa6fd3..36fbe0dc269 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -158,6 +158,9 @@ class kernel_caller {
         ValueType* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
         if (num_rows <= 32 && n_shared_total == 6) {
@@ -205,6 +208,11 @@ class kernel_caller {
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
+                                             x.values, workspace_data,
+                                             group_size, shared_size);
+#endif
     }
 
 private:
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 95a49953b3e..54b63983388 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -149,6 +149,9 @@ class kernel_caller {
 
         value_type* const workspace_data = workspace.get_data();
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
         if (sconf.prec_shared) {
@@ -211,6 +214,11 @@ class kernel_caller {
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 0, false>(
+            sconf, logger, prec, mat, b.values, x.values, workspace_data,
+            block_size, shared_size);
+#endif
     }
 
 private:
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 6102749b988..290fd72b9f7 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -149,6 +149,9 @@ class kernel_caller {
 
         value_type* const workspace_data = workspace.get_data();
 
+        // Only instantiate when full optimizations has been enabled. Otherwise,
+        // just use the default one with no shared memory.
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
         if (sconf.prec_shared) {
@@ -191,6 +194,11 @@ class kernel_caller {
                 GKO_NOT_IMPLEMENTED;
             }
         }
+#else
+        launch_apply_kernel<StopType, 0, false>(
+            sconf, logger, prec, mat, b.values, x.values, workspace_data,
+            block_size, shared_size);
+#endif
     }
 
 private:
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 329918399d6..4eb3106633f 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -31,6 +31,10 @@
 #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS
 
 
+/* Should we use all optimizations for batched solvers? */
+#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS
+
+
 /* Should we compile Ginkgo specifically to tune values? */
 #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING
 

From 1c6bc7be91e0cf3065ff43a1262afa6c2d202e0b Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 23 Jul 2024 14:26:32 +0200
Subject: [PATCH 132/448] [cuda,hip,dpcpp] disable optimized kernels

---
 CMakeLists.txt                             |   1 -
 core/solver/batch_bicgstab_kernels.hpp     |   5 +-
 core/solver/batch_cg_kernels.hpp           |   5 +-
 cuda/solver/batch_bicgstab_kernels.cu      | 123 +++++++++---------
 cuda/solver/batch_cg_kernels.cu            |  81 ++++++------
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 139 ++++++++++-----------
 dpcpp/solver/batch_cg_kernels.dp.cpp       |  90 +++++++------
 hip/solver/batch_bicgstab_kernels.hip.cpp  | 120 +++++++++---------
 hip/solver/batch_cg_kernels.hip.cpp        |  80 ++++++------
 include/ginkgo/config.hpp.in               |   4 -
 10 files changed, 301 insertions(+), 347 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f60500b4cc9..21832c98592 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,7 +55,6 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic
 option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON)
 mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS)
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
-option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF)
 option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON)
 option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF)
 option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND})
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 07ecb1bd834..5bab0e43b26 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -16,11 +16,8 @@
 #include "core/base/kernel_declaration.hpp"
 
 
-#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS
-constexpr bool bicgstab_no_shared_vecs = false;
-#else
+// TODO: update when splitting kernels
 constexpr bool bicgstab_no_shared_vecs = true;
-#endif
 
 
 namespace gko {
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index 028223886fe..031b20b2a61 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -16,11 +16,8 @@
 #include "core/base/kernel_declaration.hpp"
 
 
-#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS
-constexpr bool cg_no_shared_vecs = false;
-#else
+// TODO: update when splitting compilation
 constexpr bool cg_no_shared_vecs = true;
-#endif
 
 
 namespace gko {
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index bc12fc7efde..54f489304a7 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -167,76 +167,69 @@ public:
 
         value_type* const workspace_data = workspace.get_data();
 
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
+        // TODO: split compilation
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
-        if (sconf.prec_shared) {
-            launch_apply_kernel<StopType, 9, true>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                block_size, shared_size);
-        } else {
-            switch (sconf.n_shared) {
-            case 0:
-                launch_apply_kernel<StopType, 0, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 1, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 2, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 3, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 4, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 5, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 6:
-                launch_apply_kernel<StopType, 6, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 7:
-                launch_apply_kernel<StopType, 7, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 8:
-                launch_apply_kernel<StopType, 8, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 9:
-                launch_apply_kernel<StopType, 9, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (sconf.prec_shared) {
+        //     launch_apply_kernel<StopType, 9, true>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         block_size, shared_size);
+        // } else {
+        //     switch (sconf.n_shared) {
+        // case 0:
         launch_apply_kernel<StopType, 0, false>(
             sconf, logger, prec, mat, b.values, x.values, workspace_data,
             block_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 1, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 2, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 3, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 4, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 5, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 6:
+        //         launch_apply_kernel<StopType, 6, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 7:
+        //         launch_apply_kernel<StopType, 7, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 8:
+        //         launch_apply_kernel<StopType, 8, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 9:
+        //         launch_apply_kernel<StopType, 9, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index f09b6c70487..b681bd13ce3 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -165,56 +165,51 @@ public:
 
         value_type* const workspace_data = workspace.get_data();
 
+        // TODO: split compilation
         // Only instantiate when full optimizations has been enabled. Otherwise,
         // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
-        if (sconf.prec_shared) {
-            launch_apply_kernel<StopType, 5, true>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                block_size, shared_size);
-        } else {
-            switch (sconf.n_shared) {
-            case 0:
-                launch_apply_kernel<StopType, 0, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 1, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 2, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 3, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 4, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 5, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (sconf.prec_shared) {
+        //     launch_apply_kernel<StopType, 5, true>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         block_size, shared_size);
+        // } else {
+        //     switch (sconf.n_shared) {
+        //     case 0:
         launch_apply_kernel<StopType, 0, false>(
             sconf, logger, prec, mat, b.values, x.values, workspace_data,
             block_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 1, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 2, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 3, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 4, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 5, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 3b6d5d1c5df..bb84283b49f 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -159,85 +159,80 @@ class kernel_caller {
         ValueType* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
+        // TODO: split compilation
         // Only instantiate when full optimizations has been enabled. Otherwise,
         // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
-        if (num_rows <= 32 && n_shared_total == 10) {
-            launch_apply_kernel<StopType, 32, 10>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                group_size, shared_size);
-        } else if (num_rows <= 256 && n_shared_total == 10) {
-            launch_apply_kernel<StopType, 32, 10>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                group_size, shared_size);
-        } else {
-            switch (n_shared_total) {
-            case 0:
-                launch_apply_kernel<StopType, 32, 0>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 32, 1>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 32, 2>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 32, 3>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 32, 4>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 32, 5>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 6:
-                launch_apply_kernel<StopType, 32, 6>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 7:
-                launch_apply_kernel<StopType, 32, 7>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 8:
-                launch_apply_kernel<StopType, 32, 8>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 9:
-                launch_apply_kernel<StopType, 32, 9>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 10:
-                launch_apply_kernel<StopType, 32, 10>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (num_rows <= 32 && n_shared_total == 10) {
+        //     launch_apply_kernel<StopType, 32, 10>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         group_size, shared_size);
+        // } else if (num_rows <= 256 && n_shared_total == 10) {
+        //     launch_apply_kernel<StopType, 32, 10>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         group_size, shared_size);
+        // } else {
+        //     switch (n_shared_total) {
+        //     case 0:
         launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
                                              x.values, workspace_data,
                                              group_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 32, 1>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 32, 2>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 32, 3>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 32, 4>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 32, 5>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 6:
+        //         launch_apply_kernel<StopType, 32, 6>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 7:
+        //         launch_apply_kernel<StopType, 32, 7>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 8:
+        //         launch_apply_kernel<StopType, 32, 8>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 9:
+        //         launch_apply_kernel<StopType, 32, 9>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 10:
+        //         launch_apply_kernel<StopType, 32, 10>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 36fbe0dc269..61591f9efb6 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -160,59 +160,53 @@ class kernel_caller {
 
         // Only instantiate when full optimizations has been enabled. Otherwise,
         // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
-        if (num_rows <= 32 && n_shared_total == 6) {
-            launch_apply_kernel<StopType, 16, 6>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                group_size, shared_size);
-        } else {
-            switch (n_shared_total) {
-            case 0:
-                launch_apply_kernel<StopType, 32, 0>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 32, 1>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 32, 2>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 32, 3>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 32, 4>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 32, 5>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            case 6:
-                launch_apply_kernel<StopType, 32, 6>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, group_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (num_rows <= 32 && n_shared_total == 6) {
+        //     launch_apply_kernel<StopType, 16, 6>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         group_size, shared_size);
+        // } else {
+        //     switch (n_shared_total) {
+        //     case 0:
         launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
                                              x.values, workspace_data,
                                              group_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 32, 1>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 32, 2>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 32, 3>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 32, 4>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 32, 5>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     case 6:
+        //         launch_apply_kernel<StopType, 32, 6>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, group_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 54b63983388..ca49fa5eb9c 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -151,74 +151,68 @@ class kernel_caller {
 
         // Only instantiate when full optimizations has been enabled. Otherwise,
         // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
-        if (sconf.prec_shared) {
-            launch_apply_kernel<StopType, 9, true>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                block_size, shared_size);
-        } else {
-            switch (sconf.n_shared) {
-            case 0:
-                launch_apply_kernel<StopType, 0, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 1, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 2, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 3, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 4, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 5, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 6:
-                launch_apply_kernel<StopType, 6, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 7:
-                launch_apply_kernel<StopType, 7, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 8:
-                launch_apply_kernel<StopType, 8, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 9:
-                launch_apply_kernel<StopType, 9, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (sconf.prec_shared) {
+        //     launch_apply_kernel<StopType, 9, true>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         block_size, shared_size);
+        // } else {
+        //     switch (sconf.n_shared) {
+        //     case 0:
         launch_apply_kernel<StopType, 0, false>(
             sconf, logger, prec, mat, b.values, x.values, workspace_data,
             block_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 1, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 2, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 3, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 4, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 5, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 6:
+        //         launch_apply_kernel<StopType, 6, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 7:
+        //         launch_apply_kernel<StopType, 7, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 8:
+        //         launch_apply_kernel<StopType, 8, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 9:
+        //         launch_apply_kernel<StopType, 9, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 290fd72b9f7..3a1642edfea 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -151,54 +151,48 @@ class kernel_caller {
 
         // Only instantiate when full optimizations has been enabled. Otherwise,
         // just use the default one with no shared memory.
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
-        if (sconf.prec_shared) {
-            launch_apply_kernel<StopType, 5, true>(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                block_size, shared_size);
-        } else {
-            switch (sconf.n_shared) {
-            case 0:
-                launch_apply_kernel<StopType, 0, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel<StopType, 1, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel<StopType, 2, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel<StopType, 3, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel<StopType, 4, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel<StopType, 5, false>(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-#else
+        // if (sconf.prec_shared) {
+        //     launch_apply_kernel<StopType, 5, true>(
+        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
+        //         block_size, shared_size);
+        // } else {
+        //     switch (sconf.n_shared) {
+        //     case 0:
         launch_apply_kernel<StopType, 0, false>(
             sconf, logger, prec, mat, b.values, x.values, workspace_data,
             block_size, shared_size);
-#endif
+        //         break;
+        //     case 1:
+        //         launch_apply_kernel<StopType, 1, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 2:
+        //         launch_apply_kernel<StopType, 2, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 3:
+        //         launch_apply_kernel<StopType, 3, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 4:
+        //         launch_apply_kernel<StopType, 4, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     case 5:
+        //         launch_apply_kernel<StopType, 5, false>(
+        //             sconf, logger, prec, mat, b.values, x.values,
+        //             workspace_data, block_size, shared_size);
+        //         break;
+        //     default:
+        //         GKO_NOT_IMPLEMENTED;
+        //     }
+        // }
     }
 
 private:
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 4eb3106633f..329918399d6 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -31,10 +31,6 @@
 #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS
 
 
-/* Should we use all optimizations for batched solvers? */
-#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS
-
-
 /* Should we compile Ginkgo specifically to tune values? */
 #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING
 

From 26b362b39f2901a4951b5a42467640052cbc304d Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 5 Aug 2024 13:02:24 +0200
Subject: [PATCH 133/448] [review] review updates

---
 core/solver/batch_bicgstab_kernels.hpp | 1 -
 core/solver/batch_cg_kernels.hpp       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 5bab0e43b26..615ed472597 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -6,7 +6,6 @@
 #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
 
-#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index 031b20b2a61..b21a2c07d3e 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -6,7 +6,6 @@
 #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_
 
 
-#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>

From e1eedfef4b5b7d8b43cfe9363ee10478080fa11f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 13 Aug 2024 13:08:19 +0200
Subject: [PATCH 134/448] use smaller block size on cuda

---
 cuda/solver/batch_bicgstab_kernels.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 54f489304a7..3c7fe50709c 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -144,10 +144,11 @@ public:
         const int shmem_per_blk =
             get_max_dynamic_shared_memory<StopType, PrecType, LogType,
                                           BatchMatrixType, value_type>(exec_);
-        const int block_size =
-            get_num_threads_per_block<StopType, PrecType, LogType,
-                                      BatchMatrixType, value_type>(
-                exec_, mat.num_rows);
+        // TODO: block size hard-coded to 256; computed value disabled below
+        const int block_size = 256;
+        // get_num_threads_per_block<StopType, PrecType, LogType,
+        //                           BatchMatrixType, value_type>(
+        //     exec_, mat.num_rows);
         GKO_ASSERT(block_size >= 2 * config::warp_size);
 
         const size_t prec_size = PrecType::dynamic_work_size(

From c38735c51d7d70ab0228aa13f588eb1a7048230d Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 16 Aug 2024 12:43:44 +0200
Subject: [PATCH 135/448] [cmake] add rocthrust through cmake

---
 cmake/GinkgoConfig.cmake.in | 1 +
 cmake/hip.cmake             | 1 +
 hip/CMakeLists.txt          | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index 23b1d25adc1..1f12251f93d 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -175,6 +175,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
     find_dependency(hiprand)
     find_dependency(hipsparse)
     find_dependency(rocrand)
+    find_dependency(rocthrust)
     set_and_check(ROCTRACER_PATH "@ROCTRACER_PATH@")
     find_dependency(ROCTX)
 endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index c94117242eb..bd834c3ebde 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -123,6 +123,7 @@ find_package(hiprand REQUIRED)
 find_package(hipsparse REQUIRED)
 # At the moment, for hiprand to work also rocrand is required.
 find_package(rocrand REQUIRED)
+find_package(rocthrust REQUIRED)
 find_package(ROCTX)
 
 if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 30e675509d5..46b2d7bd19b 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -65,7 +65,7 @@ target_include_directories(ginkgo_hip
 target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
+target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand roc::rocthrust)
 if (hipfft_FOUND)
     target_link_libraries(ginkgo_hip PRIVATE hip::hipfft)
 endif()

From 65980d34c4df73e3c166a1596fae12a92a3690a1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 16 Aug 2024 00:31:07 +0200
Subject: [PATCH 136/448] use tbb from onemkl, and add the path after
 installing

---
 .github/workflows/intel.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 7a1e97a80e8..0aa435dfee3 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -33,20 +33,20 @@ jobs:
     - name: configure
       run: |
         source /etc/profile
-        module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl intel-oneapi-tbb cmake
+        module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake
         mkdir build
         cd build
         cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-model=precise" -DCMAKE_CXX_COMPILER=${{ matrix.config.compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_MPI=OFF -DGINKGO_DPCPP_SINGLE_MODE=ON
         make -j8
         ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure
-
+ 
     - name: install
       run: |
         source /etc/profile
-        module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl intel-oneapi-tbb cmake
+        module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake
         cd build
         SYCL_DEVICE_FILTER=level_zero:gpu make install
         export GINKGO_PATH="$(pwd)/install_ginkgo/lib"
-        export LIBRARY_PATH=${GINKGO_PATH}:$LIBRARY_PATH
-        export LD_LIBRARY_PATH=${GINKGO_PATH}:$LD_LIBRARY_PATH
+        export LIBRARY_PATH=${ICL_INTEL_TBB_ROOT}/lib64:${GINKGO_PATH}:$LIBRARY_PATH
+        export LD_LIBRARY_PATH=${ICL_INTEL_TBB_ROOT}/lib64:${GINKGO_PATH}:$LD_LIBRARY_PATH
         SYCL_DEVICE_FILTER=level_zero:gpu make test_install

From 06af951d63d6ae66fb7c801241d8a83f52be5dea Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 6 Aug 2024 11:24:38 +0200
Subject: [PATCH 137/448] e uses next level precision, but the coarsest solver
 uses the last level precision. Thus, we cannot cast e to current level
 precision unless it is the last level

---
 core/solver/multigrid.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 6a8b5ee151b..35ad7b5d1fe 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -486,7 +486,7 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level,
 
     auto r = r_list.at(level);
     auto g = g_list.at(level);
-    auto e = as<VectorType>(e_list.at(level));
+    auto e = e_list.at(level);
     // get mg_level
     auto mg_level = multigrid->get_mg_level_list().at(level);
     // get the pre_smoother
@@ -537,7 +537,7 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level,
     // next level
     if (level + 1 == total_level) {
         // the coarsest solver use the last level valuetype
-        e->fill(zero<value_type>());
+        as<VectorType>(e)->fill(zero<value_type>());
     }
     auto next_level_matrix =
         (level + 1 < total_level)

From 2d3d622f83128303e33fd6675ebb3e46e6a082c7 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 23 Jul 2024 13:34:59 +0200
Subject: [PATCH 138/448] unify cuda/hip batch_mvec

---
 common/cuda_hip/CMakeLists.txt                |   1 +
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  67 +++-
 .../base/batch_multi_vector_kernels.hpp       | 326 ++++++++++++++++++
 .../base/batch_multi_vector_kernels.hpp.inc   |  43 +--
 cuda/CMakeLists.txt                           |   1 -
 cuda/base/batch_multi_vector_kernels.cu       |  56 ---
 hip/CMakeLists.txt                            |   1 -
 hip/base/batch_multi_vector_kernels.hip.cpp   |  56 ---
 8 files changed, 390 insertions(+), 161 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernel_launcher.hpp.inc => batch_multi_vector_kernels.cpp} (67%)
 create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 463abfd9284..15d3a82419e 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 67%
rename from common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 19b5b74a547..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -11,16 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -42,12 +71,12 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
 }
@@ -67,8 +96,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
@@ -87,8 +116,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
@@ -105,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    compute_norm2_kernel<<<num_blocks, default_block_size, 0,
-                           exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
+                                                 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -121,8 +151,15 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, result_ub);
+    batch_single_kernels::
+        copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            x_ub, result_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
new file mode 100644
index 00000000000..36aa69d7d99
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -0,0 +1,326 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+namespace batch_single_kernels {
+
+
+constexpr auto default_block_size = 256;
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(row, col, alpha.stride)] *
+            x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename Group, typename ValueType, typename Mapping>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
+
+
+}  // namespace batch_single_kernels
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
index 9b6301674be..7af3c84303f 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -20,8 +20,7 @@ __device__ __forceinline__ void scale(
 
 
 template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
     const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
 {
@@ -52,20 +51,10 @@ __device__ __forceinline__ void add_scaled(
 
 
 template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                        alpha,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                        x,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<ValueType>
-                                                        y,
-                                                Mapping map)
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -145,7 +134,7 @@ __device__ __forceinline__ void compute_gen_dot_product(
 
 template <typename ValueType, typename Mapping>
 __global__
-__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> x,
     const gko::batch::multi_vector::uniform_batch<const ValueType> y,
     const gko::batch::multi_vector::uniform_batch<ValueType> result,
@@ -232,19 +221,10 @@ __device__ __forceinline__ void compute_norm2(
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               const ValueType>
-                                                               x,
-                                                   const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               remove_complex<
-                                                                   ValueType>>
-                                                               result)
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -287,8 +267,7 @@ __device__ __forceinline__ void copy(
 
 
 template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> src,
     const gko::batch::multi_vector::uniform_batch<ValueType> dst)
 {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index d4a94eda802..3631a65f48d 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,7 +7,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 3dad5ba94f1..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 46b2d7bd19b..84bba295120 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,7 +5,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 701f4655a9a..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From dd66c6702fb6e8010042f9795c14cf43fe3f5244 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 11:03:12 +0200
Subject: [PATCH 139/448] [cuda,hip] update namespaces and includes

---
 .../base/batch_multi_vector_kernels.cpp       |  37 +--
 .../base/batch_multi_vector_kernels.hpp       |   2 -
 .../base/batch_multi_vector_kernels.hpp.inc   | 280 ------------------
 .../solver/batch_bicgstab_kernels.hpp.inc     |  41 ++-
 .../cuda_hip/solver/batch_cg_kernels.hpp.inc  |  23 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   2 +-
 cuda/solver/batch_cg_kernels.cu               |   2 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   2 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   2 +-
 9 files changed, 63 insertions(+), 328 deletions(-)
 delete mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 17f65487464..76565a83f80 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,9 +96,10 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -116,9 +117,10 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -134,9 +136,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
-                                                 0, exec->get_stream()>>>(
-        x_ub, res_ub);
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -151,7 +152,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    batch_single_kernels::
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 36aa69d7d99..bb3aac67b55 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -34,7 +34,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace batch_multi_vector {
 namespace batch_single_kernels {
 
 
@@ -320,7 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel(
 
 
 }  // namespace batch_single_kernels
-}  // namespace batch_multi_vector
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
deleted file mode 100644
index 7af3c84303f..00000000000
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void scale(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        x.values[row * x.stride + col] =
-            alpha.values[map(row, col, alpha.stride)] *
-            x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void scale_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        scale(alpha_b, x_b, map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void add_scaled(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        y.values[row * y.stride + col] +=
-            alpha.values[map(col)] * x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        add_scaled(alpha_b, x_b, y_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
-                                                            const int num_rows,
-                                                            const ValueType* x,
-                                                            const ValueType* y,
-                                                            ValueType& result)
-
-{
-    ValueType val = zero<ValueType>();
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += conj(x[r]) * y[r];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = val;
-    }
-}
-
-
-template <typename Group, typename ValueType, typename Mapping>
-__device__ __forceinline__ void gen_one_dot(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Group subgroup, Mapping conj_map)
-{
-    ValueType val = zero<ValueType>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += conj_map(x.values[r * x.stride + rhs_index]) *
-               y.values[r * y.stride + rhs_index];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void compute_gen_dot_product(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Mapping conj_map)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
-    const gko::batch::multi_vector::uniform_batch<ValueType> result,
-    Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_gen_dot_product(x_b, y_b, r_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_norm2(
-    Group subgroup, const int num_rows, const ValueType* x,
-    remove_complex<ValueType>& result)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += squared_norm(x[r]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = sqrt(val);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void one_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result,
-    Group subgroup)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += squared_norm(x.values[r * x.stride + rhs_index]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = sqrt(val);
-    }
-}
-
-
-/**
- * Computes the 2-norms of some column vectors in global or shared memory.
- *
- * @param x  A row-major multivector with nrhs columns.
- * @param result  Holds norm value for each vector in x.
- */
-template <typename ValueType>
-__device__ __forceinline__ void compute_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        one_norm2(x, rhs_index, result, subgroup);
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
-        result)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_norm2(x_b, r_b);
-    }
-}
-
-
-template <typename ValueType>
-__device__ __forceinline__ void single_rhs_copy(const int num_rows,
-                                                const ValueType* in,
-                                                ValueType* out)
-{
-    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
-        out[iz] = in[iz];
-    }
-}
-
-
-/**
- * Copies the values of one multi-vector into another.
- *
- * Note that the output multi-vector should already have memory allocated
- * and stride set.
- */
-template <typename ValueType>
-__device__ __forceinline__ void copy(
-    const gko::batch::multi_vector::batch_item<const ValueType>& in,
-    const gko::batch::multi_vector::batch_item<ValueType>& out)
-{
-    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
-         iz += blockDim.x) {
-        const int i = iz / in.num_rhs;
-        const int j = iz % in.num_rhs;
-        out.values[i * out.stride + j] = in.values[i * in.stride + j];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void copy_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
-    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
-        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
-        copy(src_b, dst_b);
-    }
-}
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index f71c8c40c3e..c2a53b2e518 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -32,10 +32,14 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry,
+                                     res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norm);
     }
     __syncthreads();
 
@@ -70,8 +74,9 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
-                                    v_shared_entry, alpha);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                        v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -99,11 +104,13 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    s_shared_entry, omega);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    t_shared_entry, temp);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        t_shared_entry, temp);
     }
 
     __syncthreads();
@@ -271,8 +278,9 @@ __global__ void apply_kernel(
 
             // rho_new =  < r_hat , r > = (r_hat)' * (r)
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
-                                            rho_new_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh,
+                                                r_sh, rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -301,8 +309,9 @@ __global__ void apply_kernel(
 
             // an estimate of residual norms
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
-                                         norms_res_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                             norms_res_sh[0]);
             }
             __syncthreads();
 
@@ -333,8 +342,9 @@ __global__ void apply_kernel(
             __syncthreads();
 
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
-                                         norms_res_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                             norms_res_sh[0]);
             }
             //__syncthreads();
 
@@ -347,7 +357,8 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index ffee501b58c..c95a6b1cf05 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -32,12 +32,14 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                 rhs_norms_sh);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
-                                    z_shared_entry, rho_old_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                        z_shared_entry, rho_old_shared_entry);
     }
 
     // p = z
@@ -69,8 +71,9 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
-                                    Ap_shared_entry, alpha_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                        Ap_shared_entry, alpha_shared_entry);
     }
     __syncthreads();
 
@@ -202,8 +205,9 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
             if (threadIdx.x / config::warp_size == 0) {
                 // rho_new =  (r)' * (z)
-                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
-                                            rho_new_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                                rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -222,7 +226,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 3c7fe50709c..4d3deb742fe 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -43,7 +44,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index b681bd13ce3..21c3e3d43c4 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index ca49fa5eb9c..1c1be8b21f7 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 3a1642edfea..c860286c17c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"

From ae1b24b8617b08fc8ab0a847f93cdd1b1e95a981 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 11:32:50 +0200
Subject: [PATCH 140/448] [ref, omp] move kernels to headers

---
 omp/base/batch_multi_vector_kernels.cpp       | 30 +++++++-------
 omp/solver/batch_bicgstab_kernels.cpp         |  2 +-
 omp/solver/batch_cg_kernels.cpp               |  2 +-
 reference/base/batch_multi_vector_kernels.cpp | 33 +++++++--------
 ...hpp.inc => batch_multi_vector_kernels.hpp} | 20 +++++++++
 reference/solver/batch_bicgstab_kernels.cpp   |  2 +-
 .../solver/batch_bicgstab_kernels.hpp.inc     | 41 ++++++++++++-------
 reference/solver/batch_cg_kernels.cpp         |  2 +-
 reference/solver/batch_cg_kernels.hpp.inc     | 23 +++++++----
 9 files changed, 96 insertions(+), 59 deletions(-)
 rename reference/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (90%)

diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 395bf96cc7a..8a947107479 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -10,24 +10,18 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace omp {
-/**
- * @brief The batch::MultiVector matrix format namespace.
- * @ref batch::MultiVector
- * @ingroup batch_multi_vector
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_multi_vector {
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -39,7 +33,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        scale_kernel(alpha_b, x_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -61,7 +55,8 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        add_scaled_kernel(alpha_b, x_b, y_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
+                                                                      x_b, y_b);
     }
 }
 
@@ -83,7 +78,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        compute_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
+            x_b, y_b, res_b);
     }
 }
 
@@ -105,7 +101,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -124,7 +121,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        compute_norm2_kernel(x_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
+                                                                         res_b);
     }
 }
 
@@ -143,7 +141,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = gko::batch::extract_batch_item(result_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        copy_kernel(x_b, result_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
@@ -151,6 +149,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
-}  // namespace omp
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index 81df9c45e51..c245f284106 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -28,7 +29,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 51c794ab597..55d6ee29321 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -28,7 +29,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index b0d20a6b826..c05398226f0 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -10,24 +10,21 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
+
+#define GKO_DEVICE_NAMESPACE reference
+
+
 #include "core/base/batch_struct.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace reference {
-/**
- * @brief The batch::MultiVector matrix format namespace.
- * @ref batch::MultiVector
- * @ingroup batch_multi_vector
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_multi_vector {
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* alpha,
@@ -38,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        scale_kernel(alpha_b, x_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -59,7 +56,8 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        add_scaled_kernel(alpha_b, x_b, y_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
+                                                                      x_b, y_b);
     }
 }
 
@@ -80,7 +78,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        compute_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
+            x_b, y_b, res_b);
     }
 }
 
@@ -101,7 +100,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -119,7 +119,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        compute_norm2_kernel(x_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
+                                                                         res_b);
     }
 }
 
@@ -137,7 +138,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = batch::extract_batch_item(result_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        copy_kernel(x_b, result_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
@@ -145,6 +146,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
-}  // namespace reference
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp
similarity index 90%
rename from reference/base/batch_multi_vector_kernels.hpp.inc
rename to reference/base/batch_multi_vector_kernels.hpp
index 24e59664b74..88f531f29cc 100644
--- a/reference/base/batch_multi_vector_kernels.hpp.inc
+++ b/reference/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "reference/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
 inline void scale_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -129,3 +143,9 @@ inline void copy_kernel(
         out.values[i * out.stride + j] = in.values[i * in.stride + j];
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 97de157fb90..e68caffa936 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -5,6 +5,7 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -26,7 +27,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc
index b61db3669ef..1f8537ab66d 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp.inc
+++ b/reference/solver/batch_bicgstab_kernels.hpp.inc
@@ -25,17 +25,20 @@ inline void initialize(
     alpha_entry.values[0] = one<ValueType>();
 
     // Compute norms of rhs
-    compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
 
     // r = b
-    copy_kernel(b_entry, r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        b_entry, r_entry);
 
     // r = b - A*x
     advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
                           gko::batch::to_const(x_entry),
                           static_cast<ValueType>(1.0), r_entry);
-    compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                    res_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                        res_norms_entry);
 
     for (int r = 0; r < p_entry.num_rows; r++) {
         r_hat_entry.values[r * r_hat_entry.stride] =
@@ -75,7 +78,9 @@ inline void compute_alpha(
     const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry)
 {
-    compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry, alpha_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry,
+                                              alpha_entry);
     alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0];
 }
 
@@ -102,8 +107,10 @@ inline void compute_omega(
     const gko::batch::multi_vector::batch_item<ValueType>& temp_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& omega_entry)
 {
-    compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
-    compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
     omega_entry.values[0] /= temp_entry.values[0];
 }
 
@@ -246,9 +253,10 @@ inline void batch_entry_bicgstab_impl(
         }
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
-        compute_dot_product_kernel<ValueType>(gko::batch::to_const(r_hat_entry),
-                                              gko::batch::to_const(r_entry),
-                                              rho_new_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_dot_product_kernel<ValueType>(
+                gko::batch::to_const(r_hat_entry),
+                gko::batch::to_const(r_entry), rho_new_entry);
 
         // beta = (rho_new / rho_old)*(alpha / omega)
         // p = r + beta*(p - omega * v)
@@ -277,8 +285,9 @@ inline void batch_entry_bicgstab_impl(
                  gko::batch::to_const(v_entry), s_entry);
 
         // an estimate of residual norms
-        compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
-                                        res_norms_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
+                                            res_norms_entry);
 
         if (stop.check_converged(res_norms_entry.values)) {
             // update x for the systems
@@ -310,11 +319,13 @@ inline void batch_entry_bicgstab_impl(
                        gko::batch::to_const(s_entry),
                        gko::batch::to_const(t_entry), x_entry, r_entry);
 
-        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                        res_norms_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                            res_norms_entry);
 
         // rho_old = rho_new
-        copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            gko::batch::to_const(rho_new_entry), rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 290fbc3718b..785a7a868a2 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -5,6 +5,7 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -26,7 +27,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc
index b3df5ba97fd..ca88940cd69 100644
--- a/reference/solver/batch_cg_kernels.hpp.inc
+++ b/reference/solver/batch_cg_kernels.hpp.inc
@@ -26,10 +26,12 @@ inline void initialize(
     }
 
     // Compute norms of rhs
-    compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
 
     // r = b
-    copy_kernel(b_entry, r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        b_entry, r_entry);
 
     // r = b - A*x
     advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
@@ -46,7 +48,8 @@ inline void update_p(
     const gko::batch::multi_vector::batch_item<ValueType>& p_entry)
 {
     if (rho_old_entry.values[0] == zero<ValueType>()) {
-        copy_kernel(z_entry, p_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            z_entry, p_entry);
         return;
     }
     const ValueType beta = rho_new_entry.values[0] / rho_old_entry.values[0];
@@ -67,7 +70,9 @@ inline void update_x_and_r(
     const gko::batch::multi_vector::batch_item<ValueType>& x_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& r_entry)
 {
-    compute_conj_dot_product_kernel<ValueType>(p_entry, Ap_entry, alpha_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_conj_dot_product_kernel<ValueType>(p_entry, Ap_entry,
+                                                   alpha_entry);
 
     const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0];
     for (int row = 0; row < r_entry.num_rows; row++) {
@@ -154,9 +159,10 @@ inline void batch_entry_cg_impl(
         prec.apply(gko::batch::to_const(r_entry), z_entry);
 
         // rho_new =  < r , z > = (r)' * (z)
-        compute_conj_dot_product_kernel<ValueType>(
-            gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
-            rho_new_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel<ValueType>(
+                gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
+                rho_new_entry);
         ++iter;
         // use implicit residual norms
         res_norms_entry.values[0] = sqrt(abs(rho_new_entry.values[0]));
@@ -185,7 +191,8 @@ inline void batch_entry_cg_impl(
             gko::batch::to_const(Ap_entry), alpha_entry, x_entry, r_entry);
 
         // rho_old = rho_new
-        copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            gko::batch::to_const(rho_new_entry), rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);

From a04305292a9cec7c81bf3598dce253f731b3c4c6 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 15:14:26 +0200
Subject: [PATCH 141/448] [kernels] remove GKO_DEVICE_NAMESPACE

---
 .../base/batch_multi_vector_kernels.cpp       | 37 +++++++++----------
 omp/base/batch_multi_vector_kernels.cpp       | 16 +++-----
 reference/base/batch_multi_vector_kernels.cpp | 16 +++-----
 3 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 76565a83f80..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
+        batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
+        batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,10 +96,9 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -117,10 +116,9 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -136,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
+                                                 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -152,7 +151,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+    batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 8a947107479..f740e3c32f0 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -33,7 +33,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
+        batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -55,8 +55,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
-                                                                      x_b, y_b);
+        batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b);
     }
 }
 
@@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
-            x_b, y_b, res_b);
+        batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -101,8 +99,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -121,8 +118,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
-                                                                         res_b);
+        batch_single_kernels::compute_norm2_kernel(x_b, res_b);
     }
 }
 
@@ -141,7 +137,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = gko::batch::extract_batch_item(result_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
+        batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index c05398226f0..f5e1c653054 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -35,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
+        batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -56,8 +56,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
-                                                                      x_b, y_b);
+        batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b);
     }
 }
 
@@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
-            x_b, y_b, res_b);
+        batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -100,8 +98,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -119,8 +116,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
-                                                                         res_b);
+        batch_single_kernels::compute_norm2_kernel(x_b, res_b);
     }
 }
 
@@ -138,7 +134,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = batch::extract_batch_item(result_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
+        batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 

From 006943b79b655bc962d9ade087cf0c172271a400 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 15:31:01 +0200
Subject: [PATCH 142/448] [dpcpp] move to proper headers

---
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  | 64 +++++++++----------
 ...hpp.inc => batch_multi_vector_kernels.hpp} | 29 +++++++++
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |  2 +-
 dpcpp/solver/batch_bicgstab_kernels.hpp.inc   | 43 ++++++++-----
 dpcpp/solver/batch_cg_kernels.dp.cpp          |  2 +-
 dpcpp/solver/batch_cg_kernels.hpp.inc         | 25 +++++---
 6 files changed, 102 insertions(+), 63 deletions(-)
 rename dpcpp/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (92%)

diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 8f607725bc8..0d2662bdccd 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -15,6 +15,7 @@
 
 #include "core/base/batch_struct.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -29,17 +30,9 @@
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The MultiVector matrix format namespace.
- * @ref MultiVector
- * @ingroup batch_multi_vector
- */
 namespace batch_multi_vector {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -71,7 +64,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(
+                    batch_single_kernels::scale_kernel(
                         alpha_b, x_b, item_ct1,
                         [](int row, int col, int stride) { return 0; });
                 });
@@ -85,10 +78,11 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(alpha_b, x_b, item_ct1,
-                                 [](int row, int col, int stride) {
-                                     return row * stride + col;
-                                 });
+                    batch_single_kernels::scale_kernel(
+                        alpha_b, x_b, item_ct1,
+                        [](int row, int col, int stride) {
+                            return row * stride + col;
+                        });
                 });
         });
     } else {
@@ -100,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(
+                    batch_single_kernels::scale_kernel(
                         alpha_b, x_b, item_ct1,
                         [](int row, int col, int stride) { return col; });
                 });
@@ -144,8 +138,9 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
-                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
-                                      [](auto col) { return 0; });
+                    batch_single_kernels::add_scaled_kernel(
+                        alpha_b, x_b, y_b, item_ct1,
+                        [](auto col) { return 0; });
                 });
         });
     } else {
@@ -158,8 +153,9 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
-                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
-                                      [](auto col) { return col; });
+                    batch_single_kernels::add_scaled_kernel(
+                        alpha_b, x_b, y_b, item_ct1,
+                        [](auto col) { return col; });
                 });
         });
     }
@@ -206,7 +202,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(y_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        single_rhs_compute_conj_dot_sg(
+                        batch_single_kernels::single_rhs_compute_conj_dot_sg(
                             x_b.num_rows, x_b.values, y_b.values,
                             res_b.values[0], item_ct1);
                     });
@@ -226,7 +222,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(y_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        compute_gen_dot_product_kernel(
+                        batch_single_kernels::compute_gen_dot_product_kernel(
                             x_b, y_b, res_b, item_ct1,
                             [](auto val) { return val; });
                     });
@@ -272,7 +268,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
                     const auto res_b =
                         batch::extract_batch_item(res_ub, group_id);
-                    compute_gen_dot_product_kernel(
+                    batch_single_kernels::compute_gen_dot_product_kernel(
                         x_b, y_b, res_b, item_ct1,
                         [](auto val) { return conj(val); });
                 });
@@ -308,17 +304,16 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
         exec->get_queue()->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl_nd_range(grid, block),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
-                        auto group = item_ct1.get_group();
-                        auto group_id = group.get_group_linear_id();
-                        const auto x_b =
-                            batch::extract_batch_item(x_ub, group_id);
-                        const auto res_b =
-                            batch::extract_batch_item(res_ub, group_id);
-                        single_rhs_compute_norm2_sg(x_b.num_rows, x_b.values,
-                                                    res_b.values[0], item_ct1);
-                    });
+                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                    max_subgroup_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto res_b =
+                        batch::extract_batch_item(res_ub, group_id);
+                    batch_single_kernels::single_rhs_compute_norm2_sg(
+                        x_b.num_rows, x_b.values, res_b.values[0], item_ct1);
+                });
         });
     } else {
         exec->get_queue()->submit([&](sycl::handler& cgh) {
@@ -332,7 +327,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(x_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        compute_norm2_kernel(x_b, res_b, item_ct1);
+                        batch_single_kernels::compute_norm2_kernel(x_b, res_b,
+                                                                   item_ct1);
                     });
         });
     }
@@ -371,7 +367,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
                 const auto x_b = batch::extract_batch_item(x_ub, group_id);
                 const auto result_b =
                     batch::extract_batch_item(result_ub, group_id);
-                copy_kernel(x_b, result_b, item_ct1);
+                batch_single_kernels::copy_kernel(x_b, result_b, item_ct1);
             });
     });
 }
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp
similarity index 92%
rename from dpcpp/base/batch_multi_vector_kernels.hpp.inc
rename to dpcpp/base/batch_multi_vector_kernels.hpp
index c41eafd7efd..a16df237e34 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,29 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename Mapping>
 __dpct_inline__ void scale_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -229,3 +252,9 @@ __dpct_inline__ void copy_kernel(
         out.values[i * out.stride + j] = in.values[i * in.stride + j];
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index bb84283b49f..7dc8f3ec23b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -13,6 +13,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -36,7 +37,6 @@ namespace dpcpp {
 namespace batch_bicgstab {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
index ad7eaeff556..f5a88e9d59d 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
@@ -39,11 +39,13 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     if (sg_id == 0) {
-        single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
+                                        item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
+                                        item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -86,8 +88,9 @@ __dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
-                                       v_shared_entry, alpha, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
+                                           v_shared_entry, alpha, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -123,11 +126,13 @@ __dpct_inline__ void compute_omega(const int num_rows,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry,
-                                       omega, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
+                                           s_shared_entry, omega, item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry,
-                                       temp, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
+                                           t_shared_entry, temp, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -308,8 +313,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
         if (sg_id == 0) {
-            single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
-                                           rho_new_sh[0], item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
+                                               rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -338,8 +344,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // an estimate of residual norms
         if (sg_id == 0) {
-            single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
-                                        item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
+                                            item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -368,8 +375,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         if (sg_id == 0)
-            single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
-                                        item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
+                                            item_ct1);
         if (tid == group_size - 1) {
             rho_old_sh[0] = rho_new_sh[0];
         }
@@ -379,6 +387,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 61591f9efb6..f25d8266803 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -13,6 +13,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -36,7 +37,6 @@ namespace dpcpp {
 namespace batch_cg {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc
index cef6e620b64..7a91bcb2bbf 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp.inc
+++ b/dpcpp/solver/batch_cg_kernels.hpp.inc
@@ -40,11 +40,13 @@ __dpct_inline__ void initialize(
     // Compute norms of rhs
     // and rho_old = r' * z
     if (sg_id == 0) {
-        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
+                                        item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, z_shared_entry,
-                                       rho_old, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry,
+                                           z_shared_entry, rho_old, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -80,9 +82,10 @@ __dpct_inline__ void update_x_and_r(
     auto sg = item_ct1.get_sub_group();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg.get_group_id() == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
-                                       Ap_shared_entry, alpha_shared_entry,
-                                       item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
+                                           Ap_shared_entry, alpha_shared_entry,
+                                           item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -221,8 +224,9 @@ __dpct_inline__ void apply_kernel(
 
         //  rho_new =  (r)' * (z)
         if (sg_id == 0) {
-            single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, rho_new_sh[0],
-                                           item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh,
+                                               rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -239,6 +243,7 @@ __dpct_inline__ void apply_kernel(
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }

From b1a300062876a609f6a6cf242e8ea68a32de6f38 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Tue, 20 Aug 2024 12:04:44 +0000
Subject: [PATCH 143/448] [format] Format files

Co-authored-by: Pratik Nayak <pratikvn@pm.me>
---
 dpcpp/base/batch_multi_vector_kernels.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index a16df237e34..bbcc540ae60 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <memory>
 
 #include <CL/sycl.hpp>

From 5598265d1d575c7dc515c328108042865c83aaae Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Wed, 21 Aug 2024 15:42:29 +0200
Subject: [PATCH 144/448] [cuda, hip] unify csr, dense and ell kernels

---
 common/cuda_hip/CMakeLists.txt                |   3 +
 .../base/batch_multi_vector_kernels.hpp       |   4 +
 ...launcher.hpp.inc => batch_csr_kernels.cpp} |  57 ++++++++--
 ..._kernels.hpp.inc => batch_csr_kernels.hpp} | 100 +++++++++--------
 ...uncher.hpp.inc => batch_dense_kernels.cpp} |  60 +++++++++--
 ...ernels.hpp.inc => batch_dense_kernels.hpp} |  96 +++++++++--------
 ...launcher.hpp.inc => batch_ell_kernels.cpp} |  57 ++++++++--
 ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 101 ++++++++++--------
 .../solver/batch_bicgstab_kernels.hpp.inc     |  11 +-
 .../cuda_hip/solver/batch_cg_kernels.hpp.inc  |   8 +-
 cuda/CMakeLists.txt                           |   3 -
 cuda/matrix/batch_csr_kernels.cu              |  55 ----------
 cuda/matrix/batch_struct.hpp                  |   8 ++
 cuda/solver/batch_bicgstab_kernels.cu         |  11 +-
 cuda/solver/batch_cg_kernels.cu               |  11 +-
 hip/CMakeLists.txt                            |   3 -
 hip/matrix/batch_struct.hip.hpp               |   7 ++
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   9 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   9 +-
 19 files changed, 356 insertions(+), 257 deletions(-)
 rename common/cuda_hip/matrix/{batch_csr_kernel_launcher.hpp.inc => batch_csr_kernels.cpp} (64%)
 rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (66%)
 rename common/cuda_hip/matrix/{batch_dense_kernel_launcher.hpp.inc => batch_dense_kernels.cpp} (66%)
 rename common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (72%)
 rename common/cuda_hip/matrix/{batch_ell_kernel_launcher.hpp.inc => batch_ell_kernels.cpp} (64%)
 rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (67%)
 delete mode 100644 cuda/matrix/batch_csr_kernels.cu

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 15d3a82419e..f5a28596d16 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
     factorization/par_ilut_select_kernels.cpp
     factorization/par_ilut_spgeam_kernels.cpp
     factorization/par_ilut_sweep_kernels.cpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index bb3aac67b55..0cbbdf9f5ee 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -35,11 +35,15 @@ namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
 namespace batch_single_kernels {
+namespace {
 
 
 constexpr auto default_block_size = 256;
 
 
+}
+
+
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
diff --git a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp
similarity index 64%
rename from common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.cpp
index 18c9dbcb29a..35dc2c17e03 100644
--- a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.hpp
similarity index 66%
rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.hpp
index e041dadaa3e..32d22e435eb 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -21,23 +59,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      csr::uniform_batch<
-                                                          const ValueType,
-                                                          IndexType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -71,33 +97,13 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        csr::uniform_batch<
-                                                            const ValueType,
-                                                            IndexType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -196,3 +202,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp
similarity index 66%
rename from common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.cpp
index 8fdb001fd1f..44dad55aa70 100644
--- a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -58,8 +87,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
@@ -75,7 +106,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto mat_ub = get_batch_struct(mat);
     const auto in_out_ub = get_batch_struct(in_out);
-    scale_add_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+    batch_single_kernels::scale_add_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
         alpha_ub, mat_ub, in_out_ub);
 }
 
@@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp
similarity index 72%
rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.hpp
index f8abf9131a1..74b81008b38 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -33,22 +71,10 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      dense::uniform_batch<
-                                                          const ValueType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -94,32 +120,12 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        dense::uniform_batch<
-                                                            const ValueType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -243,3 +249,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp
similarity index 64%
rename from common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.cpp
index 7e69b119c85..c56325ab824 100644
--- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp
similarity index 67%
rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.hpp
index 0a6d1927c96..e8cadc29cd3 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -28,23 +66,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      ell::uniform_batch<
-                                                          const ValueType,
-                                                          IndexType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -84,34 +110,15 @@ __device__ __forceinline__ void advanced_apply(
     }
 }
 
+
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        ell::uniform_batch<
-                                                            const ValueType,
-                                                            IndexType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -205,3 +212,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index c2a53b2e518..d4ce149d394 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -27,8 +27,9 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-                   static_cast<ValueType>(1.0), r_shared_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
@@ -295,7 +296,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // v = A * p_hat
-            simple_apply(mat_entry, p_hat_sh, v_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, p_hat_sh, v_sh);
             __syncthreads();
 
             // alpha = rho_new / < r_hat , v>
@@ -327,7 +329,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // t = A * s_hat
-            simple_apply(mat_entry, s_hat_sh, t_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, s_hat_sh, t_sh);
             __syncthreads();
 
             // omega = <t,s> / <t,t>
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index c95a6b1cf05..4f4b382f552 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -22,8 +22,9 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-                   static_cast<ValueType>(1.0), r_shared_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     // z = precond * r
@@ -189,7 +190,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
             }
 
             // Ap = A * p
-            simple_apply(mat_entry, p_sh, Ap_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, p_sh, Ap_sh);
             __syncthreads();
 
             // alpha = rho_old / (p' * Ap)
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 3631a65f48d..000cb7b215f 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -16,9 +16,6 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    matrix/batch_csr_kernels.cu
-    matrix/batch_dense_kernels.cu
-    matrix/batch_ell_kernels.cu
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
deleted file mode 100644
index 95b4f85cdfc..00000000000
--- a/cuda/matrix/batch_csr_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 5845fb2235e..8a1b8fee00a 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
@@ -17,6 +18,13 @@
 namespace gko {
 namespace kernels {
 namespace cuda {
+namespace {
+
+
+constexpr auto default_block_size = 256;
+
+
+}
 
 
 /** @file batch_struct.hpp
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 4d3deb742fe..09e737c8793 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -19,6 +19,9 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -31,11 +34,6 @@ namespace kernels {
 namespace cuda {
 
 
-// NOTE: this default block size is not used for the main solver kernel.
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
 /**
  * @brief The batch Bicgstab solver namespace.
  *
@@ -44,9 +42,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 21c3e3d43c4..7ac876de3a2 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -18,6 +18,9 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -30,11 +33,6 @@ namespace kernels {
 namespace cuda {
 
 
-// NOTE: this default block size is not used for the main solver kernel.
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
 /**
  * @brief The batch Cg solver namespace.
  *
@@ -43,9 +41,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 84bba295120..7d914d57a81 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -14,9 +14,6 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    matrix/batch_csr_kernels.hip.cpp
-    matrix/batch_dense_kernels.hip.cpp
-    matrix/batch_ell_kernels.hip.cpp
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index bb9f7912cd6..a8d14b84bb7 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -17,6 +17,13 @@
 namespace gko {
 namespace kernels {
 namespace hip {
+namespace {
+
+
+constexpr auto default_block_size = 256;
+
+
+}
 
 
 /** @file batch_struct.hpp
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 1c1be8b21f7..f0f1a715a86 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -20,6 +20,9 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -32,9 +35,6 @@ namespace kernels {
 namespace hip {
 
 
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
 /**
  * @brief The batch Bicgstab solver namespace.
  *
@@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index c860286c17c..b40732535f4 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -20,6 +20,9 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -32,9 +35,6 @@ namespace kernels {
 namespace hip {
 
 
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
 /**
  * @brief The batch Cg solver namespace.
  *
@@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
 
 

From 68a53e27187f2f5fda51d712f1f9425b02c730f5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 22 Aug 2024 17:41:40 +0200
Subject: [PATCH 145/448] [ref, omp] unify csr, dense, ell kernels

Also fix the kernel names by removing the _kernel suffix.
---
 omp/matrix/batch_csr_kernels.cpp              | 22 ++++++-------
 omp/matrix/batch_dense_kernels.cpp            | 25 +++++++--------
 omp/matrix/batch_ell_kernels.cpp              | 22 ++++++-------
 omp/solver/batch_bicgstab_kernels.cpp         | 13 ++------
 omp/solver/batch_cg_kernels.cpp               | 13 ++------
 reference/matrix/batch_csr_kernels.cpp        | 25 +++++++--------
 ..._kernels.hpp.inc => batch_csr_kernels.hpp} | 29 +++++++++++++++--
 reference/matrix/batch_dense_kernels.cpp      | 28 ++++++++---------
 ...ernels.hpp.inc => batch_dense_kernels.hpp} | 31 ++++++++++++++++---
 reference/matrix/batch_ell_kernels.cpp        | 25 +++++++--------
 ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 29 +++++++++++++++--
 reference/solver/batch_bicgstab_kernels.cpp   | 15 ++-------
 .../solver/batch_bicgstab_kernels.hpp.inc     | 14 ++++-----
 reference/solver/batch_cg_kernels.cpp         | 15 ++-------
 reference/solver/batch_cg_kernels.hpp.inc     |  9 +++---
 15 files changed, 170 insertions(+), 145 deletions(-)
 rename reference/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (81%)
 rename reference/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (84%)
 rename reference/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (84%)

diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp
index eacb26c12cb..d4ea6cbd642 100644
--- a/omp/matrix/batch_csr_kernels.cpp
+++ b/omp/matrix/batch_csr_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -99,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index 836908260a7..cd4a7f05b4a 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Dense matrix format namespace.
- * @ref Dense
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -98,7 +93,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto input_mat =
             input_vals + input->get_num_elements_per_item() * batch_id;
-        scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat);
+        batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b,
+                                    row_scale_b, input_mat);
     }
 }
 
@@ -121,7 +117,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
         const auto input_mat_b =
             batch::matrix::extract_batch_item(in_mat_ub, batch_id);
-        scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b);
+        batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b);
     }
 }
 
@@ -143,7 +139,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 4fb5aeea6fa..8b1239565a1 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -99,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index c245f284106..661cdbcd2ec 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -10,28 +10,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 55d6ee29321..3a6e31256c2 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -10,28 +10,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index 7c6d9a6c000..9fbb2e35804 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -9,26 +9,23 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
+
+#define GKO_DEVICE_NAMESPACE reference
+
+
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -96,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -118,7 +116,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_csr_kernels.hpp.inc b/reference/matrix/batch_csr_kernels.hpp
similarity index 81%
rename from reference/matrix/batch_csr_kernels.hpp.inc
rename to reference/matrix/batch_csr_kernels.hpp
index 52e511785a0..e04b2bdf345 100644
--- a/reference/matrix/batch_csr_kernels.hpp.inc
+++ b/reference/matrix/batch_csr_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -25,7 +42,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -63,7 +80,7 @@ inline void scale(
 
 
 template <typename ValueType, typename IndexType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::csr::batch_item<ValueType, IndexType>& mat)
 {
@@ -76,3 +93,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 2116a691fb9..99a7d4e8d7b 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -9,26 +9,23 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
+
+#define GKO_DEVICE_NAMESPACE reference
+
+
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Dense matrix format namespace.
- * @ref Dense
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -95,7 +93,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto input_mat =
             input_vals + input->get_num_elements_per_item() * batch_id;
-        scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat);
+        batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b,
+                                    row_scale_b, input_mat);
     }
 }
 
@@ -117,7 +116,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
         const auto input_mat_b =
             batch::matrix::extract_batch_item(in_mat_ub, batch_id);
-        scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b);
+        batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b);
     }
 }
 
@@ -138,7 +137,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp
similarity index 84%
rename from reference/matrix/batch_dense_kernels.hpp.inc
rename to reference/matrix/batch_dense_kernels.hpp
index a017010a644..e12827c77de 100644
--- a/reference/matrix/batch_dense_kernels.hpp.inc
+++ b/reference/matrix/batch_dense_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -27,7 +44,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -75,7 +92,7 @@ inline void scale(const int num_rows, const int num_cols,
 
 
 template <typename ValueType>
-inline void scale_add_kernel(
+inline void scale_add(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& b,
     const gko::batch::matrix::dense::batch_item<ValueType>& in_out)
@@ -91,7 +108,7 @@ inline void scale_add_kernel(
 
 
 template <typename ValueType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::dense::batch_item<ValueType>& mat)
 {
@@ -105,3 +122,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index 0d47f9ea601..7772662b216 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -9,26 +9,23 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+
+#define GKO_DEVICE_NAMESPACE reference
+
+
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -96,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -118,7 +116,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp
similarity index 84%
rename from reference/matrix/batch_ell_kernels.hpp.inc
rename to reference/matrix/batch_ell_kernels.hpp
index 7aea0946573..71bd1ce851a 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -27,7 +44,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -73,7 +90,7 @@ inline void scale(
 
 
 template <typename ValueType, typename IndexType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat)
 {
@@ -91,3 +108,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index e68caffa936..33e1e9392d9 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -6,30 +6,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-
-
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc
index 1f8537ab66d..786e98eb5d1 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp.inc
+++ b/reference/solver/batch_bicgstab_kernels.hpp.inc
@@ -33,9 +33,9 @@ inline void initialize(
         b_entry, r_entry);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
-                          gko::batch::to_const(x_entry),
-                          static_cast<ValueType>(1.0), r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
+        static_cast<ValueType>(1.0), r_entry);
     gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
         compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
                                         res_norms_entry);
@@ -271,8 +271,8 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(p_entry), p_hat_entry);
 
         // v = A * p_hat
-        simple_apply_kernel(A_entry, gko::batch::to_const(p_hat_entry),
-                            v_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(p_hat_entry), v_entry);
 
         // alpha = rho_new / < r_hat , v>
         compute_alpha(gko::batch::to_const(rho_new_entry),
@@ -303,8 +303,8 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(s_entry), s_hat_entry);
 
         // t = A * s_hat
-        simple_apply_kernel(A_entry, gko::batch::to_const(s_hat_entry),
-                            t_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(s_hat_entry), t_entry);
         // omega = <t,s> / <t,t>
         compute_omega(gko::batch::to_const(t_entry),
                       gko::batch::to_const(s_entry), temp_entry, omega_entry);
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 785a7a868a2..7c69157d4a7 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -6,30 +6,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-
-
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc
index ca88940cd69..991db5c061c 100644
--- a/reference/solver/batch_cg_kernels.hpp.inc
+++ b/reference/solver/batch_cg_kernels.hpp.inc
@@ -34,9 +34,9 @@ inline void initialize(
         b_entry, r_entry);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
-                          gko::batch::to_const(x_entry),
-                          static_cast<ValueType>(1.0), r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
+        static_cast<ValueType>(1.0), r_entry);
 }
 
 
@@ -181,7 +181,8 @@ inline void batch_entry_cg_impl(
                  gko::batch::to_const(z_entry), p_entry);
 
         // Ap = A * p
-        simple_apply_kernel(A_entry, gko::batch::to_const(p_entry), Ap_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(p_entry), Ap_entry);
 
         // temp= rho_old / (p' * Ap)
         // x = x + temp * p

From 4d6756fd7690ec508c3ff6d693af4ee5f377ad13 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 22 Aug 2024 17:57:03 +0200
Subject: [PATCH 146/448] [dpcpp] unify dpcpp kernels

---
 dpcpp/matrix/batch_csr_kernels.dp.cpp         | 28 ++++-------
 ..._kernels.hpp.inc => batch_csr_kernels.hpp} | 48 ++++++++++++++----
 dpcpp/matrix/batch_dense_kernels.dp.cpp       | 50 +++++++++----------
 ...ernels.hpp.inc => batch_dense_kernels.hpp} | 40 +++++++++++++--
 dpcpp/matrix/batch_ell_kernels.dp.cpp         | 24 ++++-----
 ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 38 ++++++++++++--
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    | 11 ++--
 dpcpp/solver/batch_bicgstab_kernels.hpp.inc   | 12 +++--
 dpcpp/solver/batch_cg_kernels.dp.cpp          | 11 ++--
 dpcpp/solver/batch_cg_kernels.hpp.inc         |  9 ++--
 10 files changed, 168 insertions(+), 103 deletions(-)
 rename dpcpp/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (67%)
 rename dpcpp/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (84%)
 rename dpcpp/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (78%)

diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 9feb824a3aa..1759a959299 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -21,23 +21,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }
@@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -172,9 +165,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     const auto mat_item =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(mat_item.num_rows, col_scale_b, row_scale_b,
-                                 mat_item.col_idxs, mat_item.row_ptrs,
-                                 mat_item.values, item_ct1);
+                    batch_single_kernels::scale(mat_item.num_rows, col_scale_b,
+                                                row_scale_b, mat_item.col_idxs,
+                                                mat_item.row_ptrs,
+                                                mat_item.values, item_ct1);
                 });
     });
 }
@@ -215,7 +209,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_csr_kernels.hpp.inc b/dpcpp/matrix/batch_csr_kernels.hpp
similarity index 67%
rename from dpcpp/matrix/batch_csr_kernels.hpp.inc
rename to dpcpp/matrix/batch_csr_kernels.hpp
index 4379e02d0b7..f51124f81a4 100644
--- a/dpcpp/matrix/batch_csr_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_csr_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -23,7 +47,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -45,13 +69,11 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void scale_kernel(const int num_rows,
-                                  const ValueType* const col_scale,
-                                  const ValueType* const row_scale,
-                                  const IndexType* const col_idxs,
-                                  const IndexType* const row_ptrs,
-                                  ValueType* const values,
-                                  sycl::nd_item<3>& item_ct1)
+__dpct_inline__ void scale(const int num_rows, const ValueType* const col_scale,
+                           const ValueType* const row_scale,
+                           const IndexType* const col_idxs,
+                           const IndexType* const row_ptrs,
+                           ValueType* const values, sycl::nd_item<3>& item_ct1)
 {
     for (int row = item_ct1.get_local_linear_id(); row < num_rows;
          row += item_ct1.get_local_range().size()) {
@@ -64,7 +86,7 @@ __dpct_inline__ void scale_kernel(const int num_rows,
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::csr::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -80,3 +102,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index a9f6afce0f5..2cebbe326e8 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -25,23 +25,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -77,8 +70,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply_kernel(
+                        mat_b, b_b.values, x_b.values, item_ct1);
                 });
     });
 }
@@ -129,9 +122,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -174,7 +167,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     auto input_mat =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(col_scale_b, row_scale_b, input_mat, item_ct1);
+                    batch_single_kernels::scale(col_scale_b, row_scale_b,
+                                                input_mat, item_ct1);
                 });
     });
 }
@@ -204,18 +198,20 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
             sycl_nd_range(grid, block),
-            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto alpha_b =
-                    gko::batch::extract_batch_item(alpha_ub, group_id);
-                const auto mat_b =
-                    gko::batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto in_out_b =
-                    gko::batch::matrix::extract_batch_item(in_out_ub, group_id);
-                scale_add_kernel(alpha_b.values[0], mat_b, in_out_b, item_ct1);
-            });
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        gko::batch::extract_batch_item(alpha_ub, group_id);
+                    const auto mat_b = gko::batch::matrix::extract_batch_item(
+                        mat_ub, group_id);
+                    const auto in_out_b =
+                        gko::batch::matrix::extract_batch_item(in_out_ub,
+                                                               group_id);
+                    batch_single_kernels::scale_add(alpha_b.values[0], mat_b,
+                                                    in_out_b, item_ct1);
+                });
     });
 }
 
@@ -254,7 +250,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp
similarity index 84%
rename from dpcpp/matrix/batch_dense_kernels.hpp.inc
rename to dpcpp/matrix/batch_dense_kernels.hpp
index 98282fe253d..acf1e65939d 100644
--- a/dpcpp/matrix/batch_dense_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_dense_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -34,7 +58,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -67,7 +91,7 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void scale_kernel(
+__dpct_inline__ void scale(
     const ValueType* const col_scale, const ValueType* const row_scale,
     gko::batch::matrix::dense::batch_item<ValueType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -91,7 +115,7 @@ __dpct_inline__ void scale_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void scale_add_kernel(
+__dpct_inline__ void scale_add(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const gko::batch::matrix::dense::batch_item<ValueType>& in_out,
@@ -117,7 +141,7 @@ __dpct_inline__ void scale_add_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::dense::batch_item<ValueType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -140,3 +164,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 2cb40dc35eb..d9b819b101e 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -21,23 +21,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }
@@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -171,7 +164,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     auto mat_item =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(col_scale_b, row_scale_b, mat_item, item_ct1);
+                    batch_single_kernels::scale(col_scale_b, row_scale_b,
+                                                mat_item, item_ct1);
                 });
     });
 }
@@ -212,7 +206,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp
similarity index 78%
rename from dpcpp/matrix/batch_ell_kernels.hpp.inc
rename to dpcpp/matrix/batch_ell_kernels.hpp
index 1a809664dca..48ab9318bdf 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -24,7 +48,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -47,7 +71,7 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void scale_kernel(
+__dpct_inline__ void scale(
     const ValueType* const col_scale, const ValueType* const row_scale,
     gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -69,7 +93,7 @@ __dpct_inline__ void scale_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -89,3 +113,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 7dc8f3ec23b..291ee1d8a8b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -23,23 +23,18 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
 #include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
index f5a88e9d59d..de1956c8c6c 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
@@ -33,9 +33,9 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), mat_global_entry,
-                          x_shared_entry, static_cast<ValueType>(1.0),
-                          r_shared_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     if (sg_id == 0) {
@@ -330,7 +330,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // v = A * p_hat
-        simple_apply_kernel(mat_global_entry, p_hat_sh, v_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, p_hat_sh, v_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_new / < r_hat , v>
@@ -361,7 +362,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // t = A * s_hat
-        simple_apply_kernel(mat_global_entry, s_hat_sh, t_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, s_hat_sh, t_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // omega = <t,s> / <t,t>
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index f25d8266803..05b3f7b803c 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -23,23 +23,18 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
 #include "dpcpp/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc
index 7a91bcb2bbf..b233b7df680 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp.inc
+++ b/dpcpp/solver/batch_cg_kernels.hpp.inc
@@ -27,9 +27,9 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), mat_global_entry,
-                          x_shared_entry, static_cast<ValueType>(1.0),
-                          r_shared_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
 
@@ -207,7 +207,8 @@ __dpct_inline__ void apply_kernel(
             break;
         }
         // Ap = A * p
-        simple_apply_kernel(mat_global_entry, p_sh, Ap_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_old / (p' * Ap)

From 927a35f1be3c93aca03f526ef7b60d3939d74999 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 22 Aug 2024 18:03:28 +0200
Subject: [PATCH 147/448] [hip, cuda] remove unnecessary .hip.cpp/.cu files

---
 cuda/matrix/batch_dense_kernels.cu     | 56 --------------------------
 cuda/matrix/batch_ell_kernels.cu       | 55 -------------------------
 hip/matrix/batch_csr_kernels.hip.cpp   | 55 -------------------------
 hip/matrix/batch_dense_kernels.hip.cpp | 56 --------------------------
 hip/matrix/batch_ell_kernels.hip.cpp   | 55 -------------------------
 5 files changed, 277 deletions(-)
 delete mode 100644 cuda/matrix/batch_dense_kernels.cu
 delete mode 100644 cuda/matrix/batch_ell_kernels.cu
 delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp

diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
deleted file mode 100644
index 10148ee242b..00000000000
--- a/cuda/matrix/batch_dense_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
deleted file mode 100644
index 25281cf6f81..00000000000
--- a/cuda/matrix/batch_ell_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
deleted file mode 100644
index b77b9416505..00000000000
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
deleted file mode 100644
index 67dfd78e264..00000000000
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
deleted file mode 100644
index 68b59c042f1..00000000000
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From 2283e78adef133cf7230c9fee3e18b246c0d8929 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 22 Aug 2024 18:20:36 +0200
Subject: [PATCH 148/448] fixup! [dpcpp] unify dpcpp kernels

---
 dpcpp/matrix/batch_dense_kernels.dp.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index 2cebbe326e8..43974589abb 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -70,8 +70,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    batch_single_kernels::simple_apply_kernel(
-                        mat_b, b_b.values, x_b.values, item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }

From bd73597b5af439ed4d2544b4b9a9c14fa4787081 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 23 Aug 2024 13:14:36 +0200
Subject: [PATCH 149/448] [cuda, hip] unify batch_struct headers

---
 .../base/batch_multi_vector_kernels.hpp       |   9 +-
 .../cuda_hip}/base/batch_struct.hpp           |  19 +--
 common/cuda_hip/base/types.hpp                |   4 +
 common/cuda_hip/matrix/batch_csr_kernels.hpp  |  12 +-
 .../cuda_hip/matrix/batch_dense_kernels.hpp   |  12 +-
 common/cuda_hip/matrix/batch_ell_kernels.hpp  |  12 +-
 .../cuda_hip}/matrix/batch_struct.hpp         |  46 +++---
 core/solver/batch_dispatch.hpp                |   8 +-
 cuda/preconditioner/batch_jacobi_kernels.cu   |   4 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   4 +-
 cuda/solver/batch_cg_kernels.cu               |   4 +-
 hip/base/batch_struct.hip.hpp                 |  64 --------
 hip/matrix/batch_struct.hip.hpp               | 142 ------------------
 .../batch_jacobi_kernels.hip.cpp              |   4 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   4 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   4 +-
 16 files changed, 58 insertions(+), 294 deletions(-)
 rename {cuda => common/cuda_hip}/base/batch_struct.hpp (71%)
 rename {cuda => common/cuda_hip}/matrix/batch_struct.hpp (75%)
 delete mode 100644 hip/base/batch_struct.hip.hpp
 delete mode 100644 hip/matrix/batch_struct.hip.hpp

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 0cbbdf9f5ee..1cd9d6c752b 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,14 +23,6 @@
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
-
 
 namespace gko {
 namespace kernels {
diff --git a/cuda/base/batch_struct.hpp b/common/cuda_hip/base/batch_struct.hpp
similarity index 71%
rename from cuda/base/batch_struct.hpp
rename to common/cuda_hip/base/batch_struct.hpp
index 9f07b6b4532..bc10752975f 100644
--- a/cuda/base/batch_struct.hpp
+++ b/common/cuda_hip/base/batch_struct.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_
-#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
@@ -11,12 +11,13 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 /** @file batch_struct.hpp
@@ -33,10 +34,10 @@ namespace cuda {
  * Generates an immutable uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<const device_type<ValueType>>
 get_batch_struct(const batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector<ValueType>* const op)
  * Generates a uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<device_type<ValueType>>
 get_batch_struct(batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
 }
 
 
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index 08f0516d691..ee1c76a0585 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -8,8 +8,12 @@
 
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/types.hpp"
+#define device_type cuda_type
+#define as_device_type as_cuda_type
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/types.hip.hpp"
+#define device_type hip_type
+#define as_device_type as_hip_type
 #else
 #error "Executor definition missing"
 #endif
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp
index 32d22e435eb..64611559715 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,16 +23,7 @@
 #include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
-
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp
index 74b81008b38..e4cd24bbd78 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,16 +23,7 @@
 #include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
-
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp
index e8cadc29cd3..52826957ddb 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,16 +23,7 @@
 #include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
-
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/cuda/matrix/batch_struct.hpp b/common/cuda_hip/matrix/batch_struct.hpp
similarity index 75%
rename from cuda/matrix/batch_struct.hpp
rename to common/cuda_hip/matrix/batch_struct.hpp
index 8a1b8fee00a..e88eca245bb 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/common/cuda_hip/matrix/batch_struct.hpp
@@ -2,35 +2,31 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
-#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
 
 
+#include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
-namespace {
-
-
-constexpr auto default_block_size = 256;
-
-
-}
+namespace GKO_DEVICE_NAMESPACE {
 
 
 /** @file batch_struct.hpp
  *
  * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required CUDA scalar type.
+ * while also shallow-casting to the required GKO_DEVICE_NAMESPACE scalar
+ * type.
  *
  * A specialization is needed for every format of every kind of linear algebra
  * object. These are intended to be called on the host.
@@ -41,11 +37,11 @@ constexpr auto default_block_size = 256;
  * Generates an immutable uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<const cuda_type<ValueType>,
+inline batch::matrix::csr::uniform_batch<const device_type<ValueType>,
                                          const IndexType>
 get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_const_row_ptrs(),
             op->get_num_batch_items(),
@@ -59,10 +55,10 @@ get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<cuda_type<ValueType>, IndexType>
+inline batch::matrix::csr::uniform_batch<device_type<ValueType>, IndexType>
 get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_values()),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_row_ptrs(),
             op->get_num_batch_items(),
@@ -76,10 +72,10 @@ get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
  * Generates an immutable uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<const cuda_type<ValueType>>
+inline batch::matrix::dense::uniform_batch<const device_type<ValueType>>
 get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -90,10 +86,10 @@ get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
  * Generates a uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<cuda_type<ValueType>>
+inline batch::matrix::dense::uniform_batch<device_type<ValueType>>
 get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -104,11 +100,11 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>,
+inline batch::matrix::ell::uniform_batch<const device_type<ValueType>,
                                          const IndexType>
 get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
@@ -122,10 +118,10 @@ get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>, IndexType>
+inline batch::matrix::ell::uniform_batch<device_type<ValueType>, IndexType>
 get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_values()),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
@@ -135,9 +131,9 @@ get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 }
 
 
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+#endif  // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 8a142a5224a..599c708b334 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -24,10 +24,10 @@
 #if defined GKO_COMPILING_CUDA
 
 
-#include "cuda/base/batch_struct.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/log/batch_logger.cuh"
-#include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/batch_preconditioners.cuh"
 #include "cuda/stop/batch_criteria.cuh"
 
@@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 #elif defined GKO_COMPILING_HIP
 
 
-#include "hip/base/batch_struct.hip.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/log/batch_logger.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/batch_preconditioners.hip.hpp"
 #include "hip/stop/batch_criteria.hip.hpp"
 
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 716c158ffff..edf052cb649 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -8,19 +8,19 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/batch_struct.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/matrix/batch_struct.hpp"
 // generated header
 #include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 09e737c8793..35d567fd911 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -22,11 +23,10 @@
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 7ac876de3a2..f26f2d37313 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -21,11 +22,10 @@
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
 
 
 namespace gko {
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
deleted file mode 100644
index 3e4cba6a747..00000000000
--- a/hip/base/batch_struct.hip.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
-#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-/** @file batch_struct.hpp
- *
- * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required Hip scalar type.
- *
- * A specialization is needed for every format of every kind of linear algebra
- * object. These are intended to be called on the host.
- */
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of multi-vectors.
- */
-template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const hip_type<ValueType>>
-get_batch_struct(const batch::MultiVector<ValueType>* const op)
-{
-    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-/**
- * Generates a uniform batch struct from a batch of multi-vectors.
- */
-template <typename ValueType>
-inline batch::multi_vector::uniform_batch<hip_type<ValueType>> get_batch_struct(
-    batch::MultiVector<ValueType>* const op)
-{
-    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
deleted file mode 100644
index a8d14b84bb7..00000000000
--- a/hip/matrix/batch_struct.hip.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
-#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
-
-
-#include <ginkgo/core/matrix/batch_dense.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace {
-
-
-constexpr auto default_block_size = 256;
-
-
-}
-
-
-/** @file batch_struct.hpp
- *
- * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required HIP scalar type.
- *
- * A specialization is needed for every format of every kind of linear algebra
- * object. These are intended to be called on the host.
- */
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of csr matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<const hip_type<ValueType>,
-                                         const IndexType>
-get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_const_values()),
-            op->get_const_col_idxs(),
-            op->get_const_row_ptrs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_elements_per_item())};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of csr matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<hip_type<ValueType>, IndexType>
-get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_values()),
-            op->get_col_idxs(),
-            op->get_row_ptrs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_elements_per_item())};
-}
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of dense matrices.
- */
-template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<const hip_type<ValueType>>
-get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
-{
-    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of dense matrices.
- */
-template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<hip_type<ValueType>>
-get_batch_struct(batch::matrix::Dense<ValueType>* const op)
-{
-    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of ell matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>,
-                                         const IndexType>
-get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_const_values()),
-            op->get_const_col_idxs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of ell matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<hip_type<ValueType>, IndexType>
-get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_values()),
-            op->get_col_idxs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index e86bc86390a..38a81972e66 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -8,21 +8,21 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 // generated header
 #include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index f0f1a715a86..a5de10953bc 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -23,11 +24,10 @@
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index b40732535f4..23bb939ead8 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -23,11 +24,10 @@
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {

From 3ffba6a713eefcc61068bb642ab75345e31d7a4b Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 23 Aug 2024 14:47:10 +0200
Subject: [PATCH 150/448] [cuda, hip] rem anon namespace, type defs

---
 common/cuda_hip/base/batch_multi_vector_kernels.hpp | 4 ----
 common/cuda_hip/base/types.hpp                      | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 1cd9d6c752b..7583cc72292 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -28,15 +28,11 @@ namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
 namespace batch_single_kernels {
-namespace {
 
 
 constexpr auto default_block_size = 256;
 
 
-}
-
-
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index ee1c76a0585..08f0516d691 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -8,12 +8,8 @@
 
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/types.hpp"
-#define device_type cuda_type
-#define as_device_type as_cuda_type
 #elif defined(GKO_COMPILING_HIP)
 #include "hip/base/types.hip.hpp"
-#define device_type hip_type
-#define as_device_type as_hip_type
 #else
 #error "Executor definition missing"
 #endif

From fdab7d4380eab7c69433bc2de53fef5aa7789bc5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 23 Aug 2024 16:00:43 +0200
Subject: [PATCH 151/448] [ref] set device namespace with CMake

---
 reference/CMakeLists.txt                      | 1 +
 reference/base/batch_multi_vector_kernels.cpp | 4 ----
 reference/matrix/batch_csr_kernels.cpp        | 4 ----
 reference/matrix/batch_dense_kernels.cpp      | 4 ----
 reference/matrix/batch_ell_kernels.cpp        | 4 ----
 5 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 0c226830637..85b8f33e38b 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -66,6 +66,7 @@ target_sources(ginkgo_reference
     stop/residual_norm_kernels.cpp)
 
 target_link_libraries(ginkgo_reference PUBLIC ginkgo_device)
+target_compile_definitions(ginkgo_reference PRIVATE GKO_COMPILING_REFERENCE GKO_DEVICE_NAMESPACE=reference)
 ginkgo_compile_features(ginkgo_reference)
 ginkgo_default_includes(ginkgo_reference)
 ginkgo_install_library(ginkgo_reference)
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index f5e1c653054..d7fbf3ce214 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -10,10 +10,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
-#define GKO_DEVICE_NAMESPACE reference
-
-
 #include "core/base/batch_struct.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index 9fbb2e35804..d3304ab9795 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -9,10 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
-
-#define GKO_DEVICE_NAMESPACE reference
-
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 99a7d4e8d7b..599af30ecfb 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -9,10 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
-
-#define GKO_DEVICE_NAMESPACE reference
-
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index 7772662b216..1a4855f389f 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -9,10 +9,6 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
-
-#define GKO_DEVICE_NAMESPACE reference
-
-
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"

From d0a7f4a8f99a7e18eea49ae2f3051fdc39ecb297 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 23 Aug 2024 16:31:57 +0200
Subject: [PATCH 152/448] [unified] remove GKO_DEVICE_NAMESPACE defines in source

---
 common/unified/base/kernel_launch.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 73d37eb2ac2..455d3d67a6d 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -16,7 +16,6 @@
 
 #if defined(GKO_COMPILING_CUDA)
 
-#define GKO_DEVICE_NAMESPACE cuda
 #define GKO_KERNEL __device__
 #include "common/cuda_hip/base/types.hpp"
 
@@ -43,7 +42,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_HIP)
 
-#define GKO_DEVICE_NAMESPACE hip
 #define GKO_KERNEL __device__
 #include "common/cuda_hip/base/types.hpp"
 
@@ -70,7 +68,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_DPCPP)
 
-#define GKO_DEVICE_NAMESPACE dpcpp
 #define GKO_KERNEL
 
 
@@ -105,7 +102,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_OMP)
 
-#define GKO_DEVICE_NAMESPACE omp
 #define GKO_KERNEL
 
 

From b2069d75db5c1d8e9ccff6128304f8ac20b37108 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sat, 15 Jun 2024 22:12:41 +0200
Subject: [PATCH 153/448] add schwarz config with global index type read from the config

---
 core/CMakeLists.txt                           |  3 ++
 core/config/config_helper.hpp                 |  3 +-
 core/config/registry.cpp                      | 13 ++++-
 core/config/schwarz_config.cpp                | 54 +++++++++++++++++++
 core/distributed/preconditioner/schwarz.cpp   | 26 +++++++++
 .../distributed/preconditioner/schwarz.hpp    | 27 +++++++++-
 6 files changed, 121 insertions(+), 5 deletions(-)
 create mode 100644 core/config/schwarz_config.cpp

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index df8f748b4d3..8c802b2eca5 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -7,6 +7,9 @@ set(config_source
     config/registry.cpp
     config/solver_config.cpp
 )
+if(GINKGO_BUILD_MPI)
+    list(APPEND config_source config/schwarz_config.cpp)
+endif()
 # MSVC: To solve LNK1189, we separate the library as a workaround
 # To make ginkgo still be the major library, we make the original to ginkgo_core in MSVC/shared
 # TODO: should think another way to solve it like dllexport or def file
diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
index f84e6799bf7..555bb75c2a8 100644
--- a/core/config/config_helper.hpp
+++ b/core/config/config_helper.hpp
@@ -65,7 +65,8 @@ enum class LinOpFactoryType : int {
     Isai,
     Jacobi,
     Multigrid,
-    Pgm
+    Pgm,
+    Schwarz
 };
 
 
diff --git a/core/config/registry.cpp b/core/config/registry.cpp
index 1718de5fed2..188c34b35dd 100644
--- a/core/config/registry.cpp
+++ b/core/config/registry.cpp
@@ -4,6 +4,7 @@
 
 #include "ginkgo/core/config/registry.hpp"
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 
@@ -16,7 +17,9 @@ namespace config {
 
 configuration_map generate_config_map()
 {
-    return {{"solver::Cg", parse<LinOpFactoryType::Cg>},
+    return
+    {
+        {"solver::Cg", parse<LinOpFactoryType::Cg>},
             {"solver::Bicg", parse<LinOpFactoryType::Bicg>},
             {"solver::Bicgstab", parse<LinOpFactoryType::Bicgstab>},
             {"solver::Fcg", parse<LinOpFactoryType::Fcg>},
@@ -42,7 +45,13 @@ configuration_map generate_config_map()
             {"preconditioner::Isai", parse<LinOpFactoryType::Isai>},
             {"preconditioner::Jacobi", parse<LinOpFactoryType::Jacobi>},
             {"solver::Multigrid", parse<LinOpFactoryType::Multigrid>},
-            {"multigrid::Pgm", parse<LinOpFactoryType::Pgm>}};
+            {"multigrid::Pgm", parse<LinOpFactoryType::Pgm>},
+#if GINKGO_BUILD_MPI
+        {
+            "preconditioner::Schwarz", parse<LinOpFactoryType::Schwarz>
+        }
+#endif
+    };
 }
 
 
diff --git a/core/config/schwarz_config.cpp b/core/config/schwarz_config.cpp
new file mode 100644
index 00000000000..dea907dae08
--- /dev/null
+++ b/core/config/schwarz_config.cpp
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/config/type_descriptor.hpp>
+#include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
+
+#include "core/config/config_helper.hpp"
+#include "core/config/dispatch.hpp"
+#include "core/config/type_descriptor_helper.hpp"
+
+
+namespace gko {
+namespace config {
+
+
+template <>
+deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
+    const pnode& config, const registry& context, const type_descriptor& td)
+{
+    auto updated = update_type(config, td);
+    auto global_index_str = updated.get_index_typestr();
+    if (auto& obj = config.get("global_index_type")) {
+        global_index_str = obj.get_string();
+    }
+    // We can not directly dispatch the global index type without consider local
+    // index type, which leadw the invalid index type <int64, int32> in
+    // compile time.
+    if (updated.get_index_typestr() == type_string<int32>::str()) {
+        return dispatch<
+            gko::LinOpFactory,
+            gko::experimental::distributed::preconditioner::Schwarz>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_index_typestr(),
+                               syn::type_list<int32>()),
+            make_type_selector(global_index_str, index_type_list()));
+    } else {
+        return dispatch<
+            gko::LinOpFactory,
+            gko::experimental::distributed::preconditioner::Schwarz>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_index_typestr(),
+                               syn::type_list<int64>()),
+            make_type_selector(global_index_str, syn::type_list<int64>()));
+    }
+}
+
+
+}  // namespace config
+}  // namespace gko
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 7235038847d..d5466cd003a 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -12,11 +12,15 @@
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/base/temporary_conversion.hpp>
 #include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "core/base/utils.hpp"
+#include "core/config/config_helper.hpp"
+#include "core/config/dispatch.hpp"
 #include "core/distributed/helpers.hpp"
 
 
@@ -26,6 +30,28 @@ namespace distributed {
 namespace preconditioner {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+typename Schwarz<ValueType, LocalIndexType, GlobalIndexType>::parameters_type
+Schwarz<ValueType, LocalIndexType, GlobalIndexType>::parse(
+    const config::pnode& config, const config::registry& context,
+    const config::type_descriptor& td_for_child)
+{
+    auto params = Schwarz<ValueType, LocalIndexType, GlobalIndexType>::build();
+
+    if (auto& obj = config.get("generated_local_solver")) {
+        params.with_generated_local_solver(
+            gko::config::get_stored_obj<const LinOp>(obj, context));
+    }
+    if (auto& obj = config.get("local_solver")) {
+        params.with_local_solver(
+            gko::config::parse_or_get_factory<const LinOpFactory>(
+                obj, context, td_for_child));
+    }
+
+    return params;
+}
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
     const LinOp* b, LinOp* x) const
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index badd5ba7dd3..a8eca306964 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -14,6 +14,9 @@
 
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/config/type_descriptor.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 
@@ -39,8 +42,9 @@ namespace preconditioner {
  *
  * @note Currently overlap and coarse grid correction are not supported (TODO).
  *
- * @tparam ValueType  precision of matrix elements
- * @tparam IndexType  integral type of the preconditioner
+ * @tparam ValueType  precision of matrix element
+ * @tparam LocalIndexType  local integer type of the matrix
+ * @tparam GlobalIndexType  global integer type of the matrix
  *
  * @ingroup schwarz
  * @ingroup precond
@@ -78,6 +82,25 @@ class Schwarz
     GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
+    /**
+     * Create the parameters from the property_tree.
+     * Because this is directly tied to the specific type, the value/index type
+     * settings within config are ignored and type_descriptor is only used
+     * for children objects.
+     *
+     * @param config  the property tree for setting
+     * @param context  the registry
+     * @param td_for_child  the type descriptor for children objects. The
+     *                      default uses the value/local index type of this
+     * class.
+     *
+     * @return parameters
+     */
+    static parameters_type parse(
+        const config::pnode& config, const config::registry& context,
+        const config::type_descriptor& td_for_child =
+            config::make_type_descriptor<ValueType, LocalIndexType>());
+
 protected:
     /**
      * Creates an empty Schwarz preconditioner.

From e1a3341f19349410b035163caa7a633a45347c34 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 31 Jul 2024 00:21:27 +0200
Subject: [PATCH 154/448] only set the global index via type descriptor

---
 core/config/schwarz_config.cpp                |  12 +--
 core/config/type_descriptor.cpp               | 101 ++++++++++++++----
 core/test/config/type_descriptor.cpp          |  17 ++-
 .../ginkgo/core/config/type_descriptor.hpp    |  20 +++-
 .../distributed/preconditioner/schwarz.hpp    |   7 +-
 5 files changed, 121 insertions(+), 36 deletions(-)

diff --git a/core/config/schwarz_config.cpp b/core/config/schwarz_config.cpp
index dea907dae08..9543b833041 100644
--- a/core/config/schwarz_config.cpp
+++ b/core/config/schwarz_config.cpp
@@ -21,12 +21,8 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
     const pnode& config, const registry& context, const type_descriptor& td)
 {
     auto updated = update_type(config, td);
-    auto global_index_str = updated.get_index_typestr();
-    if (auto& obj = config.get("global_index_type")) {
-        global_index_str = obj.get_string();
-    }
     // We can not directly dispatch the global index type without consider local
-    // index type, which leadw the invalid index type <int64, int32> in
+    // index type, which leads the invalid index type <int64, int32> in
     // compile time.
     if (updated.get_index_typestr() == type_string<int32>::str()) {
         return dispatch<
@@ -36,7 +32,8 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
             make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(),
                                syn::type_list<int32>()),
-            make_type_selector(global_index_str, index_type_list()));
+            make_type_selector(updated.get_global_index_typestr(),
+                               index_type_list()));
     } else {
         return dispatch<
             gko::LinOpFactory,
@@ -45,7 +42,8 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
             make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(),
                                syn::type_list<int64>()),
-            make_type_selector(global_index_str, syn::type_list<int64>()));
+            make_type_selector(updated.get_global_index_typestr(),
+                               syn::type_list<int64>()));
     }
 }
 
diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index 93ec1d3f929..017cc98ca06 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -17,6 +17,7 @@ type_descriptor update_type(const pnode& config, const type_descriptor& td)
 {
     auto value_typestr = td.get_value_typestr();
     auto index_typestr = td.get_index_typestr();
+    auto global_index_typestr = td.get_global_index_typestr();
 
     if (auto& obj = config.get("value_type")) {
         value_typestr = obj.get_string();
@@ -26,37 +27,93 @@ type_descriptor update_type(const pnode& config, const type_descriptor& td)
             "Setting index_type in the config is not allowed. Please set the "
             "proper index_type through type_descriptor of parse");
     }
-    return type_descriptor{value_typestr, index_typestr};
+    if (auto& obj = config.get("global_index_type")) {
+        GKO_INVALID_STATE(
+            "Setting global_index_type in the config is not allowed. Please "
+            "set the proper global_index_type through type_descriptor of "
+            "parse");
+    }
+    return type_descriptor{value_typestr, index_typestr, global_index_typestr};
 }
 
 
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType, typename GlobalIndexType>
 type_descriptor make_type_descriptor()
 {
     return type_descriptor{type_string<ValueType>::str(),
-                           type_string<IndexType>::str()};
+                           type_string<IndexType>::str(),
+                           type_string<GlobalIndexType>::str()};
 }
 
-template type_descriptor make_type_descriptor<void, void>();
-template type_descriptor make_type_descriptor<float, void>();
-template type_descriptor make_type_descriptor<double, void>();
-template type_descriptor make_type_descriptor<std::complex<float>, void>();
-template type_descriptor make_type_descriptor<std::complex<double>, void>();
-template type_descriptor make_type_descriptor<void, int32>();
-template type_descriptor make_type_descriptor<float, int32>();
-template type_descriptor make_type_descriptor<double, int32>();
-template type_descriptor make_type_descriptor<std::complex<float>, int32>();
-template type_descriptor make_type_descriptor<std::complex<double>, int32>();
-template type_descriptor make_type_descriptor<void, int64>();
-template type_descriptor make_type_descriptor<float, int64>();
-template type_descriptor make_type_descriptor<double, int64>();
-template type_descriptor make_type_descriptor<std::complex<float>, int64>();
-template type_descriptor make_type_descriptor<std::complex<double>, int64>();
+// global_index: void
+template type_descriptor make_type_descriptor<void, void, void>();
+template type_descriptor make_type_descriptor<float, void, void>();
+template type_descriptor make_type_descriptor<double, void, void>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, void, void>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, void, void>();
+template type_descriptor make_type_descriptor<void, int32, void>();
+template type_descriptor make_type_descriptor<float, int32, void>();
+template type_descriptor make_type_descriptor<double, int32, void>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, int32, void>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, int32, void>();
+template type_descriptor make_type_descriptor<void, int64, void>();
+template type_descriptor make_type_descriptor<float, int64, void>();
+template type_descriptor make_type_descriptor<double, int64, void>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, int64, void>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, int64, void>();
+
+// global_index int32
+template type_descriptor make_type_descriptor<void, void, int32>();
+template type_descriptor make_type_descriptor<float, void, int32>();
+template type_descriptor make_type_descriptor<double, void, int32>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, void, int32>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, void, int32>();
+template type_descriptor make_type_descriptor<void, int32, int32>();
+template type_descriptor make_type_descriptor<float, int32, int32>();
+template type_descriptor make_type_descriptor<double, int32, int32>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, int32, int32>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, int32, int32>();
+
+// global_index_type int64
+template type_descriptor make_type_descriptor<void, void, int64>();
+template type_descriptor make_type_descriptor<float, void, int64>();
+template type_descriptor make_type_descriptor<double, void, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, void, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, void, int64>();
+template type_descriptor make_type_descriptor<void, int32, int64>();
+template type_descriptor make_type_descriptor<float, int32, int64>();
+template type_descriptor make_type_descriptor<double, int32, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, int32, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, int32, int64>();
+template type_descriptor make_type_descriptor<void, int64, int64>();
+template type_descriptor make_type_descriptor<float, int64, int64>();
+template type_descriptor make_type_descriptor<double, int64, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<float>, int64, int64>();
+template type_descriptor
+make_type_descriptor<std::complex<double>, int64, int64>();
 
 
 type_descriptor::type_descriptor(std::string value_typestr,
-                                 std::string index_typestr)
-    : value_typestr_(value_typestr), index_typestr_(index_typestr)
+                                 std::string index_typestr,
+                                 std::string global_index_typestr)
+    : value_typestr_(value_typestr),
+      index_typestr_(index_typestr),
+      global_index_typestr_(global_index_typestr)
 {}
 
 const std::string& type_descriptor::get_value_typestr() const
@@ -69,6 +126,10 @@ const std::string& type_descriptor::get_index_typestr() const
     return index_typestr_;
 }
 
+const std::string& type_descriptor::get_global_index_typestr() const
+{
+    return global_index_typestr_;
+}
 
 }  // namespace config
 }  // namespace gko
diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp
index ff519e88101..86505df51d1 100644
--- a/core/test/config/type_descriptor.cpp
+++ b/core/test/config/type_descriptor.cpp
@@ -21,6 +21,15 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float64");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
+    }
+    {
+        SCOPED_TRACE("specify global indextype");
+        auto td = make_type_descriptor<float, int, int>();
+
+        ASSERT_EQ(td.get_value_typestr(), "float32");
+        ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_global_index_typestr(), "int");
     }
     {
         SCOPED_TRACE("specify valuetype");
@@ -28,20 +37,24 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float32");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
         SCOPED_TRACE("specify all template");
-        auto td = make_type_descriptor<std::complex<float>, gko::int64>();
+        auto td =
+            make_type_descriptor<std::complex<float>, gko::int64, gko::int64>();
 
         ASSERT_EQ(td.get_value_typestr(), "complex<float32>");
         ASSERT_EQ(td.get_index_typestr(), "int64");
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
         SCOPED_TRACE("specify void");
-        auto td = make_type_descriptor<void, void>();
+        auto td = make_type_descriptor<void, void, void>();
 
         ASSERT_EQ(td.get_value_typestr(), "void");
         ASSERT_EQ(td.get_index_typestr(), "void");
+        ASSERT_EQ(td.get_global_index_typestr(), "void");
     }
 }
 
diff --git a/include/ginkgo/core/config/type_descriptor.hpp b/include/ginkgo/core/config/type_descriptor.hpp
index 48475f7f469..aa75b4591fa 100644
--- a/include/ginkgo/core/config/type_descriptor.hpp
+++ b/include/ginkgo/core/config/type_descriptor.hpp
@@ -8,6 +8,8 @@
 
 #include <string>
 
+#include <ginkgo/core/base/types.hpp>
+
 namespace gko {
 namespace config {
 
@@ -27,10 +29,9 @@ namespace config {
  * value `void` can be used to specify that no default type is provided. In this
  * case, the configuration has to provide the necessary template types.
  *
- * If the configuration specifies one of the fields (or both):
+ * If the configuration specifies one field (only allow value_type now):
  * ```
  * value_type: "some_value_type"
- * index_type: "some_index_type"
  * ```
  * these types will take precedence over the type_descriptor.
  */
@@ -42,12 +43,15 @@ class type_descriptor final {
      *
      * @param value_typestr  the value type string. "void" means no default.
      * @param index_typestr  the index type string. "void" means no default.
+     * @param global_index_typestr  the global index type string. "void" means
+     * no default.
      *
      * @note there is no way to call the constructor with explicit template, so
      * we create another free function to handle it.
      */
     explicit type_descriptor(std::string value_typestr = "float64",
-                             std::string index_typestr = "int32");
+                             std::string index_typestr = "int32",
+                             std::string global_index_typestr = "int64");
 
     /**
      * Get the value type string.
@@ -59,9 +63,15 @@ class type_descriptor final {
      */
     const std::string& get_index_typestr() const;
 
+    /**
+     * Get the global index type string
+     */
+    const std::string& get_global_index_typestr() const;
+
 private:
     std::string value_typestr_;
     std::string index_typestr_;
+    std::string global_index_typestr_;
 };
 
 
@@ -71,8 +81,10 @@ class type_descriptor final {
  *
  * @tparam ValueType  the value type in descriptor
  * @tparam IndexType  the index type in descriptor
+ * @tparam GlobalIndexType  the global index type in descriptor
  */
-template <typename ValueType = double, typename IndexType = int>
+template <typename ValueType = double, typename IndexType = int,
+          typename GlobalIndexType = int64>
 type_descriptor make_type_descriptor();
 
 
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index a8eca306964..adc67dfbd36 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -91,15 +91,16 @@ class Schwarz
      * @param config  the property tree for setting
      * @param context  the registry
      * @param td_for_child  the type descriptor for children objects. The
-     *                      default uses the value/local index type of this
-     * class.
+     *                      default uses the value/local/global index type of
+     *                      this class.
      *
      * @return parameters
      */
     static parameters_type parse(
         const config::pnode& config, const config::registry& context,
         const config::type_descriptor& td_for_child =
-            config::make_type_descriptor<ValueType, LocalIndexType>());
+            config::make_type_descriptor<ValueType, LocalIndexType,
+                                         GlobalIndexType>());
 
 protected:
     /**

From 36adda67e407b590f02e31bd3aa2460941a9ba92 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 1 Aug 2024 16:12:50 +0200
Subject: [PATCH 155/448] add schwarz config test

---
 core/test/config/CMakeLists.txt      |  1 +
 core/test/config/preconditioner.cpp  | 74 +++++++++++++++++++++++++++-
 core/test/config/type_descriptor.cpp |  2 +-
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/core/test/config/CMakeLists.txt b/core/test/config/CMakeLists.txt
index c3c400cc8b4..4460e2ed916 100644
--- a/core/test/config/CMakeLists.txt
+++ b/core/test/config/CMakeLists.txt
@@ -5,3 +5,4 @@ ginkgo_create_test(preconditioner)
 ginkgo_create_test(property_tree)
 ginkgo_create_test(registry)
 ginkgo_create_test(solver)
+ginkgo_create_test(type_descriptor)
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp
index b11ea3b6705..9e81e690967 100644
--- a/core/test/config/preconditioner.cpp
+++ b/core/test/config/preconditioner.cpp
@@ -6,8 +6,11 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/preconditioner/ic.hpp>
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
@@ -297,6 +300,68 @@ struct Jacobi
 };
 
 
+#if GINKGO_BUILD_MPI
+
+
+struct Schwarz
+    : PreconditionerConfigTest<
+          ::gko::experimental::distributed::preconditioner::Schwarz<float, int,
+                                                                    gko::int64>,
+          ::gko::experimental::distributed::preconditioner::Schwarz<
+              double, int, gko::int64>> {
+    static pnode::map_type setup_base()
+    {
+        return {{"type", pnode{"preconditioner::Schwarz"}}};
+    }
+
+    static void change_template(pnode::map_type& config_map)
+    {
+        config_map["value_type"] = pnode{"float32"};
+    }
+
+    template <bool from_reg, typename ParamType>
+    static void set(pnode::map_type& config_map, ParamType& param, registry reg,
+                    std::shared_ptr<const gko::Executor> exec)
+    {
+        if (from_reg) {
+            config_map["local_solver"] = pnode{"solver"};
+            param.with_local_solver(
+                detail::registry_accessor::get_data<gko::LinOpFactory>(
+                    reg, "solver"));
+        } else {
+            config_map["local_solver"] =
+                pnode{{{"type", pnode{"solver::Ir"}},
+                       {"value_type", pnode{"float32"}}}};
+            param.with_local_solver(DummyIr::build().on(exec));
+        }
+        config_map["generated_local_solver"] = pnode{"linop"};
+        param.with_generated_local_solver(
+            detail::registry_accessor::get_data<gko::LinOp>(reg, "linop"));
+    }
+
+    template <bool from_reg, typename AnswerType>
+    static void validate(gko::LinOpFactory* result, AnswerType* answer)
+    {
+        auto res_param = gko::as<AnswerType>(result)->get_parameters();
+        auto ans_param = answer->get_parameters();
+
+        if (from_reg) {
+            ASSERT_EQ(res_param.local_solver, ans_param.local_solver);
+        } else {
+            ASSERT_NE(
+                std::dynamic_pointer_cast<const typename DummyIr::Factory>(
+                    res_param.local_solver),
+                nullptr);
+        }
+        ASSERT_EQ(res_param.generated_local_solver,
+                  ans_param.generated_local_solver);
+    }
+};
+
+
+#endif  // GINKGO_BUILD_MPI
+
+
 template <typename T>
 class Preconditioner : public ::testing::Test {
 protected:
@@ -309,12 +374,14 @@ class Preconditioner : public ::testing::Test {
           l_solver(DummyIr::build().on(exec)),
           u_solver(DummyIr::build().on(exec)),
           factorization(DummyIr::build().on(exec)),
+          linop(gko::matrix::Dense<>::create(exec)),
           reg()
     {
         reg.emplace("solver", solver_factory);
         reg.emplace("l_solver", l_solver);
         reg.emplace("u_solver", u_solver);
         reg.emplace("factorization", factorization);
+        reg.emplace("linop", linop);
     }
 
     std::shared_ptr<const gko::Executor> exec;
@@ -323,11 +390,16 @@ class Preconditioner : public ::testing::Test {
     std::shared_ptr<typename DummyIr::Factory> l_solver;
     std::shared_ptr<typename DummyIr::Factory> u_solver;
     std::shared_ptr<typename DummyIr::Factory> factorization;
+    std::shared_ptr<gko::LinOp> linop;
     registry reg;
 };
 
 
-using PreconditionerTypes = ::testing::Types<::Ic, ::Ilu, ::Isai, ::Jacobi>;
+using PreconditionerTypes = ::testing::Types<
+#if GINKGO_BUILD_MPI
+    ::Schwarz,
+#endif  // GINKGO_BUILD_MPI
+    ::Ic, ::Ilu, ::Isai, ::Jacobi>;
 
 
 TYPED_TEST_SUITE(Preconditioner, PreconditionerTypes, TypenameNameGenerator);
diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp
index 86505df51d1..f044d60716f 100644
--- a/core/test/config/type_descriptor.cpp
+++ b/core/test/config/type_descriptor.cpp
@@ -29,7 +29,7 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float32");
         ASSERT_EQ(td.get_index_typestr(), "int32");
-        ASSERT_EQ(td.get_global_index_typestr(), "int");
+        ASSERT_EQ(td.get_global_index_typestr(), "int32");
     }
     {
         SCOPED_TRACE("specify valuetype");

From 4761e53c7e9bad9803fa24a158e11901c06833ff Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 23 Aug 2024 18:05:21 +0200
Subject: [PATCH 156/448] update documentation and use macro

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
---
 core/config/type_descriptor.cpp               | 79 +++++--------------
 core/test/config/type_descriptor.cpp          | 40 ++++++++--
 .../ginkgo/core/config/type_descriptor.hpp    | 22 ++++--
 3 files changed, 67 insertions(+), 74 deletions(-)

diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index 017cc98ca06..fe11b785d6f 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -5,6 +5,7 @@
 #include "ginkgo/core/config/type_descriptor.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/types.hpp>
 
 #include "core/config/type_descriptor_helper.hpp"
 
@@ -45,67 +46,18 @@ type_descriptor make_type_descriptor()
                            type_string<GlobalIndexType>::str()};
 }
 
-// global_index: void
-template type_descriptor make_type_descriptor<void, void, void>();
-template type_descriptor make_type_descriptor<float, void, void>();
-template type_descriptor make_type_descriptor<double, void, void>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, void, void>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, void, void>();
-template type_descriptor make_type_descriptor<void, int32, void>();
-template type_descriptor make_type_descriptor<float, int32, void>();
-template type_descriptor make_type_descriptor<double, int32, void>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, int32, void>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, int32, void>();
-template type_descriptor make_type_descriptor<void, int64, void>();
-template type_descriptor make_type_descriptor<float, int64, void>();
-template type_descriptor make_type_descriptor<double, int64, void>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, int64, void>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, int64, void>();
-
-// global_index int32
-template type_descriptor make_type_descriptor<void, void, int32>();
-template type_descriptor make_type_descriptor<float, void, int32>();
-template type_descriptor make_type_descriptor<double, void, int32>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, void, int32>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, void, int32>();
-template type_descriptor make_type_descriptor<void, int32, int32>();
-template type_descriptor make_type_descriptor<float, int32, int32>();
-template type_descriptor make_type_descriptor<double, int32, int32>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, int32, int32>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, int32, int32>();
-
-// global_index_type int64
-template type_descriptor make_type_descriptor<void, void, int64>();
-template type_descriptor make_type_descriptor<float, void, int64>();
-template type_descriptor make_type_descriptor<double, void, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, void, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, void, int64>();
-template type_descriptor make_type_descriptor<void, int32, int64>();
-template type_descriptor make_type_descriptor<float, int32, int64>();
-template type_descriptor make_type_descriptor<double, int32, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, int32, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, int32, int64>();
-template type_descriptor make_type_descriptor<void, int64, int64>();
-template type_descriptor make_type_descriptor<float, int64, int64>();
-template type_descriptor make_type_descriptor<double, int64, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<float>, int64, int64>();
-template type_descriptor
-make_type_descriptor<std::complex<double>, int64, int64>();
+#define GKO_DECLARE_MAKE_TYPE_DESCRIPTOR(ValueType, LocalIndexType, \
+                                         GlobalIndexType)           \
+    type_descriptor                                                 \
+    make_type_descriptor<ValueType, LocalIndexType, GlobalIndexType>()
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_MAKE_TYPE_DESCRIPTOR);
+
+#define GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR(LocalIndexType, GlobalIndexType) \
+    type_descriptor                                                            \
+    make_type_descriptor<void, LocalIndexType, GlobalIndexType>()
+GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR);
 
 
 type_descriptor::type_descriptor(std::string value_typestr,
@@ -126,6 +78,11 @@ const std::string& type_descriptor::get_index_typestr() const
     return index_typestr_;
 }
 
+const std::string& type_descriptor::get_local_index_typestr() const
+{
+    return this->get_index_typestr();
+}
+
 const std::string& type_descriptor::get_global_index_typestr() const
 {
     return global_index_typestr_;
diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp
index f044d60716f..e8a7327a6a2 100644
--- a/core/test/config/type_descriptor.cpp
+++ b/core/test/config/type_descriptor.cpp
@@ -21,6 +21,7 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float64");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
         ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
@@ -29,6 +30,7 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float32");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
         ASSERT_EQ(td.get_global_index_typestr(), "int32");
     }
     {
@@ -37,24 +39,37 @@ TEST(TypeDescriptor, TemplateCreate)
 
         ASSERT_EQ(td.get_value_typestr(), "float32");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
         ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
-        SCOPED_TRACE("specify all template");
+        SCOPED_TRACE("specify local index template");
         auto td =
             make_type_descriptor<std::complex<float>, gko::int64, gko::int64>();
 
         ASSERT_EQ(td.get_value_typestr(), "complex<float32>");
         ASSERT_EQ(td.get_index_typestr(), "int64");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
         ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
+    {
+        SCOPED_TRACE("specify global index template");
+        auto td =
+            make_type_descriptor<std::complex<float>, gko::int32, gko::int32>();
+
+        ASSERT_EQ(td.get_value_typestr(), "complex<float32>");
+        ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int32");
+    }
     {
         SCOPED_TRACE("specify void");
-        auto td = make_type_descriptor<void, void, void>();
+        auto td = make_type_descriptor<void>();
 
         ASSERT_EQ(td.get_value_typestr(), "void");
-        ASSERT_EQ(td.get_index_typestr(), "void");
-        ASSERT_EQ(td.get_global_index_typestr(), "void");
+        ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
 }
 
@@ -67,6 +82,8 @@ TEST(TypeDescriptor, Constructor)
 
         ASSERT_EQ(td.get_value_typestr(), "float64");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
         SCOPED_TRACE("specify valuetype");
@@ -74,12 +91,25 @@ TEST(TypeDescriptor, Constructor)
 
         ASSERT_EQ(td.get_value_typestr(), "float32");
         ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
     }
     {
-        SCOPED_TRACE("specify all parameters");
+        SCOPED_TRACE("specify local index parameters");
         type_descriptor td("void", "int64");
 
         ASSERT_EQ(td.get_value_typestr(), "void");
         ASSERT_EQ(td.get_index_typestr(), "int64");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int64");
+    }
+    {
+        SCOPED_TRACE("specify global index parameters");
+        type_descriptor td("void", "int32", "int32");
+
+        ASSERT_EQ(td.get_value_typestr(), "void");
+        ASSERT_EQ(td.get_index_typestr(), "int32");
+        ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr());
+        ASSERT_EQ(td.get_global_index_typestr(), "int32");
     }
 }
diff --git a/include/ginkgo/core/config/type_descriptor.hpp b/include/ginkgo/core/config/type_descriptor.hpp
index aa75b4591fa..5c719340436 100644
--- a/include/ginkgo/core/config/type_descriptor.hpp
+++ b/include/ginkgo/core/config/type_descriptor.hpp
@@ -25,15 +25,16 @@ namespace config {
  * auto cg = parse(config, context, type_descriptor("float64", "int32"));
  * ```
  * will have the value type `float64` and the index type `int32`. Any Ginkgo
- * object that does not require one of these types will just ignore it. The
- * value `void` can be used to specify that no default type is provided. In this
- * case, the configuration has to provide the necessary template types.
+ * object that does not require one of these types will just ignore it. For the
+ * value type only, the additional value `void` can be used to specify that no
+ * default type is provided. In this case, the configuration has to provide the
+ * necessary template types.
  *
  * If the configuration specifies one field (only allow value_type now):
  * ```
  * value_type: "some_value_type"
  * ```
- * these types will take precedence over the type_descriptor.
+ * this type will take precedence over the type_descriptor.
  */
 class type_descriptor final {
 public:
@@ -42,9 +43,8 @@ class type_descriptor final {
      * `make_type_descriptor` to create the object by template.
      *
      * @param value_typestr  the value type string. "void" means no default.
-     * @param index_typestr  the index type string. "void" means no default.
-     * @param global_index_typestr  the global index type string. "void" means
-     * no default.
+     * @param index_typestr  the (local) index type string.
+     * @param global_index_typestr  the global index type string.
      *
      * @note there is no way to call the constructor with explicit template, so
      * we create another free function to handle it.
@@ -63,6 +63,12 @@ class type_descriptor final {
      */
     const std::string& get_index_typestr() const;
 
+    /**
+     * Get the local index type string, which gives the same result as
+     * get_index_typestr()
+     */
+    const std::string& get_local_index_typestr() const;
+
     /**
      * Get the global index type string
      */
@@ -83,7 +89,7 @@ class type_descriptor final {
  * @tparam IndexType  the index type in descriptor
  * @tparam GlobalIndexType  the global index type in descriptor
  */
-template <typename ValueType = double, typename IndexType = int,
+template <typename ValueType = double, typename IndexType = int32,
           typename GlobalIndexType = int64>
 type_descriptor make_type_descriptor();
 

From aab84e9fc2e329ab87a3bb3464e1b14d093d9b26 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 21 Aug 2024 15:38:17 +0200
Subject: [PATCH 157/448] remove assertion workaround

This causes some kernels on ROCm debug builds to fail
---
 include/ginkgo/core/base/types.hpp | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 8e2096c09e2..e375da15f9c 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -51,30 +51,8 @@
 #endif
 
 
-#if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \
-    defined(__HIP_DEVICE_COMPILE__)
-
-#ifdef NDEBUG
-#define GKO_ASSERT(condition) ((void)0)
-#else  // NDEBUG
-// Poor man's assertions on GPUs for MACs. They won't terminate the program
-// but will at least print something on the screen
-#define GKO_ASSERT(condition)                                               \
-    ((condition)                                                            \
-         ? ((void)0)                                                        \
-         : ((void)printf("%s: %d: %s: Assertion `" #condition "' failed\n", \
-                         __FILE__, __LINE__, __func__)))
-#endif  // NDEBUG
-
-#else  // (defined(__CUDA_ARCH__) && defined(__APPLE__)) ||
-       // defined(__HIP_DEVICE_COMPILE__)
-
-// Handle assertions normally on other systems
 #define GKO_ASSERT(condition) assert(condition)
 
-#endif  // (defined(__CUDA_ARCH__) && defined(__APPLE__)) ||
-        // defined(__HIP_DEVICE_COMPILE__)
-
 
 // Handle deprecated notices correctly on different systems
 // clang-format off

From 18eb9d7bf3aa750db981fa203b31753f2503bb4d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sat, 24 Aug 2024 10:47:42 +0200
Subject: [PATCH 158/448] fix ROCm 6.x segfaults on MI50

There is some weird interaction between inlining of shfl_xor and the
(otherwise unused) members of thread_block_tile.
The easiest way of working around it is to inline them explicitly as
__shfl_xor(_sync).
---
 common/cuda_hip/components/sorting.hpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp
index 7603d41a8ba..c15c4a70c64 100644
--- a/common/cuda_hip/components/sorting.hpp
+++ b/common/cuda_hip/components/sorting.hpp
@@ -113,11 +113,15 @@ struct bitonic_warp {
 
     __forceinline__ __device__ static void merge(ValueType* els, bool reverse)
     {
-        auto tile =
-            group::tiled_partition<num_threads>(group::this_thread_block());
         auto new_reverse = reverse != upper_half();
         for (int i = 0; i < num_local; ++i) {
-            auto other = tile.shfl_xor(els[i], num_threads / 2);
+            // workaround for ROCm 6.x segfaults on gfx906
+#ifdef GKO_COMPILING_CUDA
+            auto other = __shfl_xor_sync(config::full_lane_mask, els[i],
+                                         num_threads / 2);
+#else
+            auto other = __shfl_xor(els[i], num_threads / 2);
+#endif
             bitonic_cas(els[i], other, new_reverse);
         }
         half::merge(els, reverse);

From 71cd5eec814fe3c061bdb9cb3c3172e5c014a0c9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 25 Aug 2024 16:56:20 +0200
Subject: [PATCH 159/448] more precise shuffle bounds

---
 common/cuda_hip/components/sorting.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp
index c15c4a70c64..76694541c2d 100644
--- a/common/cuda_hip/components/sorting.hpp
+++ b/common/cuda_hip/components/sorting.hpp
@@ -118,9 +118,9 @@ struct bitonic_warp {
             // workaround for ROCm 6.x segfaults on gfx906
 #ifdef GKO_COMPILING_CUDA
             auto other = __shfl_xor_sync(config::full_lane_mask, els[i],
-                                         num_threads / 2);
+                                         num_threads / 2, num_threads);
 #else
-            auto other = __shfl_xor(els[i], num_threads / 2);
+            auto other = __shfl_xor(els[i], num_threads / 2, num_threads);
 #endif
             bitonic_cas(els[i], other, new_reverse);
         }

From 4a56f71db7e9e9bd339301074efbe3b1eedc1a95 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 23 Aug 2024 23:59:30 +0200
Subject: [PATCH 160/448] [kernels] fix odr violations

---
 common/cuda_hip/base/batch_multi_vector_kernels.hpp | 7 +++++++
 common/cuda_hip/matrix/batch_csr_kernels.hpp        | 7 +++++++
 common/cuda_hip/matrix/batch_dense_kernels.hpp      | 7 +++++++
 common/cuda_hip/matrix/batch_ell_kernels.hpp        | 7 +++++++
 dpcpp/base/batch_multi_vector_kernels.hpp           | 7 +++++++
 dpcpp/matrix/batch_csr_kernels.hpp                  | 7 +++++++
 dpcpp/matrix/batch_dense_kernels.hpp                | 7 +++++++
 dpcpp/matrix/batch_ell_kernels.hpp                  | 7 +++++++
 reference/base/batch_multi_vector_kernels.hpp       | 7 +++++++
 reference/matrix/batch_csr_kernels.hpp              | 7 +++++++
 reference/matrix/batch_dense_kernels.hpp            | 7 +++++++
 reference/matrix/batch_ell_kernels.hpp              | 7 +++++++
 12 files changed, 84 insertions(+)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 7583cc72292..3f5763474c2 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+
+
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -315,3 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp
index 64611559715..5ed66c59d14 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_
+
+
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -200,3 +204,6 @@ __global__ void add_scaled_identity_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp
index e4cd24bbd78..7902d6010fa 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_
+
+
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -247,3 +251,6 @@ __global__ void add_scaled_identity_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp
index 52826957ddb..f32144dc172 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_
+
+
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -210,3 +214,6 @@ __global__ void add_scaled_identity_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index bbcc540ae60..142eba259de 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+#define GKO_DPCPP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+
+
 #include <memory>
 
 #include <CL/sycl.hpp>
@@ -257,3 +261,6 @@ __dpct_inline__ void copy_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_DPCPP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
diff --git a/dpcpp/matrix/batch_csr_kernels.hpp b/dpcpp/matrix/batch_csr_kernels.hpp
index f51124f81a4..2b195de308b 100644
--- a/dpcpp/matrix/batch_csr_kernels.hpp
+++ b/dpcpp/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_MATRIX_BATCH_CSR_KERNELS_HPP_
+#define GKO_DPCPP_MATRIX_BATCH_CSR_KERNELS_HPP_
+
+
 #include <memory>
 
 #include <CL/sycl.hpp>
@@ -108,3 +112,6 @@ __dpct_inline__ void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_DPCPP_MATRIX_BATCH_CSR_KERNELS_HPP_
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp b/dpcpp/matrix/batch_dense_kernels.hpp
index acf1e65939d..59aee9a7208 100644
--- a/dpcpp/matrix/batch_dense_kernels.hpp
+++ b/dpcpp/matrix/batch_dense_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_MATRIX_BATCH_DENSE_KERNELS_HPP_
+#define GKO_DPCPP_MATRIX_BATCH_DENSE_KERNELS_HPP_
+
+
 #include <memory>
 
 #include <CL/sycl.hpp>
@@ -170,3 +174,6 @@ __dpct_inline__ void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_DPCPP_MATRIX_BATCH_DENSE_KERNELS_HPP_
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp b/dpcpp/matrix/batch_ell_kernels.hpp
index 48ab9318bdf..5a1ba163216 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp
+++ b/dpcpp/matrix/batch_ell_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_MATRIX_BATCH_ELL_KERNELS_HPP_
+#define GKO_DPCPP_MATRIX_BATCH_ELL_KERNELS_HPP_
+
+
 #include <memory>
 
 #include <CL/sycl.hpp>
@@ -119,3 +123,6 @@ __dpct_inline__ void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_DPCPP_MATRIX_BATCH_ELL_KERNELS_HPP_
diff --git a/reference/base/batch_multi_vector_kernels.hpp b/reference/base/batch_multi_vector_kernels.hpp
index 88f531f29cc..140072fd301 100644
--- a/reference/base/batch_multi_vector_kernels.hpp
+++ b/reference/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+#define GKO_REFERENCE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+
+
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -149,3 +153,6 @@ inline void copy_kernel(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_REFERENCE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
diff --git a/reference/matrix/batch_csr_kernels.hpp b/reference/matrix/batch_csr_kernels.hpp
index e04b2bdf345..8f1bfe400e3 100644
--- a/reference/matrix/batch_csr_kernels.hpp
+++ b/reference/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_MATRIX_BATCH_CSR_KERNELS_HPP_
+#define GKO_REFERENCE_MATRIX_BATCH_CSR_KERNELS_HPP_
+
+
 #include <algorithm>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
@@ -99,3 +103,6 @@ inline void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_REFERENCE_MATRIX_BATCH_CSR_KERNELS_HPP_
diff --git a/reference/matrix/batch_dense_kernels.hpp b/reference/matrix/batch_dense_kernels.hpp
index e12827c77de..7fd6a8cdbb5 100644
--- a/reference/matrix/batch_dense_kernels.hpp
+++ b/reference/matrix/batch_dense_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_
+#define GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_
+
+
 #include <algorithm>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
@@ -128,3 +132,6 @@ inline void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_
diff --git a/reference/matrix/batch_ell_kernels.hpp b/reference/matrix/batch_ell_kernels.hpp
index 71bd1ce851a..cfdc2040d8f 100644
--- a/reference/matrix/batch_ell_kernels.hpp
+++ b/reference/matrix/batch_ell_kernels.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_MATRIX_BATCH_ELL_KERNELS_HPP_
+#define GKO_REFERENCE_MATRIX_BATCH_ELL_KERNELS_HPP_
+
+
 #include <algorithm>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
@@ -114,3 +118,6 @@ inline void add_scaled_identity(
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif  // GKO_REFERENCE_MATRIX_BATCH_ELL_KERNELS_HPP_

From 0e6106b7f93e9bdcf0d6850bd167c13855de3828 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 00:01:04 +0200
Subject: [PATCH 161/448] [cuda, hip] move inc to full headers

---
 ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 96 +++++++++++++------
 ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 75 +++++++++++----
 cuda/solver/batch_bicgstab_kernels.cu         | 31 +++---
 cuda/solver/batch_cg_kernels.cu               | 31 +++---
 hip/solver/batch_bicgstab_kernels.hip.cpp     | 13 +--
 hip/solver/batch_cg_kernels.hip.cpp           | 13 +--
 6 files changed, 151 insertions(+), 108 deletions(-)
 rename common/cuda_hip/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (81%)
 rename common/cuda_hip/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (78%)

diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
similarity index 81%
rename from common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
rename to common/cuda_hip/solver/batch_bicgstab_kernels.hpp
index d4ce149d394..cbab8ed6961 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename Group, typename BatchMatrixType_entry, typename ValueType>
 __device__ __forceinline__ void initialize(
     Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry,
@@ -27,20 +63,18 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+    batch_single_kernels::advanced_apply(
         static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
         static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry,
-                                     res_norm);
+        batch_single_kernels::single_rhs_compute_norm2(
+            subgroup, num_rows, r_shared_entry, res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                     rhs_norm);
+        batch_single_kernels::single_rhs_compute_norm2(
+            subgroup, num_rows, b_global_entry, rhs_norm);
     }
     __syncthreads();
 
@@ -75,9 +109,8 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
-                                        v_shared_entry, alpha);
+        batch_single_kernels::single_rhs_compute_conj_dot(
+            subgroup, num_rows, r_hat_shared_entry, v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -105,13 +138,11 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                        s_shared_entry, omega);
+        batch_single_kernels::single_rhs_compute_conj_dot(
+            subgroup, num_rows, t_shared_entry, s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                        t_shared_entry, temp);
+        batch_single_kernels::single_rhs_compute_conj_dot(
+            subgroup, num_rows, t_shared_entry, t_shared_entry, temp);
     }
 
     __syncthreads();
@@ -279,9 +310,8 @@ __global__ void apply_kernel(
 
             // rho_new =  < r_hat , r > = (r_hat)' * (r)
             if (threadIdx.x / config::warp_size == 0) {
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                    single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh,
-                                                r_sh, rho_new_sh[0]);
+                batch_single_kernels::single_rhs_compute_conj_dot(
+                    subgroup, num_rows, r_hat_sh, r_sh, rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -296,8 +326,7 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // v = A * p_hat
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                simple_apply(mat_entry, p_hat_sh, v_sh);
+            batch_single_kernels::simple_apply(mat_entry, p_hat_sh, v_sh);
             __syncthreads();
 
             // alpha = rho_new / < r_hat , v>
@@ -311,9 +340,8 @@ __global__ void apply_kernel(
 
             // an estimate of residual norms
             if (threadIdx.x / config::warp_size == 0) {
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                    single_rhs_compute_norm2(subgroup, num_rows, s_sh,
-                                             norms_res_sh[0]);
+                batch_single_kernels::single_rhs_compute_norm2(
+                    subgroup, num_rows, s_sh, norms_res_sh[0]);
             }
             __syncthreads();
 
@@ -329,8 +357,7 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // t = A * s_hat
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                simple_apply(mat_entry, s_hat_sh, t_sh);
+            batch_single_kernels::simple_apply(mat_entry, s_hat_sh, t_sh);
             __syncthreads();
 
             // omega = <t,s> / <t,t>
@@ -345,9 +372,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             if (threadIdx.x / config::warp_size == 0) {
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                    single_rhs_compute_norm2(subgroup, num_rows, r_sh,
-                                             norms_res_sh[0]);
+                batch_single_kernels::single_rhs_compute_norm2(
+                    subgroup, num_rows, r_sh, norms_res_sh[0]);
             }
             //__syncthreads();
 
@@ -360,8 +386,16 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp
similarity index 78%
rename from common/cuda_hip/solver/batch_cg_kernels.hpp.inc
rename to common/cuda_hip/solver/batch_cg_kernels.hpp
index 4f4b382f552..e7ec0505844 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename Group, typename BatchMatrixType_entry, typename PrecType,
           typename ValueType>
 __device__ __forceinline__ void initialize(
@@ -22,7 +58,7 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+    batch_single_kernels::advanced_apply(
         static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
         static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
@@ -33,14 +69,13 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                     rhs_norms_sh);
+        batch_single_kernels::single_rhs_compute_norm2(
+            subgroup, num_rows, b_global_entry, rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
-                                        z_shared_entry, rho_old_shared_entry);
+        batch_single_kernels::single_rhs_compute_conj_dot(
+            subgroup, num_rows, r_shared_entry, z_shared_entry,
+            rho_old_shared_entry);
     }
 
     // p = z
@@ -72,9 +107,9 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
-                                        Ap_shared_entry, alpha_shared_entry);
+        batch_single_kernels::single_rhs_compute_conj_dot(
+            subgroup, num_rows, p_shared_entry, Ap_shared_entry,
+            alpha_shared_entry);
     }
     __syncthreads();
 
@@ -190,8 +225,7 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
             }
 
             // Ap = A * p
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                simple_apply(mat_entry, p_sh, Ap_sh);
+            batch_single_kernels::simple_apply(mat_entry, p_sh, Ap_sh);
             __syncthreads();
 
             // alpha = rho_old / (p' * Ap)
@@ -207,9 +241,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
             if (threadIdx.x / config::warp_size == 0) {
                 // rho_new =  (r)' * (z)
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                    single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
-                                                rho_new_sh[0]);
+                batch_single_kernels::single_rhs_compute_conj_dot(
+                    subgroup, num_rows, r_sh, z_sh, rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -228,8 +261,16 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_copy(num_rows, x_sh, x_global_entry);
+        batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 35d567fd911..d3dc8712201 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -24,6 +24,7 @@
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -32,19 +33,9 @@
 namespace gko {
 namespace kernels {
 namespace cuda {
-
-
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
-
-
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
@@ -56,9 +47,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 9, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -80,13 +72,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                            cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                            exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 9, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 9, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }
 
@@ -116,7 +109,7 @@ public:
         value_type* const __restrict__ workspace_data, const int& block_size,
         const size_t& shared_size) const
     {
-        apply_kernel<StopType, n_shared, prec_shared_bool>
+        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
             <<<mat.num_batch_items, block_size, shared_size,
                exec_->get_stream()>>>(sconf, settings_.max_iterations,
                                       settings_.residual_tol, logger, prec, mat,
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index f26f2d37313..b8ead675a3c 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -23,6 +23,7 @@
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -31,19 +32,9 @@
 namespace gko {
 namespace kernels {
 namespace cuda {
-
-
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
 
 
-#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
-
-
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
@@ -55,9 +46,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -79,13 +71,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                            cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                            exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }
 
@@ -115,7 +108,7 @@ public:
         value_type* const __restrict__ workspace_data, const int& block_size,
         const size_t& shared_size) const
     {
-        apply_kernel<StopType, n_shared, prec_shared_bool>
+        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
             <<<mat.num_batch_items, block_size, shared_size,
                exec_->get_stream()>>>(sconf, settings_.max_iterations,
                                       settings_.residual_tol, logger, prec, mat,
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index a5de10953bc..d44bc4a0eb6 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -25,6 +25,7 @@
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -33,19 +34,9 @@
 namespace gko {
 namespace kernels {
 namespace hip {
-
-
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
-
-
 template <typename BatchMatrixType>
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows)
@@ -96,7 +87,7 @@ class kernel_caller {
         value_type* const __restrict__ workspace_data, const int& block_size,
         const size_t& shared_size) const
     {
-        apply_kernel<StopType, n_shared, prec_shared_bool>
+        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
             <<<mat.num_batch_items, block_size, shared_size,
                exec_->get_stream()>>>(sconf, settings_.max_iterations,
                                       settings_.residual_tol, logger, prec, mat,
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 23bb939ead8..c9a1e81be81 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -25,6 +25,7 @@
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -33,19 +34,9 @@
 namespace gko {
 namespace kernels {
 namespace hip {
-
-
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
 
 
-#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
-
-
 template <typename BatchMatrixType>
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows)
@@ -96,7 +87,7 @@ class kernel_caller {
         value_type* const __restrict__ workspace_data, const int& block_size,
         const size_t& shared_size) const
     {
-        apply_kernel<StopType, n_shared, prec_shared_bool>
+        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
             <<<mat.num_batch_items, block_size, shared_size,
                exec_->get_stream()>>>(sconf, settings_.max_iterations,
                                       settings_.residual_tol, logger, prec, mat,

From 9dafdbfee3fa992fe303461df175746078606299 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 00:27:51 +0200
Subject: [PATCH 162/448] [ref, omp] move kernels to headers

---
 omp/solver/batch_bicgstab_kernels.cpp         | 17 +---
 omp/solver/batch_cg_kernels.cpp               | 14 ++-
 reference/matrix/batch_dense_kernels.hpp      |  2 -
 reference/matrix/batch_ell_kernels.hpp        |  2 +-
 reference/solver/batch_bicgstab_kernels.cpp   |  8 +-
 ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 88 ++++++++++++-------
 reference/solver/batch_cg_kernels.cpp         | 11 ++-
 ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 66 ++++++++++----
 8 files changed, 123 insertions(+), 85 deletions(-)
 rename reference/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (84%)
 rename reference/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (80%)

diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index 661cdbcd2ec..ed880507116 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -13,22 +13,13 @@
 #include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/solver/batch_bicgstab_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
 namespace batch_bicgstab {
-namespace {
-
-
-constexpr int max_num_rhs = 1;
-
-
-#include "reference/solver/batch_bicgstab_kernels.hpp.inc"
-
-
-}  // unnamed namespace
 
 
 template <typename T>
@@ -54,7 +45,7 @@ class kernel_caller {
         const size_type num_batch_items = mat.num_batch_items;
         const auto num_rows = mat.num_rows;
         const auto num_rhs = b.num_rhs;
-        if (num_rhs > max_num_rhs) {
+        if (num_rhs > 1) {
             GKO_NOT_IMPLEMENTED;
         }
 
@@ -73,8 +64,8 @@ class kernel_caller {
                 exec_, local_size_bytes,
                 local_space.get_data() +
                     omp_get_thread_num() * local_size_bytes);
-            batch_entry_bicgstab_impl<StopType, PrecondType, LogType,
-                                      BatchMatrixType, ValueType>(
+            batch_single_kernels::batch_entry_bicgstab_impl<
+                StopType, PrecondType, LogType, BatchMatrixType, ValueType>(
                 settings_, logger, precond, mat, b, x, batch_id,
                 thread_local_space.get_data());
         }
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 3a6e31256c2..89d4441db64 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -13,6 +13,7 @@
 #include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
@@ -25,9 +26,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/solver/batch_cg_kernels.hpp.inc"
-
-
 }  // unnamed namespace
 
 
@@ -54,7 +52,7 @@ class kernel_caller {
         const size_type num_batch_items = mat.num_batch_items;
         const auto num_rows = mat.num_rows;
         const auto num_rhs = b.num_rhs;
-        if (num_rhs > max_num_rhs) {
+        if (num_rhs > 1) {
             GKO_NOT_IMPLEMENTED;
         }
 
@@ -72,10 +70,10 @@ class kernel_caller {
                 exec_, local_size_bytes,
                 local_space.get_data() +
                     omp_get_thread_num() * local_size_bytes);
-            batch_entry_cg_impl<StopType, PrecondType, LogType, BatchMatrixType,
-                                ValueType>(settings_, logger, precond, mat, b,
-                                           x, batch_id,
-                                           thread_local_space.get_data());
+            batch_single_kernels::batch_entry_cg_impl<
+                StopType, PrecondType, LogType, BatchMatrixType, ValueType>(
+                settings_, logger, precond, mat, b, x, batch_id,
+                thread_local_space.get_data());
         }
     }
 
diff --git a/reference/matrix/batch_dense_kernels.hpp b/reference/matrix/batch_dense_kernels.hpp
index 7fd6a8cdbb5..bc4e7c497cd 100644
--- a/reference/matrix/batch_dense_kernels.hpp
+++ b/reference/matrix/batch_dense_kernels.hpp
@@ -6,8 +6,6 @@
 #define GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_
 
 
-#include <algorithm>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
diff --git a/reference/matrix/batch_ell_kernels.hpp b/reference/matrix/batch_ell_kernels.hpp
index cfdc2040d8f..d6892c67f32 100644
--- a/reference/matrix/batch_ell_kernels.hpp
+++ b/reference/matrix/batch_ell_kernels.hpp
@@ -9,7 +9,7 @@
 #include <algorithm>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 33e1e9392d9..20883e24434 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -9,6 +9,7 @@
 #include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/solver/batch_bicgstab_kernels.hpp"
 
 
 namespace gko {
@@ -21,9 +22,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/solver/batch_bicgstab_kernels.hpp.inc"
-
-
 }  // unnamed namespace
 
 
@@ -62,8 +60,8 @@ class kernel_caller {
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
-            batch_entry_bicgstab_impl<StopType, PrecType, LogType,
-                                      BatchMatrixType, ValueType>(
+            batch_single_kernels::batch_entry_bicgstab_impl<
+                StopType, PrecType, LogType, BatchMatrixType, ValueType>(
                 settings_, logger, prec, mat, b, x, batch_id,
                 local_space.get_data());
         }
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp
similarity index 84%
rename from reference/solver/batch_bicgstab_kernels.hpp.inc
rename to reference/solver/batch_bicgstab_kernels.hpp
index 786e98eb5d1..f91e06d2e44 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp.inc
+++ b/reference/solver/batch_bicgstab_kernels.hpp
@@ -2,6 +2,29 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+#define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
+constexpr int max_num_rhs = 1;
+
+
 template <typename BatchMatrixType_entry, typename ValueType>
 inline void initialize(
     const BatchMatrixType_entry& A_entry,
@@ -25,20 +48,18 @@ inline void initialize(
     alpha_entry.values[0] = one<ValueType>();
 
     // Compute norms of rhs
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    batch_single_kernels::compute_norm2_kernel<ValueType>(b_entry,
+                                                          rhs_norms_entry);
 
     // r = b
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-        b_entry, r_entry);
+    batch_single_kernels::copy_kernel(b_entry, r_entry);
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
-        static_cast<ValueType>(1.0), r_entry);
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                        res_norms_entry);
+    batch_single_kernels::advanced_apply(static_cast<ValueType>(-1.0), A_entry,
+                                         gko::batch::to_const(x_entry),
+                                         static_cast<ValueType>(1.0), r_entry);
+    batch_single_kernels::compute_norm2_kernel<ValueType>(
+        gko::batch::to_const(r_entry), res_norms_entry);
 
     for (int r = 0; r < p_entry.num_rows; r++) {
         r_hat_entry.values[r * r_hat_entry.stride] =
@@ -78,9 +99,8 @@ inline void compute_alpha(
     const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry)
 {
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry,
-                                              alpha_entry);
+    batch_single_kernels::compute_dot_product_kernel<ValueType>(
+        r_hat_entry, v_entry, alpha_entry);
     alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0];
 }
 
@@ -107,10 +127,10 @@ inline void compute_omega(
     const gko::batch::multi_vector::batch_item<ValueType>& temp_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& omega_entry)
 {
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
+    batch_single_kernels::compute_dot_product_kernel<ValueType>(
+        t_entry, s_entry, omega_entry);
+    batch_single_kernels::compute_dot_product_kernel<ValueType>(
+        t_entry, t_entry, temp_entry);
     omega_entry.values[0] /= temp_entry.values[0];
 }
 
@@ -253,10 +273,9 @@ inline void batch_entry_bicgstab_impl(
         }
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_dot_product_kernel<ValueType>(
-                gko::batch::to_const(r_hat_entry),
-                gko::batch::to_const(r_entry), rho_new_entry);
+        batch_single_kernels::compute_dot_product_kernel<ValueType>(
+            gko::batch::to_const(r_hat_entry), gko::batch::to_const(r_entry),
+            rho_new_entry);
 
         // beta = (rho_new / rho_old)*(alpha / omega)
         // p = r + beta*(p - omega * v)
@@ -271,7 +290,7 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(p_entry), p_hat_entry);
 
         // v = A * p_hat
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+        batch_single_kernels::simple_apply(
             A_entry, gko::batch::to_const(p_hat_entry), v_entry);
 
         // alpha = rho_new / < r_hat , v>
@@ -285,9 +304,8 @@ inline void batch_entry_bicgstab_impl(
                  gko::batch::to_const(v_entry), s_entry);
 
         // an estimate of residual norms
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
-                                            res_norms_entry);
+        batch_single_kernels::compute_norm2_kernel<ValueType>(
+            gko::batch::to_const(s_entry), res_norms_entry);
 
         if (stop.check_converged(res_norms_entry.values)) {
             // update x for the systems
@@ -303,7 +321,7 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(s_entry), s_hat_entry);
 
         // t = A * s_hat
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+        batch_single_kernels::simple_apply(
             A_entry, gko::batch::to_const(s_hat_entry), t_entry);
         // omega = <t,s> / <t,t>
         compute_omega(gko::batch::to_const(t_entry),
@@ -319,14 +337,22 @@ inline void batch_entry_bicgstab_impl(
                        gko::batch::to_const(s_entry),
                        gko::batch::to_const(t_entry), x_entry, r_entry);
 
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                            res_norms_entry);
+        batch_single_kernels::compute_norm2_kernel<ValueType>(
+            gko::batch::to_const(r_entry), res_norms_entry);
 
         // rho_old = rho_new
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-            gko::batch::to_const(rho_new_entry), rho_old_entry);
+        batch_single_kernels::copy_kernel(gko::batch::to_const(rho_new_entry),
+                                          rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 7c69157d4a7..f2155f98719 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -9,6 +9,7 @@
 #include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
@@ -21,9 +22,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/solver/batch_cg_kernels.hpp.inc"
-
-
 }  // unnamed namespace
 
 
@@ -62,9 +60,10 @@ class kernel_caller {
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
-            batch_entry_cg_impl<StopType, PrecType, LogType, BatchMatrixType,
-                                ValueType>(settings_, logger, prec, mat, b, x,
-                                           batch_id, local_space.get_data());
+            batch_single_kernels::batch_entry_cg_impl<
+                StopType, PrecType, LogType, BatchMatrixType, ValueType>(
+                settings_, logger, prec, mat, b, x, batch_id,
+                local_space.get_data());
         }
     }
 
diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp
similarity index 80%
rename from reference/solver/batch_cg_kernels.hpp.inc
rename to reference/solver/batch_cg_kernels.hpp
index 991db5c061c..d4a35e3d01a 100644
--- a/reference/solver/batch_cg_kernels.hpp.inc
+++ b/reference/solver/batch_cg_kernels.hpp
@@ -2,6 +2,29 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_SOLVER_BATCH_CG_KERNELS_HPP_
+#define GKO_REFERENCE_SOLVER_BATCH_CG_KERNELS_HPP_
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
+constexpr int max_num_rhs = 1;
+
+
 template <typename BatchMatrixType_entry, typename ValueType>
 inline void initialize(
     const BatchMatrixType_entry& A_entry,
@@ -26,17 +49,16 @@ inline void initialize(
     }
 
     // Compute norms of rhs
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    batch_single_kernels::compute_norm2_kernel<ValueType>(b_entry,
+                                                          rhs_norms_entry);
 
     // r = b
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-        b_entry, r_entry);
+    batch_single_kernels::copy_kernel(b_entry, r_entry);
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
-        static_cast<ValueType>(1.0), r_entry);
+    batch_single_kernels::advanced_apply(static_cast<ValueType>(-1.0), A_entry,
+                                         gko::batch::to_const(x_entry),
+                                         static_cast<ValueType>(1.0), r_entry);
 }
 
 
@@ -48,8 +70,7 @@ inline void update_p(
     const gko::batch::multi_vector::batch_item<ValueType>& p_entry)
 {
     if (rho_old_entry.values[0] == zero<ValueType>()) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-            z_entry, p_entry);
+        batch_single_kernels::copy_kernel(z_entry, p_entry);
         return;
     }
     const ValueType beta = rho_new_entry.values[0] / rho_old_entry.values[0];
@@ -70,9 +91,8 @@ inline void update_x_and_r(
     const gko::batch::multi_vector::batch_item<ValueType>& x_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& r_entry)
 {
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_conj_dot_product_kernel<ValueType>(p_entry, Ap_entry,
-                                                   alpha_entry);
+    batch_single_kernels::compute_conj_dot_product_kernel<ValueType>(
+        p_entry, Ap_entry, alpha_entry);
 
     const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0];
     for (int row = 0; row < r_entry.num_rows; row++) {
@@ -159,10 +179,9 @@ inline void batch_entry_cg_impl(
         prec.apply(gko::batch::to_const(r_entry), z_entry);
 
         // rho_new =  < r , z > = (r)' * (z)
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_conj_dot_product_kernel<ValueType>(
-                gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
-                rho_new_entry);
+        batch_single_kernels::compute_conj_dot_product_kernel<ValueType>(
+            gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
+            rho_new_entry);
         ++iter;
         // use implicit residual norms
         res_norms_entry.values[0] = sqrt(abs(rho_new_entry.values[0]));
@@ -181,7 +200,7 @@ inline void batch_entry_cg_impl(
                  gko::batch::to_const(z_entry), p_entry);
 
         // Ap = A * p
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+        batch_single_kernels::simple_apply(
             A_entry, gko::batch::to_const(p_entry), Ap_entry);
 
         // temp= rho_old / (p' * Ap)
@@ -192,9 +211,18 @@ inline void batch_entry_cg_impl(
             gko::batch::to_const(Ap_entry), alpha_entry, x_entry, r_entry);
 
         // rho_old = rho_new
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-            gko::batch::to_const(rho_new_entry), rho_old_entry);
+        batch_single_kernels::copy_kernel(gko::batch::to_const(rho_new_entry),
+                                          rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif

From fce448ed87a8fb87068265244de2a94f1d2443fc Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 00:36:46 +0200
Subject: [PATCH 163/448] [dpcpp] move to headers

---
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |  7 +-
 ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 41 +++++++++++
 dpcpp/solver/batch_cg_kernels.dp.cpp          |  7 +-
 ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 72 ++++++++++++++-----
 4 files changed, 101 insertions(+), 26 deletions(-)
 rename dpcpp/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (93%)
 rename dpcpp/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (82%)

diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 291ee1d8a8b..7036b770f1b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -27,6 +27,7 @@
 #include "dpcpp/matrix/batch_dense_kernels.hpp"
 #include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
+#include "dpcpp/solver/batch_bicgstab_kernels.hpp"
 
 
 namespace gko {
@@ -35,9 +36,6 @@ namespace dpcpp {
 namespace batch_bicgstab {
 
 
-#include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc"
-
-
 template <typename T>
 using settings = gko::kernels::batch_bicgstab::settings<T>;
 
@@ -95,7 +93,8 @@ class kernel_caller {
                     ValueType* const x_global_entry =
                         gko::batch::multi_vector::batch_item_ptr(
                             x_values, 1, num_rows, batch_id);
-                    apply_kernel<StopType, n_shared_total>(
+                    batch_single_kernels::apply_kernel<StopType,
+                                                       n_shared_total>(
                         sconf, max_iters, res_tol, logger, prec,
                         mat_global_entry, b_global_entry, x_global_entry,
                         num_rows, mat.get_single_item_num_nnz(),
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp
similarity index 93%
rename from dpcpp/solver/batch_bicgstab_kernels.hpp.inc
rename to dpcpp/solver/batch_bicgstab_kernels.hpp
index de1956c8c6c..a6db9e7470a 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp
@@ -2,6 +2,38 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+#define GKO_DPCPP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename BatchMatrixType_entry, typename ValueType>
 __dpct_inline__ void initialize(
     const int num_rows, const BatchMatrixType_entry& mat_global_entry,
@@ -393,3 +425,12 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 05b3f7b803c..9d3aa14ab2c 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -27,6 +27,7 @@
 #include "dpcpp/matrix/batch_dense_kernels.hpp"
 #include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
+#include "dpcpp/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
@@ -35,9 +36,6 @@ namespace dpcpp {
 namespace batch_cg {
 
 
-#include "dpcpp/solver/batch_cg_kernels.hpp.inc"
-
-
 template <typename T>
 using settings = gko::kernels::batch_cg::settings<T>;
 
@@ -95,7 +93,8 @@ class kernel_caller {
                     ValueType* const x_global_entry =
                         gko::batch::multi_vector::batch_item_ptr(
                             x_values, 1, num_rows, batch_id);
-                    apply_kernel<StopType, n_shared_total>(
+                    batch_single_kernels::apply_kernel<StopType,
+                                                       n_shared_total>(
                         sconf, max_iters, res_tol, logger, prec,
                         mat_global_entry, b_global_entry, x_global_entry,
                         num_rows, mat.get_single_item_num_nnz(),
diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp
similarity index 82%
rename from dpcpp/solver/batch_cg_kernels.hpp.inc
rename to dpcpp/solver/batch_cg_kernels.hpp
index b233b7df680..67df0a17236 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp.inc
+++ b/dpcpp/solver/batch_cg_kernels.hpp
@@ -2,6 +2,38 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_SOLVER_BATCH_CG_KERNELS_HPP_
+#define GKO_DPCPP_SOLVER_BATCH_CG_KERNELS_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename PrecType, typename ValueType, typename BatchMatrixType>
 __dpct_inline__ void initialize(
     const int num_rows, const BatchMatrixType& mat_global_entry,
@@ -27,7 +59,7 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+    batch_single_kernels::advanced_apply(
         static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
         static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
@@ -40,13 +72,11 @@ __dpct_inline__ void initialize(
     // Compute norms of rhs
     // and rho_old = r' * z
     if (sg_id == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
-                                        item_ct1);
+        batch_single_kernels::single_rhs_compute_norm2_sg(
+            num_rows, b_global_entry, rhs_norms, item_ct1);
     } else if (sg_id == 1) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry,
-                                           z_shared_entry, rho_old, item_ct1);
+        batch_single_kernels::single_rhs_compute_conj_dot_sg(
+            num_rows, r_shared_entry, z_shared_entry, rho_old, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -82,10 +112,9 @@ __dpct_inline__ void update_x_and_r(
     auto sg = item_ct1.get_sub_group();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg.get_group_id() == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
-                                           Ap_shared_entry, alpha_shared_entry,
-                                           item_ct1);
+        batch_single_kernels::single_rhs_compute_conj_dot_sg(
+            num_rows, p_shared_entry, Ap_shared_entry, alpha_shared_entry,
+            item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -207,8 +236,8 @@ __dpct_inline__ void apply_kernel(
             break;
         }
         // Ap = A * p
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
-            mat_global_entry, p_sh, Ap_sh, item_ct1);
+        batch_single_kernels::simple_apply(mat_global_entry, p_sh, Ap_sh,
+                                           item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_old / (p' * Ap)
@@ -225,9 +254,8 @@ __dpct_inline__ void apply_kernel(
 
         //  rho_new =  (r)' * (z)
         if (sg_id == 0) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh,
-                                               rho_new_sh[0], item_ct1);
+            batch_single_kernels::single_rhs_compute_conj_dot_sg(
+                num_rows, r_sh, z_sh, rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -244,7 +272,15 @@ __dpct_inline__ void apply_kernel(
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-        num_rows, x_sh, x_global_entry, item_ct1);
+    batch_single_kernels::copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_SOLVER_BATCH_CG_KERNELS_HPP_

From d018241d4ec868da24c8d4e66baa5f040b2055a1 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 12:44:59 +0200
Subject: [PATCH 164/448] [cuda, hip] reorg batch preconds

---
 ..._jacobi.hpp.inc => batch_block_jacobi.hpp} | 47 +++++++++++
 .../preconditioner/batch_identity.hpp         | 78 +++++++++++++++++++
 .../preconditioner/batch_identity.hpp.inc     | 33 --------
 ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 45 +++++++++++
 .../preconditioner/batch_preconditioners.hpp  | 17 ++++
 ...jacobi.hpp.inc => batch_scalar_jacobi.hpp} | 46 +++++++++++
 core/solver/batch_dispatch.hpp                |  6 +-
 cuda/preconditioner/batch_jacobi_kernels.cu   | 20 ++---
 cuda/preconditioner/batch_preconditioners.cuh | 32 --------
 .../batch_jacobi_kernels.hip.cpp              | 18 +++--
 .../batch_preconditioners.hip.hpp             | 32 --------
 11 files changed, 255 insertions(+), 119 deletions(-)
 rename common/cuda_hip/preconditioner/{batch_block_jacobi.hpp.inc => batch_block_jacobi.hpp} (81%)
 create mode 100644 common/cuda_hip/preconditioner/batch_identity.hpp
 delete mode 100644 common/cuda_hip/preconditioner/batch_identity.hpp.inc
 rename common/cuda_hip/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (87%)
 create mode 100644 common/cuda_hip/preconditioner/batch_preconditioners.hpp
 rename common/cuda_hip/preconditioner/{batch_scalar_jacobi.hpp.inc => batch_scalar_jacobi.hpp} (77%)
 delete mode 100644 cuda/preconditioner/batch_preconditioners.cuh
 delete mode 100644 hip/preconditioner/batch_preconditioners.hip.hpp

diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
similarity index 81%
rename from common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc
rename to common/cuda_hip/preconditioner/batch_block_jacobi.hpp
index 124f1ee93a1..5aff975e960 100644
--- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc
+++ b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
+#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
 /**
  * BlockJacobi preconditioner for batch solvers.
  */
@@ -173,3 +211,12 @@ class BlockJacobi final {
     const int* __restrict__ const block_ptrs_arr_;
     const int* __restrict__ const row_block_map_;
 };
+
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp
new file mode 100644
index 00000000000..634d3212f36
--- /dev/null
+++ b/common/cuda_hip/preconditioner/batch_identity.hpp
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_
+#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
+/**
+ * Copies r into z unchanged. @see reference/preconditioner/batch_identity.hpp
+ */
+template <typename ValueType>
+class Identity final {
+public:
+    using value_type = ValueType;
+
+    static constexpr int work_size = 0;
+
+    __host__ __device__ static constexpr int dynamic_work_size(int, int)
+    {
+        return 0;  // always zero, independent of both arguments
+    }
+
+    template <typename batch_item_type>
+    __device__ __forceinline__ void generate(size_type, const batch_item_type&,
+                                             ValueType*)
+    {}  // intentionally empty: every argument is ignored
+
+    __device__ __forceinline__ void apply(const int num_rows,
+                                          const ValueType* const r,
+                                          ValueType* const z) const
+    {
+        for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+            z[li] = r[li];  // element-wise copy, stride blockDim.x per thread
+        }
+    }
+};
+
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_
diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp.inc b/common/cuda_hip/preconditioner/batch_identity.hpp.inc
deleted file mode 100644
index b85a8b1d7da..00000000000
--- a/common/cuda_hip/preconditioner/batch_identity.hpp.inc
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-/**
- * @see reference/preconditioner/batch_identity.hpp
- */
-template <typename ValueType>
-class Identity final {
-public:
-    using value_type = ValueType;
-
-    static constexpr int work_size = 0;
-
-    __host__ __device__ static constexpr int dynamic_work_size(int, int)
-    {
-        return 0;
-    }
-
-    template <typename batch_item_type>
-    __device__ __forceinline__ void generate(size_type, const batch_item_type&,
-                                             ValueType*)
-    {}
-
-    __device__ __forceinline__ void apply(const int num_rows,
-                                          const ValueType* const r,
-                                          ValueType* const z) const
-    {
-        for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
-            z[li] = r[li];
-        }
-    }
-};
diff --git a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
similarity index 87%
rename from common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
index a26a2077c2d..ac9143fefb9 100644
--- a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 __global__ void compute_block_storage_kernel(
     const gko::size_type num_blocks,
     const int* const __restrict__ block_pointers,
@@ -243,3 +279,12 @@ __launch_bounds__(default_block_size) void compute_block_jacobi_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
diff --git a/common/cuda_hip/preconditioner/batch_preconditioners.hpp b/common/cuda_hip/preconditioner/batch_preconditioners.hpp
new file mode 100644
index 00000000000..fc1d3fd2c9e
--- /dev/null
+++ b/common/cuda_hip/preconditioner/batch_preconditioners.hpp
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
+#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
+
+
+#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp"
+#include "common/cuda_hip/preconditioner/batch_identity.hpp"
+#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+
+
+#endif  // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
similarity index 77%
rename from common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
rename to common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
index 751c2696e15..695d31235a8 100644
--- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
@@ -2,6 +2,43 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
+#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
 /**
  * (Scalar) Jacobi preconditioner for batch solvers.
  */
@@ -132,3 +169,12 @@ class ScalarJacobi final {
 private:
     value_type* __restrict__ work_;
 };
+
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 599c708b334..178f6b1beae 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -26,9 +26,8 @@
 
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/preconditioner/batch_preconditioners.hpp"
 #include "cuda/log/batch_logger.cuh"
-#include "cuda/preconditioner/batch_preconditioners.cuh"
 #include "cuda/stop/batch_criteria.cuh"
 
 
@@ -54,9 +53,8 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/preconditioner/batch_preconditioners.hpp"
 #include "hip/log/batch_logger.hip.hpp"
-#include "hip/preconditioner/batch_preconditioners.hip.hpp"
 #include "hip/stop/batch_criteria.hip.hpp"
 
 
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index edf052cb649..8768937dc6d 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -12,6 +12,7 @@
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -29,8 +30,6 @@ namespace gko {
 namespace kernels {
 namespace cuda {
 namespace batch_jacobi {
-
-
 namespace {
 
 
@@ -39,8 +38,6 @@ constexpr int default_block_size = 128;
 using batch_jacobi_cuda_compiled_max_block_sizes =
     gko::kernels::cuda::jacobi::compiled_kernels;
 
-#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc"
-
 
 }  // namespace
 
@@ -54,8 +51,9 @@ void compute_cumulative_block_storage(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
 
-    compute_block_storage_kernel<<<grid, block, 0, exec->get_stream()>>>(
-        num_blocks, block_pointers, blocks_cumulative_offsets);
+    batch_single_kernels::
+        compute_block_storage_kernel<<<grid, block, 0, exec->get_stream()>>>(
+            num_blocks, block_pointers, blocks_cumulative_offsets);
 
     components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets,
                                        num_blocks + 1);
@@ -73,8 +71,9 @@ void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
-    find_row_block_map_kernel<<<grid, block, 0, exec->get_stream()>>>(
-        num_blocks, block_pointers, map_block_to_row);
+    batch_single_kernels::
+        find_row_block_map_kernel<<<grid, block, 0, exec->get_stream()>>>(
+            num_blocks, block_pointers, map_block_to_row);
 }
 
 GKO_INSTANTIATE_FOR_INT32_TYPE(
@@ -93,7 +92,8 @@ void extract_common_blocks_pattern(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(nrows * config::warp_size, default_block_size));
 
-    extract_common_block_pattern_kernel<<<grid, block, 0, exec->get_stream()>>>(
+    batch_single_kernels::extract_common_block_pattern_kernel<<<
+        grid, block, 0, exec->get_stream()>>>(
         static_cast<int>(nrows), first_sys_csr->get_const_row_ptrs(),
         first_sys_csr->get_const_col_idxs(), num_blocks,
         cumulative_block_storage, block_pointers, map_block_to_row,
@@ -125,7 +125,7 @@ void compute_block_jacobi_helper(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks * nbatch * subwarp_size, default_block_size));
 
-    compute_block_jacobi_kernel<subwarp_size>
+    batch_single_kernels::compute_block_jacobi_kernel<subwarp_size>
         <<<grid, block, 0, exec->get_stream()>>>(
             nbatch, static_cast<int>(nnz),
             as_cuda_type(sys_csr->get_const_values()), num_blocks,
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
deleted file mode 100644
index 01001c036b2..00000000000
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
-#define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
-
-
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "core/preconditioner/batch_jacobi_helpers.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_preconditioner {
-
-
-#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc"
-#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc"
-#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc"
-
-
-}  // namespace batch_preconditioner
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index 38a81972e66..2380bc6a0bd 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -15,6 +15,7 @@
 #include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -40,8 +41,6 @@ constexpr int default_block_size = 128;
 using batch_jacobi_hip_compiled_max_block_sizes =
     gko::kernels::hip::jacobi::compiled_kernels;
 
-#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc"
-
 }  // namespace
 
 
@@ -54,8 +53,9 @@ void compute_cumulative_block_storage(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
 
-    compute_block_storage_kernel<<<grid, block, 0, exec->get_stream()>>>(
-        num_blocks, block_pointers, blocks_cumulative_offsets);
+    batch_single_kernels::
+        compute_block_storage_kernel<<<grid, block, 0, exec->get_stream()>>>(
+            num_blocks, block_pointers, blocks_cumulative_offsets);
 
     components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets,
                                        num_blocks + 1);
@@ -73,8 +73,9 @@ void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
-    find_row_block_map_kernel<<<grid, block, 0, exec->get_stream()>>>(
-        num_blocks, block_pointers, map_block_to_row);
+    batch_single_kernels::
+        find_row_block_map_kernel<<<grid, block, 0, exec->get_stream()>>>(
+            num_blocks, block_pointers, map_block_to_row);
 }
 
 GKO_INSTANTIATE_FOR_INT32_TYPE(
@@ -93,7 +94,8 @@ void extract_common_blocks_pattern(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(nrows * config::warp_size, default_block_size));
 
-    extract_common_block_pattern_kernel<<<grid, block, 0, exec->get_stream()>>>(
+    batch_single_kernels::extract_common_block_pattern_kernel<<<
+        grid, block, 0, exec->get_stream()>>>(
         static_cast<int>(nrows), first_sys_csr->get_const_row_ptrs(),
         first_sys_csr->get_const_col_idxs(), num_blocks,
         cumulative_block_storage, block_pointers, map_block_to_row,
@@ -126,7 +128,7 @@ void compute_block_jacobi_helper(
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks * nbatch * subwarp_size, default_block_size));
 
-    compute_block_jacobi_kernel<subwarp_size>
+    batch_single_kernels::compute_block_jacobi_kernel<subwarp_size>
         <<<grid, block, 0, exec->get_stream()>>>(
             nbatch, static_cast<int>(nnz),
             as_hip_type(sys_csr->get_const_values()), num_blocks,
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
deleted file mode 100644
index f62000ff46f..00000000000
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
-#define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
-
-
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "core/preconditioner/batch_jacobi_helpers.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_preconditioner {
-
-
-#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc"
-#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc"
-#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc"
-
-
-}  // namespace batch_preconditioner
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_

From f6cc8b1876f4ac83740abd94b24bae83c3c68997 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 12:45:20 +0200
Subject: [PATCH 165/448] [ref, omp] reorg batch preconds

---
 omp/preconditioner/batch_jacobi_kernels.cpp   | 22 +++++---------
 .../preconditioner/batch_jacobi_kernels.cpp   | 22 +++++---------
 ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 30 +++++++++++++++++++
 3 files changed, 44 insertions(+), 30 deletions(-)
 rename reference/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (88%)

diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp
index 9dfe06be32b..90c8f0c1865 100644
--- a/omp/preconditioner/batch_jacobi_kernels.cpp
+++ b/omp/preconditioner/batch_jacobi_kernels.cpp
@@ -10,6 +10,7 @@
 #include "reference/base/batch_struct.hpp"
 #include "reference/matrix/batch_struct.hpp"
 #include "reference/preconditioner/batch_block_jacobi.hpp"
+#include "reference/preconditioner/batch_jacobi_kernels.hpp"
 #include "reference/preconditioner/batch_scalar_jacobi.hpp"
 
 
@@ -19,16 +20,6 @@ namespace omp {
 namespace batch_jacobi {
 
 
-namespace {
-
-
-// Note: Do not change the ordering
-#include "reference/preconditioner/batch_jacobi_kernels.hpp.inc"
-
-
-}  // unnamed namespace
-
-
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
@@ -78,8 +69,9 @@ void extract_common_blocks_pattern(
 {
 #pragma omp parallel for
     for (size_type k = 0; k < num_blocks; k++) {
-        extract_block_pattern_impl(k, first_sys_csr, cumulative_block_storage,
-                                   block_pointers, blocks_pattern);
+        batch_single_kernels::extract_block_pattern_impl(
+            k, first_sys_csr, cumulative_block_storage, block_pointers,
+            blocks_pattern);
     }
 }
 
@@ -105,9 +97,9 @@ void compute_block_jacobi(
 
         const auto A_entry =
             gko::batch::matrix::extract_batch_item(A_batch, batch_idx);
-        compute_block_jacobi_impl(batch_idx, block_idx, A_entry, num_blocks,
-                                  cumulative_block_storage, block_pointers,
-                                  blocks_pattern, blocks);
+        batch_single_kernels::compute_block_jacobi_impl(
+            batch_idx, block_idx, A_entry, num_blocks, cumulative_block_storage,
+            block_pointers, blocks_pattern, blocks);
     }
 }
 
diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp
index 3c03a21fae7..a012e019b41 100644
--- a/reference/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/preconditioner/batch_jacobi_kernels.cpp
@@ -10,6 +10,7 @@
 #include "reference/base/batch_struct.hpp"
 #include "reference/matrix/batch_struct.hpp"
 #include "reference/preconditioner/batch_block_jacobi.hpp"
+#include "reference/preconditioner/batch_jacobi_kernels.hpp"
 #include "reference/preconditioner/batch_scalar_jacobi.hpp"
 
 
@@ -19,16 +20,6 @@ namespace reference {
 namespace batch_jacobi {
 
 
-namespace {
-
-
-// Note: Do not change the ordering
-#include "reference/preconditioner/batch_jacobi_kernels.hpp.inc"
-
-
-}  // unnamed namespace
-
-
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
@@ -74,8 +65,9 @@ void extract_common_blocks_pattern(
     IndexType* const blocks_pattern)
 {
     for (size_type k = 0; k < num_blocks; k++) {
-        extract_block_pattern_impl(k, first_sys_csr, cumulative_block_storage,
-                                   block_pointers, blocks_pattern);
+        batch_single_kernels::extract_block_pattern_impl(
+            k, first_sys_csr, cumulative_block_storage, block_pointers,
+            blocks_pattern);
     }
 }
 
@@ -98,9 +90,9 @@ void compute_block_jacobi(
         for (size_type k = 0; k < num_blocks; k++) {
             const auto A_entry =
                 gko::batch::matrix::extract_batch_item(A_batch, batch_idx);
-            compute_block_jacobi_impl(batch_idx, k, A_entry, num_blocks,
-                                      cumulative_block_storage, block_pointers,
-                                      blocks_pattern, blocks);
+            batch_single_kernels::compute_block_jacobi_impl(
+                batch_idx, k, A_entry, num_blocks, cumulative_block_storage,
+                block_pointers, blocks_pattern, blocks);
         }
     }
 }
diff --git a/reference/preconditioner/batch_jacobi_kernels.hpp.inc b/reference/preconditioner/batch_jacobi_kernels.hpp
similarity index 88%
rename from reference/preconditioner/batch_jacobi_kernels.hpp.inc
rename to reference/preconditioner/batch_jacobi_kernels.hpp
index 0f04841bc7b..ee44f21eb97 100644
--- a/reference/preconditioner/batch_jacobi_kernels.hpp.inc
+++ b/reference/preconditioner/batch_jacobi_kernels.hpp
@@ -2,6 +2,27 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_REFERENCE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+#define GKO_REFERENCE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
 inline void extract_block_pattern_impl(
     const size_type k, const matrix::Csr<ValueType, int>* const first_sys_csr,
@@ -164,3 +185,12 @@ inline void compute_block_jacobi_impl(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif

From a0b83052fef218a465d6b1dfd7f64ee7b171b0b4 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 12:57:14 +0200
Subject: [PATCH 166/448] [dpcpp] reorg batch preconds

---
 ..._jacobi.hpp.inc => batch_block_jacobi.hpp} | 40 +++++++++++
 dpcpp/preconditioner/batch_identity.hpp       | 72 +++++++++++++++++++
 dpcpp/preconditioner/batch_identity.hpp.inc   | 31 --------
 .../batch_jacobi_kernels.dp.cpp               | 41 +++++------
 ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 42 +++++++++++
 .../preconditioner/batch_preconditioners.hpp  | 20 +-----
 ...jacobi.hpp.inc => batch_scalar_jacobi.hpp} | 42 +++++++++++
 7 files changed, 218 insertions(+), 70 deletions(-)
 rename dpcpp/preconditioner/{batch_block_jacobi.hpp.inc => batch_block_jacobi.hpp} (81%)
 create mode 100644 dpcpp/preconditioner/batch_identity.hpp
 delete mode 100644 dpcpp/preconditioner/batch_identity.hpp.inc
 rename dpcpp/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (87%)
 rename dpcpp/preconditioner/{batch_scalar_jacobi.hpp.inc => batch_scalar_jacobi.hpp} (82%)

diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp.inc b/dpcpp/preconditioner/batch_block_jacobi.hpp
similarity index 81%
rename from dpcpp/preconditioner/batch_block_jacobi.hpp.inc
rename to dpcpp/preconditioner/batch_block_jacobi.hpp
index 442914b3933..b01de33c299 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp.inc
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -2,6 +2,39 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
+#define GKO_DPCPP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
 /**
  * BlockJacobi preconditioner for batch solvers.
  */
@@ -129,3 +162,10 @@ class BlockJacobi final {
     const int* __restrict__ const block_ptrs_arr_;
     const int* __restrict__ const row_block_map_;
 };
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif
diff --git a/dpcpp/preconditioner/batch_identity.hpp b/dpcpp/preconditioner/batch_identity.hpp
new file mode 100644
index 00000000000..0696d028059
--- /dev/null
+++ b/dpcpp/preconditioner/batch_identity.hpp
@@ -0,0 +1,72 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_IDENTITY_HPP_
+#define GKO_DPCPP_PRECONDITIONER_BATCH_IDENTITY_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
+/**
+ * @see reference/preconditioner/batch_identity.hpp
+ */
+template <typename ValueType>
+class Identity final {
+public:
+    using value_type = ValueType;
+
+    static constexpr int work_size = 0;
+
+    static int dynamic_work_size(int, int) { return 0; }
+
+    template <typename batch_item_type>
+    void generate(size_type, const batch_item_type&, ValueType*,
+                  sycl::nd_item<3> item_ct1)
+    {}
+
+    __dpct_inline__ void apply(const int num_rows, const ValueType* const r,
+                               ValueType* const z,
+                               sycl::nd_item<3> item_ct1) const
+    {
+        for (int li = item_ct1.get_local_linear_id(); li < num_rows;
+             li += item_ct1.get_local_range().size()) {
+            z[li] = r[li];
+        }
+    }
+};
+
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/dpcpp/preconditioner/batch_identity.hpp.inc b/dpcpp/preconditioner/batch_identity.hpp.inc
deleted file mode 100644
index 4b5314363da..00000000000
--- a/dpcpp/preconditioner/batch_identity.hpp.inc
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-/**
- * @see reference/preconditioner/batch_identity.hpp
- */
-template <typename ValueType>
-class Identity final {
-public:
-    using value_type = ValueType;
-
-    static constexpr int work_size = 0;
-
-    static int dynamic_work_size(int, int) { return 0; }
-
-    template <typename batch_item_type>
-    void generate(size_type, const batch_item_type&, ValueType*,
-                  sycl::nd_item<3> item_ct1)
-    {}
-
-    __dpct_inline__ void apply(const int num_rows, const ValueType* const r,
-                               ValueType* const z,
-                               sycl::nd_item<3> item_ct1) const
-    {
-        for (int li = item_ct1.get_local_linear_id(); li < num_rows;
-             li += item_ct1.get_local_range().size()) {
-            z[li] = r[li];
-        }
-    }
-};
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
index e66e7141a47..d85f93e74f2 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
@@ -16,6 +16,7 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
+#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
 
 
@@ -23,16 +24,12 @@ namespace gko {
 namespace kernels {
 namespace dpcpp {
 namespace batch_jacobi {
-
-
 namespace {
 
 
 using batch_jacobi_dpcpp_compiled_max_block_sizes =
     gko::kernels::dpcpp::jacobi::compiled_kernels;
 
-#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc"
-
 
 }  // namespace
 
@@ -96,15 +93,15 @@ void extract_common_blocks_pattern(
     const auto col_idxs = first_sys_csr->get_const_col_idxs();
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl_nd_range(grid, block),
-                         [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(subgroup_size)]] {
-                                 extract_common_block_pattern_kernel(
-                                     static_cast<int>(nrows), row_ptrs,
-                                     col_idxs, num_blocks,
-                                     cumulative_block_storage, block_pointers,
-                                     map_block_row, blocks_pattern, item_ct1);
-                             });
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(subgroup_size)]] {
+                    batch_single_kernels::extract_common_block_pattern_kernel(
+                        static_cast<int>(nrows), row_ptrs, col_idxs, num_blocks,
+                        cumulative_block_storage, block_pointers, map_block_row,
+                        blocks_pattern, item_ct1);
+                });
     });
 }
 
@@ -142,15 +139,15 @@ void compute_block_jacobi_helper(
     dim3 grid(ceildiv(num_blocks * nbatch * subgroup_size, group_size));
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl_nd_range(grid, block),
-                         [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(subgroup_size)]] {
-                                 compute_block_jacobi_kernel(
-                                     nbatch, static_cast<int>(nnz),
-                                     sys_csr_values, num_blocks,
-                                     cumulative_block_storage, block_pointers,
-                                     blocks_pattern, blocks, item_ct1);
-                             });
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(subgroup_size)]] {
+                    batch_single_kernels::compute_block_jacobi_kernel(
+                        nbatch, static_cast<int>(nnz), sys_csr_values,
+                        num_blocks, cumulative_block_storage, block_pointers,
+                        blocks_pattern, blocks, item_ct1);
+                });
     });
 }
 
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
similarity index 87%
rename from dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc
rename to dpcpp/preconditioner/batch_jacobi_kernels.hpp
index 930850aaf1a..b8c75c9efa0 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
@@ -2,6 +2,39 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+#define GKO_DPCPP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 __dpct_inline__ void extract_common_block_pattern_kernel(
     const int nrows, const int* const __restrict__ sys_row_ptrs,
     const int* const __restrict__ sys_col_idxs, const size_type num_blocks,
@@ -203,3 +236,12 @@ __dpct_inline__ void compute_block_jacobi_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/dpcpp/preconditioner/batch_preconditioners.hpp b/dpcpp/preconditioner/batch_preconditioners.hpp
index 607cd7fa7bf..208e35b21b3 100644
--- a/dpcpp/preconditioner/batch_preconditioners.hpp
+++ b/dpcpp/preconditioner/batch_preconditioners.hpp
@@ -8,23 +8,9 @@
 
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace dpcpp {
-namespace batch_preconditioner {
-
-
-#include "dpcpp/preconditioner/batch_block_jacobi.hpp.inc"
-#include "dpcpp/preconditioner/batch_identity.hpp.inc"
-#include "dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc"
-
-
-}  // namespace batch_preconditioner
-}  // namespace dpcpp
-}  // namespace kernels
-}  // namespace gko
+#include "dpcpp/preconditioner/batch_block_jacobi.hpp"
+#include "dpcpp/preconditioner/batch_identity.hpp"
+#include "dpcpp/preconditioner/batch_scalar_jacobi.hpp"
 
 
 #endif  // GKO_DPCPP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
similarity index 82%
rename from dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
rename to dpcpp/preconditioner/batch_scalar_jacobi.hpp
index 3bb652a5032..c8963c7b592 100644
--- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
@@ -2,6 +2,39 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
+#define GKO_DPCPP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
+
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/preconditioner/batch_jacobi_helpers.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_preconditioner {
+
+
 /**
  * (Scalar) Jacobi preconditioner for batch solvers.
  */
@@ -134,3 +167,12 @@ class ScalarJacobi final {
 private:
     value_type* __restrict__ work_;
 };
+
+
+}  // namespace batch_preconditioner
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif

From bf6349965078d52da867371d29c8487c578ba344 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 14:47:24 +0200
Subject: [PATCH 167/448] [cuda, hip] reorg batch_criteria

---
 ...ch_criteria.hpp.inc => batch_criteria.hpp} | 21 +++++++++++++++
 cuda/stop/batch_criteria.cuh                  | 26 -------------------
 hip/stop/batch_criteria.hip.hpp               | 26 -------------------
 3 files changed, 21 insertions(+), 52 deletions(-)
 rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (77%)
 delete mode 100644 cuda/stop/batch_criteria.cuh
 delete mode 100644 hip/stop/batch_criteria.hip.hpp

diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp
similarity index 77%
rename from common/cuda_hip/stop/batch_criteria.hpp.inc
rename to common/cuda_hip/stop/batch_criteria.hpp
index 38072467765..a7ae2005cc0 100644
--- a/common/cuda_hip/stop/batch_criteria.hpp.inc
+++ b/common/cuda_hip/stop/batch_criteria.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
+#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_stop {
+
+
 /**
  * @see reference/stop/batch_criteria.hpp
  */
@@ -49,3 +62,11 @@ class SimpleAbsResidual {
 private:
     const real_type abs_tol_;
 };
+
+
+}  // namespace batch_stop
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif
diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh
deleted file mode 100644
index f4f434dda11..00000000000
--- a/cuda/stop/batch_criteria.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp
deleted file mode 100644
index 1f721e36aaf..00000000000
--- a/hip/stop/batch_criteria.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_

From 1f46611bc33d6805cbd920ae4a12b2cefb78ad53 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 24 Aug 2024 14:47:51 +0200
Subject: [PATCH 168/448] [cuda, hip] reorg batch_logger

---
 ...{batch_logger.hpp.inc => batch_logger.hpp} | 21 +++++++++++++++
 core/solver/batch_dispatch.hpp                |  8 +++---
 cuda/log/batch_logger.cuh                     | 27 -------------------
 hip/log/batch_logger.hip.hpp                  | 26 ------------------
 4 files changed, 25 insertions(+), 57 deletions(-)
 rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (71%)
 delete mode 100644 cuda/log/batch_logger.cuh
 delete mode 100644 hip/log/batch_logger.hip.hpp

diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp
similarity index 71%
rename from common/cuda_hip/log/batch_logger.hpp.inc
rename to common/cuda_hip/log/batch_logger.hpp
index 04b614b50f9..5e897b3c67d 100644
--- a/common/cuda_hip/log/batch_logger.hpp.inc
+++ b/common/cuda_hip/log/batch_logger.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
+#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_log {
+
 /**
  * @see reference/log/batch_logger.hpp
  */
@@ -28,3 +40,12 @@ class SimpleFinalLogger final {
     real_type* const final_residuals_;
     idx_type* const final_iters_;
 };
+
+
+}  // namespace batch_log
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 178f6b1beae..018a6674df5 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -25,10 +25,10 @@
 
 
 #include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/log/batch_logger.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/preconditioner/batch_preconditioners.hpp"
-#include "cuda/log/batch_logger.cuh"
-#include "cuda/stop/batch_criteria.cuh"
+#include "common/cuda_hip/stop/batch_criteria.hpp"
 
 
 namespace gko {
@@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 
 
 #include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/log/batch_logger.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/preconditioner/batch_preconditioners.hpp"
-#include "hip/log/batch_logger.hip.hpp"
-#include "hip/stop/batch_criteria.hip.hpp"
+#include "common/cuda_hip/stop/batch_criteria.hpp"
 
 
 namespace gko {
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
deleted file mode 100644
index 3e53d6ef0a6..00000000000
--- a/cuda/log/batch_logger.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_log {
-
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
deleted file mode 100644
index a2540f2bd9d..00000000000
--- a/hip/log/batch_logger.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_log {
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_

From 3a397d16f0c4facc3c028f7e5612d5f55c5e5cb5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 26 Aug 2024 10:51:39 +0200
Subject: [PATCH 169/448] [cuda, hip] remove unnecessary headers

---
 .../base/batch_multi_vector_kernels.hpp       |  2 -
 common/cuda_hip/matrix/batch_csr_kernels.cpp  |  3 --
 common/cuda_hip/matrix/batch_csr_kernels.hpp  |  7 ---
 .../cuda_hip/matrix/batch_dense_kernels.cpp   |  3 --
 .../cuda_hip/matrix/batch_dense_kernels.hpp   |  7 ---
 common/cuda_hip/matrix/batch_ell_kernels.cpp  |  3 --
 common/cuda_hip/matrix/batch_ell_kernels.hpp  |  8 ---
 .../preconditioner/batch_block_jacobi.hpp     |  8 ---
 .../preconditioner/batch_identity.hpp         | 13 -----
 .../preconditioner/batch_jacobi_kernels.hpp   |  8 ---
 .../preconditioner/batch_scalar_jacobi.hpp    |  8 ---
 .../solver/batch_bicgstab_kernels.hpp         | 50 +++++++------------
 common/cuda_hip/solver/batch_cg_kernels.hpp   | 35 +++++--------
 cuda/solver/batch_cg_kernels.cu               |  8 ---
 14 files changed, 31 insertions(+), 132 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 3f5763474c2..5c6210eeaed 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -21,9 +21,7 @@
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
 #include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.cpp
index 35dc2c17e03..d48cdbaf32a 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -4,9 +4,6 @@
 
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp
index 5ed66c59d14..b1520f2d808 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -22,11 +19,7 @@
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.cpp
index 44dad55aa70..ee4d87abaa3 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -4,9 +4,6 @@
 
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp
index 7902d6010fa..c9089bd9a80 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -19,12 +16,8 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.cpp
index c56325ab824..38d34707d45 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -4,9 +4,6 @@
 
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp
index f32144dc172..a9037f5144a 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -19,14 +16,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
index 5aff975e960..c01bafa875a 100644
--- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
+++ b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -19,14 +16,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp
index 634d3212f36..3d57bcae406 100644
--- a/common/cuda_hip/preconditioner/batch_identity.hpp
+++ b/common/cuda_hip/preconditioner/batch_identity.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -16,16 +13,6 @@
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
diff --git a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
index ac9143fefb9..9a1ea7458c8 100644
--- a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
+++ b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -18,14 +15,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
index 695d31235a8..42a4f3f6aa6 100644
--- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
+++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -19,14 +16,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
index cbab8ed6961..10d235358bc 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -5,10 +5,6 @@
 #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -18,14 +14,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
@@ -63,18 +54,15 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-        static_cast<ValueType>(1.0), r_shared_entry);
+    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+                   static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        batch_single_kernels::single_rhs_compute_norm2(
-            subgroup, num_rows, r_shared_entry, res_norm);
+        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        batch_single_kernels::single_rhs_compute_norm2(
-            subgroup, num_rows, b_global_entry, rhs_norm);
+        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
     }
     __syncthreads();
 
@@ -109,8 +97,8 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        batch_single_kernels::single_rhs_compute_conj_dot(
-            subgroup, num_rows, r_hat_shared_entry, v_shared_entry, alpha);
+        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                    v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -138,11 +126,11 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        batch_single_kernels::single_rhs_compute_conj_dot(
-            subgroup, num_rows, t_shared_entry, s_shared_entry, omega);
+        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                    s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        batch_single_kernels::single_rhs_compute_conj_dot(
-            subgroup, num_rows, t_shared_entry, t_shared_entry, temp);
+        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                    t_shared_entry, temp);
     }
 
     __syncthreads();
@@ -310,8 +298,8 @@ __global__ void apply_kernel(
 
             // rho_new =  < r_hat , r > = (r_hat)' * (r)
             if (threadIdx.x / config::warp_size == 0) {
-                batch_single_kernels::single_rhs_compute_conj_dot(
-                    subgroup, num_rows, r_hat_sh, r_sh, rho_new_sh[0]);
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
+                                            rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -326,7 +314,7 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // v = A * p_hat
-            batch_single_kernels::simple_apply(mat_entry, p_hat_sh, v_sh);
+            simple_apply(mat_entry, p_hat_sh, v_sh);
             __syncthreads();
 
             // alpha = rho_new / < r_hat , v>
@@ -340,8 +328,8 @@ __global__ void apply_kernel(
 
             // an estimate of residual norms
             if (threadIdx.x / config::warp_size == 0) {
-                batch_single_kernels::single_rhs_compute_norm2(
-                    subgroup, num_rows, s_sh, norms_res_sh[0]);
+                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                         norms_res_sh[0]);
             }
             __syncthreads();
 
@@ -357,7 +345,7 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // t = A * s_hat
-            batch_single_kernels::simple_apply(mat_entry, s_hat_sh, t_sh);
+            simple_apply(mat_entry, s_hat_sh, t_sh);
             __syncthreads();
 
             // omega = <t,s> / <t,t>
@@ -372,8 +360,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             if (threadIdx.x / config::warp_size == 0) {
-                batch_single_kernels::single_rhs_compute_norm2(
-                    subgroup, num_rows, r_sh, norms_res_sh[0]);
+                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                         norms_res_sh[0]);
             }
             //__syncthreads();
 
@@ -386,7 +374,7 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp
index e7ec0505844..7ccdc5f9926 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -18,14 +15,9 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/format_conversion.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/segment_scan.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
@@ -58,9 +50,8 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-        static_cast<ValueType>(1.0), r_shared_entry);
+    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+                   static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     // z = precond * r
@@ -69,13 +60,12 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        batch_single_kernels::single_rhs_compute_norm2(
-            subgroup, num_rows, b_global_entry, rhs_norms_sh);
+        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                 rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        batch_single_kernels::single_rhs_compute_conj_dot(
-            subgroup, num_rows, r_shared_entry, z_shared_entry,
-            rho_old_shared_entry);
+        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                    z_shared_entry, rho_old_shared_entry);
     }
 
     // p = z
@@ -107,9 +97,8 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        batch_single_kernels::single_rhs_compute_conj_dot(
-            subgroup, num_rows, p_shared_entry, Ap_shared_entry,
-            alpha_shared_entry);
+        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                    Ap_shared_entry, alpha_shared_entry);
     }
     __syncthreads();
 
@@ -225,7 +214,7 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
             }
 
             // Ap = A * p
-            batch_single_kernels::simple_apply(mat_entry, p_sh, Ap_sh);
+            simple_apply(mat_entry, p_sh, Ap_sh);
             __syncthreads();
 
             // alpha = rho_old / (p' * Ap)
@@ -241,8 +230,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
             if (threadIdx.x / config::warp_size == 0) {
                 // rho_new =  (r)' * (z)
-                batch_single_kernels::single_rhs_compute_conj_dot(
-                    subgroup, num_rows, r_sh, z_sh, rho_new_sh[0]);
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                            rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -261,7 +250,7 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_global_entry);
+        single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index b8ead675a3c..3f7dac1d08a 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -4,21 +4,13 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp"

From dcb72c3c54f766e7bc06a8d01b9e5e0ca81709a5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 26 Aug 2024 11:27:13 +0200
Subject: [PATCH 170/448] [dpcpp] remove headers and namespaces

---
 .../base/batch_multi_vector_kernels.hpp       |  4 --
 dpcpp/base/batch_multi_vector_kernels.hpp     |  1 -
 dpcpp/matrix/batch_csr_kernels.hpp            |  2 -
 dpcpp/matrix/batch_dense_kernels.hpp          |  2 -
 dpcpp/matrix/batch_ell_kernels.hpp            |  2 -
 dpcpp/preconditioner/batch_block_jacobi.hpp   |  6 --
 dpcpp/preconditioner/batch_identity.hpp       |  6 --
 dpcpp/preconditioner/batch_jacobi_kernels.hpp |  1 -
 dpcpp/preconditioner/batch_scalar_jacobi.hpp  |  6 --
 dpcpp/solver/batch_bicgstab_kernels.hpp       | 56 ++++++++-----------
 dpcpp/solver/batch_cg_kernels.hpp             | 30 +++++-----
 11 files changed, 36 insertions(+), 80 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 5c6210eeaed..63836280544 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -6,9 +6,6 @@
 #define GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
 
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -18,7 +15,6 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/reduction.hpp"
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index 142eba259de..74abaeda86f 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -17,7 +17,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
diff --git a/dpcpp/matrix/batch_csr_kernels.hpp b/dpcpp/matrix/batch_csr_kernels.hpp
index 2b195de308b..37dc5a2c52c 100644
--- a/dpcpp/matrix/batch_csr_kernels.hpp
+++ b/dpcpp/matrix/batch_csr_kernels.hpp
@@ -18,8 +18,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp b/dpcpp/matrix/batch_dense_kernels.hpp
index 59aee9a7208..a8f741bc3d0 100644
--- a/dpcpp/matrix/batch_dense_kernels.hpp
+++ b/dpcpp/matrix/batch_dense_kernels.hpp
@@ -18,8 +18,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp b/dpcpp/matrix/batch_ell_kernels.hpp
index 5a1ba163216..fb6bd3d8121 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp
+++ b/dpcpp/matrix/batch_ell_kernels.hpp
@@ -18,8 +18,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index b01de33c299..a7431f919a5 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -13,19 +13,13 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/matrix/batch_csr_kernels.hpp"
-#include "dpcpp/matrix/batch_dense_kernels.hpp"
-#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
diff --git a/dpcpp/preconditioner/batch_identity.hpp b/dpcpp/preconditioner/batch_identity.hpp
index 0696d028059..5d6a1cfcb65 100644
--- a/dpcpp/preconditioner/batch_identity.hpp
+++ b/dpcpp/preconditioner/batch_identity.hpp
@@ -12,19 +12,13 @@
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/matrix/batch_csr_kernels.hpp"
-#include "dpcpp/matrix/batch_dense_kernels.hpp"
-#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
index b8c75c9efa0..769ebc47a57 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
@@ -20,7 +20,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_csr_kernels.hpp"
diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
index c8963c7b592..e48188c32c2 100644
--- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
@@ -13,19 +13,13 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/matrix/batch_csr_kernels.hpp"
-#include "dpcpp/matrix/batch_dense_kernels.hpp"
-#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp b/dpcpp/solver/batch_bicgstab_kernels.hpp
index a6db9e7470a..c670725503e 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp
@@ -19,7 +19,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_csr_kernels.hpp"
@@ -65,19 +64,17 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
-        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
+    advanced_apply(static_cast<ValueType>(-1.0), mat_global_entry,
+                   x_shared_entry, static_cast<ValueType>(1.0), r_shared_entry,
+                   item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     if (sg_id == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
-                                        item_ct1);
+        single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
+                                    item_ct1);
     } else if (sg_id == 1) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
-                                        item_ct1);
+        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
+                                    item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -120,9 +117,8 @@ __dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
-                                           v_shared_entry, alpha, item_ct1);
+        single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
+                                       v_shared_entry, alpha, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -158,13 +154,11 @@ __dpct_inline__ void compute_omega(const int num_rows,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
-                                           s_shared_entry, omega, item_ct1);
+        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry,
+                                       omega, item_ct1);
     } else if (sg_id == 1) {
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
-                                           t_shared_entry, temp, item_ct1);
+        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry,
+                                       temp, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -345,9 +339,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
         if (sg_id == 0) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
-                                               rho_new_sh[0], item_ct1);
+            single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
+                                           rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -362,8 +355,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // v = A * p_hat
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
-            mat_global_entry, p_hat_sh, v_sh, item_ct1);
+        simple_apply(mat_global_entry, p_hat_sh, v_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_new / < r_hat , v>
@@ -377,9 +369,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // an estimate of residual norms
         if (sg_id == 0) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
-                                            item_ct1);
+            single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
+                                        item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -394,8 +385,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // t = A * s_hat
-        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
-            mat_global_entry, s_hat_sh, t_sh, item_ct1);
+        simple_apply(mat_global_entry, s_hat_sh, t_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // omega = <t,s> / <t,t>
@@ -409,9 +399,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         if (sg_id == 0)
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
-                single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
-                                            item_ct1);
+            single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
+                                        item_ct1);
         if (tid == group_size - 1) {
             rho_old_sh[0] = rho_new_sh[0];
         }
@@ -421,8 +410,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
-        num_rows, x_sh, x_global_entry, item_ct1);
+    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
 
diff --git a/dpcpp/solver/batch_cg_kernels.hpp b/dpcpp/solver/batch_cg_kernels.hpp
index 67df0a17236..1619e64aa2f 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp
+++ b/dpcpp/solver/batch_cg_kernels.hpp
@@ -19,7 +19,6 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/matrix/batch_csr_kernels.hpp"
@@ -59,9 +58,9 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    batch_single_kernels::advanced_apply(
-        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
-        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
+    advanced_apply(static_cast<ValueType>(-1.0), mat_global_entry,
+                   x_shared_entry, static_cast<ValueType>(1.0), r_shared_entry,
+                   item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
 
@@ -72,11 +71,11 @@ __dpct_inline__ void initialize(
     // Compute norms of rhs
     // and rho_old = r' * z
     if (sg_id == 0) {
-        batch_single_kernels::single_rhs_compute_norm2_sg(
-            num_rows, b_global_entry, rhs_norms, item_ct1);
+        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
+                                    item_ct1);
     } else if (sg_id == 1) {
-        batch_single_kernels::single_rhs_compute_conj_dot_sg(
-            num_rows, r_shared_entry, z_shared_entry, rho_old, item_ct1);
+        single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, z_shared_entry,
+                                       rho_old, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -112,9 +111,9 @@ __dpct_inline__ void update_x_and_r(
     auto sg = item_ct1.get_sub_group();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg.get_group_id() == 0) {
-        batch_single_kernels::single_rhs_compute_conj_dot_sg(
-            num_rows, p_shared_entry, Ap_shared_entry, alpha_shared_entry,
-            item_ct1);
+        single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
+                                       Ap_shared_entry, alpha_shared_entry,
+                                       item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -236,8 +235,7 @@ __dpct_inline__ void apply_kernel(
             break;
         }
         // Ap = A * p
-        batch_single_kernels::simple_apply(mat_global_entry, p_sh, Ap_sh,
-                                           item_ct1);
+        simple_apply(mat_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_old / (p' * Ap)
@@ -254,8 +252,8 @@ __dpct_inline__ void apply_kernel(
 
         //  rho_new =  (r)' * (z)
         if (sg_id == 0) {
-            batch_single_kernels::single_rhs_compute_conj_dot_sg(
-                num_rows, r_sh, z_sh, rho_new_sh[0], item_ct1);
+            single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, rho_new_sh[0],
+                                           item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -272,7 +270,7 @@ __dpct_inline__ void apply_kernel(
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    batch_single_kernels::copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
 

From 212d2c4bb0116cb8091f6cbb529ff1eefbbd71d5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 26 Aug 2024 12:59:02 +0200
Subject: [PATCH 171/448] fixup! [cuda, hip] remove unnecessary headers

---
 .../cuda_hip/preconditioner/batch_block_jacobi.hpp  |  6 ------
 common/cuda_hip/preconditioner/batch_identity.hpp   |  4 ----
 .../cuda_hip/preconditioner/batch_scalar_jacobi.hpp |  6 ------
 common/cuda_hip/solver/batch_bicgstab_kernels.hpp   |  1 -
 cuda/solver/batch_bicgstab_kernels.cu               | 13 -------------
 cuda/solver/batch_cg_kernels.cu                     |  4 ----
 hip/solver/batch_bicgstab_kernels.hip.cpp           | 11 -----------
 hip/solver/batch_cg_kernels.hip.cpp                 | 12 ------------
 8 files changed, 57 deletions(-)

diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
index c01bafa875a..604989dfa6d 100644
--- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
+++ b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp
@@ -11,17 +11,11 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
 
diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp
index 3d57bcae406..3fa6693c7ef 100644
--- a/common/cuda_hip/preconditioner/batch_identity.hpp
+++ b/common/cuda_hip/preconditioner/batch_identity.hpp
@@ -10,12 +10,8 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
index 42a4f3f6aa6..5cd8c28a1d0 100644
--- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
+++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp
@@ -11,17 +11,11 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 
 
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
index 10d235358bc..8ea31358ed5 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -13,7 +13,6 @@
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index d3dc8712201..8a5eee6b196 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -4,25 +4,12 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 3f7dac1d08a..32e66d7ee54 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -7,13 +7,9 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index d44bc4a0eb6..17199d2cd19 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -4,9 +4,6 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
@@ -15,15 +12,7 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index c9a1e81be81..6d5e3bff3b3 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -4,26 +4,14 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"

From eae2cab8686f298e4c1e343aca502103c828fa2c Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 31 Jul 2024 10:10:00 +0200
Subject: [PATCH 172/448] [core] allow naming lambda operations

---
 include/ginkgo/core/base/executor.hpp | 29 +++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 95373b3e847..8afac213303 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -659,6 +659,17 @@ class Executor : public log::EnableLogging<Executor> {
         this->run(op);
     }
 
+    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
+              typename ClosureDpcpp>
+    void run(std::string name, const ClosureOmp& op_omp,
+             const ClosureCuda& op_cuda, const ClosureHip& op_hip,
+             const ClosureDpcpp& op_dpcpp) const
+    {
+        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
+            std::move(name), op_omp, op_cuda, op_hip, op_dpcpp);
+        this->run(op);
+    }
+
     /**
      * Allocates memory in this Executor.
      *
@@ -1109,6 +1120,16 @@ class Executor : public log::EnableLogging<Executor> {
               typename ClosureDpcpp>
     class LambdaOperation : public Operation {
     public:
+        LambdaOperation(std::string name, const ClosureOmp& op_omp,
+                        const ClosureCuda& op_cuda, const ClosureHip& op_hip,
+                        const ClosureDpcpp& op_dpcpp)
+            : name_(std::move(name)),
+              op_omp_(op_omp),
+              op_cuda_(op_cuda),
+              op_hip_(op_hip),
+              op_dpcpp_(op_dpcpp)
+        {}
+
         /**
          * Creates an LambdaOperation object from four functors.
          *
@@ -1121,10 +1142,7 @@ class Executor : public log::EnableLogging<Executor> {
          */
         LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                         const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
-            : op_omp_(op_omp),
-              op_cuda_(op_cuda),
-              op_hip_(op_hip),
-              op_dpcpp_(op_dpcpp)
+            : LambdaOperation("unnamed", op_omp, op_cuda, op_hip, op_dpcpp)
         {}
 
         void run(std::shared_ptr<const OmpExecutor>) const override
@@ -1152,7 +1170,10 @@ class Executor : public log::EnableLogging<Executor> {
             op_dpcpp_();
         }
 
+        const char* get_name() const noexcept override { return name_.c_str(); }
+
     private:
+        std::string name_;
         ClosureOmp op_omp_;
         ClosureCuda op_cuda_;
         ClosureHip op_hip_;

From b9ee8ae4a03b894b3d7503b4d6ec916af02ef109 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 15 Aug 2024 11:42:34 +0200
Subject: [PATCH 173/448] [core] make run(lambda operation) available on all
 executors

---
 include/ginkgo/core/base/executor.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 8afac213303..0e338f42044 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1251,8 +1251,6 @@ class ExecutorBase : public Executor {
     friend class ReferenceExecutor;
 
 public:
-    using Executor::run;
-
     void run(const Operation& op) const override
     {
         this->template log<log::Logger::operation_launched>(this, &op);
@@ -1362,6 +1360,8 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     friend class detail::ExecutorBase<OmpExecutor>;
 
 public:
+    using Executor::run;
+
     /**
      * Creates a new OmpExecutor.
      */
@@ -1439,6 +1439,8 @@ using DefaultExecutor = OmpExecutor;
  */
 class ReferenceExecutor : public OmpExecutor {
 public:
+    using Executor::run;
+
     static std::shared_ptr<ReferenceExecutor> create(
         std::shared_ptr<CpuAllocatorBase> alloc =
             std::make_shared<CpuAllocator>())
@@ -1513,6 +1515,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     friend class detail::ExecutorBase<CudaExecutor>;
 
 public:
+    using Executor::run;
+
     /**
      * Creates a new CudaExecutor.
      *
@@ -1748,6 +1752,8 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     friend class detail::ExecutorBase<HipExecutor>;
 
 public:
+    using Executor::run;
+
     /**
      * Creates a new HipExecutor.
      *
@@ -1963,6 +1969,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
     friend class detail::ExecutorBase<DpcppExecutor>;
 
 public:
+    using Executor::run;
+
     /**
      * Creates a new DpcppExecutor.
      *

From 2efc482956a4e86fd7aba1cfa08591cd347867fd Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 15 Aug 2024 11:51:08 +0200
Subject: [PATCH 174/448] [core] add tests for lambda op name

---
 core/test/base/executor.cpp | 38 +++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 64a11929983..20f795b2ded 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -521,4 +521,42 @@ TEST_F(ExecutorLogging, LogsOperation)
 }
 
 
+struct NameLogger : public gko::log::Logger {
+protected:
+    void on_operation_launched(const gko::Executor* exec,
+                               const gko::Operation* op) const override
+    {
+        op_name = op->get_name();
+    }
+
+public:
+    mutable std::string op_name;
+};
+
+
+TEST(LambdaOperation, CanSetName)
+{
+    auto name_logger = std::make_shared<NameLogger>();
+    auto exec = gko::ReferenceExecutor::create();
+    exec->add_logger(name_logger);
+
+    exec->run(
+        "name", [] {}, [] {}, [] {}, [] {});
+
+    ASSERT_EQ("name", name_logger->op_name);
+}
+
+
+TEST(LambdaOperation, HasDefaultName)
+{
+    auto name_logger = std::make_shared<NameLogger>();
+    auto exec = gko::ReferenceExecutor::create();
+    exec->add_logger(name_logger);
+
+    exec->run([] {}, [] {}, [] {}, [] {});
+
+    ASSERT_EQ("unname", name_logger->op_name);
+}
+
+
 }  // namespace

From 900653d8bf8ff3e47de57db39ef1fcc48c03ac8b Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 19 Aug 2024 11:38:19 +0200
Subject: [PATCH 175/448] [core] review updates:

- test only for existence of default name
- add closure for reference op
- deprecate lambda run without name

Co-authored-by: Pratik Nayak <pratik.nayak@kit.edu>
Co-authored-by: Tobias Ribizel <mail@ribizel.de>
---
 core/test/base/executor.cpp           | 10 ++++-
 include/ginkgo/core/base/executor.hpp | 55 +++++++++++++++++++--------
 test/base/executor.cpp                | 22 +++++++++++
 3 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 20f795b2ded..ae037e075df 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -541,12 +541,15 @@ TEST(LambdaOperation, CanSetName)
     exec->add_logger(name_logger);
 
     exec->run(
-        "name", [] {}, [] {}, [] {}, [] {});
+        "name", [] {}, [] {}, [] {}, [] {}, [] {});
 
     ASSERT_EQ("name", name_logger->op_name);
 }
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 TEST(LambdaOperation, HasDefaultName)
 {
     auto name_logger = std::make_shared<NameLogger>();
@@ -555,8 +558,11 @@ TEST(LambdaOperation, HasDefaultName)
 
     exec->run([] {}, [] {}, [] {}, [] {});
 
-    ASSERT_EQ("unname", name_logger->op_name);
+    ASSERT_NE(nullptr, name_logger->op_name.c_str());
 }
 
 
+GKO_END_DISABLE_DEPRECATION_WARNINGS
+
+
 }  // namespace
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 0e338f42044..963e30bfddd 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -651,22 +651,42 @@ class Executor : public log::EnableLogging<Executor> {
      */
     template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
               typename ClosureDpcpp>
+    GKO_DEPRECATED(
+        "Please use the overload with std::string as first parameter.")
     void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
              const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
     {
-        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
-            op_omp, op_cuda, op_hip, op_dpcpp);
+        LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
+                        ClosureDpcpp>
+            op(op_omp, op_cuda, op_hip, op_dpcpp);
         this->run(op);
     }
 
-    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
-              typename ClosureDpcpp>
-    void run(std::string name, const ClosureOmp& op_omp,
-             const ClosureCuda& op_cuda, const ClosureHip& op_hip,
-             const ClosureDpcpp& op_dpcpp) const
+    /**
+     * Runs one of the passed in functors, depending on the Executor type.
+     *
+     * @tparam ClosureReference  type of op_ref
+     * @tparam ClosureOmp  type of op_omp
+     * @tparam ClosureCuda  type of op_cuda
+     * @tparam ClosureHip  type of op_hip
+     * @tparam ClosureDpcpp  type of op_dpcpp
+     *
+     * @param name  the name of the operation
+     * @param op_ref  functor to run in case of a ReferenceExecutor
+     * @param op_omp  functor to run in case of a OmpExecutor
+     * @param op_cuda  functor to run in case of a CudaExecutor
+     * @param op_hip  functor to run in case of a HipExecutor
+     * @param op_dpcpp  functor to run in case of a DpcppExecutor
+     */
+    template <typename ClosureReference, typename ClosureOmp,
+              typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
+    void run(std::string name, const ClosureReference& op_ref,
+             const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
+             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
     {
-        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
-            std::move(name), op_omp, op_cuda, op_hip, op_dpcpp);
+        LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
+                        ClosureDpcpp>
+            op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
         this->run(op);
     }
 
@@ -1116,14 +1136,15 @@ class Executor : public log::EnableLogging<Executor> {
      * @tparam ClosureHip  the type of the third functor
      * @tparam ClosureDpcpp  the type of the fourth functor
      */
-    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
-              typename ClosureDpcpp>
+    template <typename ClosureReference, typename ClosureOmp,
+              typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
     class LambdaOperation : public Operation {
     public:
-        LambdaOperation(std::string name, const ClosureOmp& op_omp,
-                        const ClosureCuda& op_cuda, const ClosureHip& op_hip,
-                        const ClosureDpcpp& op_dpcpp)
+        LambdaOperation(std::string name, const ClosureReference& op_ref,
+                        const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
+                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
             : name_(std::move(name)),
+              op_ref_(op_ref),
               op_omp_(op_omp),
               op_cuda_(op_cuda),
               op_hip_(op_hip),
@@ -1142,7 +1163,8 @@ class Executor : public log::EnableLogging<Executor> {
          */
         LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                         const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
-            : LambdaOperation("unnamed", op_omp, op_cuda, op_hip, op_dpcpp)
+            : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
+                              op_dpcpp)
         {}
 
         void run(std::shared_ptr<const OmpExecutor>) const override
@@ -1152,7 +1174,7 @@ class Executor : public log::EnableLogging<Executor> {
 
         void run(std::shared_ptr<const ReferenceExecutor>) const override
         {
-            op_omp_();
+            op_ref_();
         }
 
         void run(std::shared_ptr<const CudaExecutor>) const override
@@ -1174,6 +1196,7 @@ class Executor : public log::EnableLogging<Executor> {
 
     private:
         std::string name_;
+        ClosureReference op_ref_;
         ClosureOmp op_omp_;
         ClosureCuda op_cuda_;
         ClosureHip op_hip_;
diff --git a/test/base/executor.cpp b/test/base/executor.cpp
index 8a344eb224d..7fcab4e0784 100644
--- a/test/base/executor.cpp
+++ b/test/base/executor.cpp
@@ -90,9 +90,28 @@ TEST_F(Executor, RunsCorrectHostOperation)
 }
 
 
+TEST_F(Executor, RunsCorrectLambdaOperationWithReferenceExecutor)
+{
+    int value = 0;
+    auto ref_lambda = [&value]() { value = reference::value; };
+    auto omp_lambda = [&value]() { value = omp::value; };
+    auto cuda_lambda = [&value]() { value = cuda::value; };
+    auto hip_lambda = [&value]() { value = hip::value; };
+    auto dpcpp_lambda = [&value]() { value = dpcpp::value; };
+
+    exec->run("test", ref_lambda, omp_lambda, cuda_lambda, hip_lambda,
+              dpcpp_lambda);
+
+    ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value);
+}
+
+
 #ifndef GKO_COMPILING_REFERENCE
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 TEST_F(Executor, RunsCorrectLambdaOperation)
 {
     int value = 0;
@@ -107,4 +126,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation)
 }
 
 
+GKO_END_DISABLE_DEPRECATION_WARNINGS
+
+
 #endif  // GKO_COMPILING_REFERENCE

From 09d6704ba62b40ec8dbfa8d882eafd6c48b8443a Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 9 Sep 2024 16:42:57 +0200
Subject: [PATCH 176/448] add the test to detect the problem

---
 test/solver/bicgstab_kernels.cpp | 42 ++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index a90451a3f3a..9716acd86cb 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -245,6 +245,48 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef)
 }
 
 
+TEST_F(Bicgstab, BicgstabFinalizeIsEquivalentToRefWithoutRaceCondition)
+{
+    /**
+     * This test is designed to detect the following problem. Originally, one
+     * thread per value updated both the value and the stop status whenever
+     * the column was stopped but not yet finalized. This is a race condition:
+     * if every thread reads the stop status before any thread finalizes it,
+     * all values are updated correctly, but if some threads finalize the stop
+     * status first, the remaining threads see it as finalized and skip their
+     * update. We make this test case large to trigger the race more easily.
+     * Because the failure depends on thread timing, the old version is not
+     * guaranteed to fail this test.
+     */
+    int m = 1e6;
+    int n = 2;
+    x = gen_mtx(m, n, n);
+    y = gen_mtx(m, n, n);
+    alpha = gen_mtx(1, n, n);
+    d_x = x->clone(exec);
+    d_y = y->clone(exec);
+    d_alpha = alpha->clone(exec);
+    stop_status = std::make_unique<gko::array<gko::stopping_status>>(ref, n);
+    for (size_t i = 0; i < n; ++i) {
+        stop_status->get_data()[i].reset();
+    }
+    // check correct handling for stopped columns
+    stop_status->get_data()[1].stop(1);
+    // finalize only updates the column that is stopped but not yet finalized
+    stop_status->get_data()[0].stop(1, false);
+    d_stop_status =
+        std::make_unique<gko::array<gko::stopping_status>>(exec, *stop_status);
+
+    gko::kernels::reference::bicgstab::finalize(ref, x.get(), y.get(),
+                                                alpha.get(), stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::finalize(
+        exec, d_x.get(), d_y.get(), d_alpha.get(), d_stop_status.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, ::r<value_type>::value);
+    GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
+}
+
+
 TEST_F(Bicgstab, BicgstabApplyOneRHSIsEquivalentToRef)
 {
     int m = 123;

From 2c439eecd08d75e27aa5364566e39eb55f03de73 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 5 Sep 2024 18:35:12 +0200
Subject: [PATCH 177/448] fix the race condition

---
 common/unified/solver/bicgstab_kernels.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp
index b696815f0d4..c403da3bf96 100644
--- a/common/unified/solver/bicgstab_kernels.cpp
+++ b/common/unified/solver/bicgstab_kernels.cpp
@@ -174,11 +174,18 @@ void finalize(std::shared_ptr<const DefaultExecutor> exec,
                       auto stop) {
             if (stop[col].has_stopped() && !stop[col].is_finalized()) {
                 x(row, col) += alpha[col] * y(row, col);
-                stop[col].finalize();
             }
         },
         x->get_size(), y->get_stride(), x, default_stride(y), row_vector(alpha),
         *stop_status);
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto col, auto stop) {
+            if (stop[col].has_stopped() && !stop[col].is_finalized()) {
+                stop[col].finalize();
+            }
+        },
+        x->get_size()[1], *stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);

From 76ce05d6515e545a73993292e12c53795cac97c8 Mon Sep 17 00:00:00 2001
From: nbeams <246972+nbeams@users.noreply.github.com>
Date: Wed, 11 Sep 2024 15:22:49 +0000
Subject: [PATCH 178/448] GMRES: fix conj use in MGS dot product

---
 core/solver/gmres.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index e47714b2186..e066fc696a1 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -164,8 +164,8 @@ void orthogonalize_mgs(matrix::Dense<ValueType>* hessenberg_iter,
             krylov_bases, dim<2>{num_rows, num_rhs},
             span{local_num_rows * i, local_num_rows * (i + 1)},
             span{0, num_rhs});
-        next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry,
-                                      reduction_tmp);
+        krylov_basis->compute_conj_dot(next_krylov, hessenberg_entry,
+                                       reduction_tmp);
         next_krylov->sub_scaled(hessenberg_entry, krylov_basis);
     }
 }

From 3e7fc2b60deedbbc5e1c16ae0aded1590f2c0edb Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 2 Oct 2024 15:47:38 +0200
Subject: [PATCH 179/448] [misc] fix typo

---
 include/ginkgo/core/base/utils_helper.hpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index 3ea5c9d878d..1cd36cdadcb 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -95,32 +95,32 @@ using pointee =
 
 
 template <typename T, typename = void>
-struct is_clonable_impl : std::false_type {};
+struct is_cloneable_impl : std::false_type {};
 
 template <typename T>
-struct is_clonable_impl<T, std::void_t<decltype(std::declval<T>().clone())>>
+struct is_cloneable_impl<T, std::void_t<decltype(std::declval<T>().clone())>>
     : std::true_type {};
 
 template <typename T>
-constexpr bool is_clonable()
+constexpr bool is_cloneable()
 {
-    return is_clonable_impl<std::decay_t<T>>::value;
+    return is_cloneable_impl<std::decay_t<T>>::value;
 }
 
 
 template <typename T, typename = void>
-struct is_clonable_to_impl : std::false_type {};
+struct is_cloneable_to_impl : std::false_type {};
 
 template <typename T>
-struct is_clonable_to_impl<
+struct is_cloneable_to_impl<
     T, std::void_t<decltype(std::declval<T>().clone(
            std::declval<std::shared_ptr<const Executor>>()))>>
     : std::true_type {};
 
 template <typename T>
-constexpr bool is_clonable_to()
+constexpr bool is_cloneable_to()
 {
-    return is_clonable_to_impl<std::decay_t<T>>::value;
+    return is_cloneable_to_impl<std::decay_t<T>>::value;
 }
 
 
@@ -172,7 +172,7 @@ using shared_type = std::shared_ptr<pointee<Pointer>>;
 template <typename Pointer>
 inline detail::cloned_type<Pointer> clone(const Pointer& p)
 {
-    static_assert(detail::is_clonable<detail::pointee<Pointer>>(),
+    static_assert(detail::is_cloneable<detail::pointee<Pointer>>(),
                   "Object is not clonable");
     return detail::cloned_type<Pointer>(
         static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type*>(
@@ -199,7 +199,7 @@ template <typename Pointer>
 inline detail::cloned_type<Pointer> clone(std::shared_ptr<const Executor> exec,
                                           const Pointer& p)
 {
-    static_assert(detail::is_clonable_to<detail::pointee<Pointer>>(),
+    static_assert(detail::is_cloneable_to<detail::pointee<Pointer>>(),
                   "Object is not clonable");
     return detail::cloned_type<Pointer>(
         static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type*>(

From a22ccbea83915096ca7759d461f6369430c6630b Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 2 Oct 2024 15:57:03 +0200
Subject: [PATCH 180/448] [misc] fix typo

---
 core/test/base/utils.cpp                  | 32 +++++++++++------------
 include/ginkgo/core/base/utils_helper.hpp |  4 +--
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp
index 1ad4705b824..5c6947b7cc6 100644
--- a/core/test/base/utils.cpp
+++ b/core/test/base/utils.cpp
@@ -83,19 +83,19 @@ TEST(PointerParam, WorksForUniquePointers)
 }
 
 
-struct ClonableDerived : Base {
-    ClonableDerived(std::shared_ptr<const gko::Executor> exec = nullptr)
+struct CloneableDerived : Base {
+    CloneableDerived(std::shared_ptr<const gko::Executor> exec = nullptr)
         : executor(exec)
     {}
 
     std::unique_ptr<Base> clone()
     {
-        return std::unique_ptr<Base>(new ClonableDerived());
+        return std::unique_ptr<Base>(new CloneableDerived());
     }
 
     std::unique_ptr<Base> clone(std::shared_ptr<const gko::Executor> exec)
     {
-        return std::unique_ptr<Base>(new ClonableDerived{exec});
+        return std::unique_ptr<Base>(new CloneableDerived{exec});
     }
 
     std::shared_ptr<const gko::Executor> executor;
@@ -104,36 +104,36 @@ struct ClonableDerived : Base {
 
 TEST(Clone, ClonesUniquePointer)
 {
-    std::unique_ptr<ClonableDerived> p(new ClonableDerived());
+    std::unique_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(p);
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
 }
 
 
 TEST(Clone, ClonesSharedPointer)
 {
-    std::shared_ptr<ClonableDerived> p(new ClonableDerived());
+    std::shared_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(p);
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
 }
 
 
 TEST(Clone, ClonesPlainPointer)
 {
-    std::unique_ptr<ClonableDerived> p(new ClonableDerived());
+    std::unique_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(p.get());
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
 }
 
@@ -141,12 +141,12 @@ TEST(Clone, ClonesPlainPointer)
 TEST(CloneTo, ClonesUniquePointer)
 {
     auto exec = gko::ReferenceExecutor::create();
-    std::unique_ptr<ClonableDerived> p(new ClonableDerived());
+    std::unique_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(exec, p);
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
     ASSERT_EQ(clone->executor, exec);
 }
@@ -155,12 +155,12 @@ TEST(CloneTo, ClonesUniquePointer)
 TEST(CloneTo, ClonesSharedPointer)
 {
     auto exec = gko::ReferenceExecutor::create();
-    std::shared_ptr<ClonableDerived> p(new ClonableDerived());
+    std::shared_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(exec, p);
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
     ASSERT_EQ(clone->executor, exec);
 }
@@ -169,12 +169,12 @@ TEST(CloneTo, ClonesSharedPointer)
 TEST(CloneTo, ClonesPlainPointer)
 {
     auto exec = gko::ReferenceExecutor::create();
-    std::unique_ptr<ClonableDerived> p(new ClonableDerived());
+    std::unique_ptr<CloneableDerived> p(new CloneableDerived());
 
     auto clone = gko::clone(exec, p.get());
 
     ::testing::StaticAssertTypeEq<decltype(clone),
-                                  std::unique_ptr<ClonableDerived>>();
+                                  std::unique_ptr<CloneableDerived>>();
     ASSERT_NE(p.get(), clone.get());
     ASSERT_EQ(clone->executor, exec);
 }
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index 1cd36cdadcb..951ea4bbf5d 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -173,7 +173,7 @@ template <typename Pointer>
 inline detail::cloned_type<Pointer> clone(const Pointer& p)
 {
     static_assert(detail::is_cloneable<detail::pointee<Pointer>>(),
-                  "Object is not clonable");
+                  "Object is not cloneable");
     return detail::cloned_type<Pointer>(
         static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type*>(
             p->clone().release()));
@@ -200,7 +200,7 @@ inline detail::cloned_type<Pointer> clone(std::shared_ptr<const Executor> exec,
                                           const Pointer& p)
 {
     static_assert(detail::is_cloneable_to<detail::pointee<Pointer>>(),
-                  "Object is not clonable");
+                  "Object is not cloneable");
     return detail::cloned_type<Pointer>(
         static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type*>(
             p->clone(std::move(exec)).release()));

From fd24e5d0d25a90f8d4eed21ce67d7ec6dcf62ff1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 8 Oct 2024 15:40:34 +0200
Subject: [PATCH 181/448] add workspace for reduction usage

---
 core/stop/residual_norm.cpp                | 18 ++++++++++++------
 include/ginkgo/core/stop/residual_norm.hpp |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index adf7da3e2e6..4e73cc8d56a 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -98,7 +98,8 @@ ResidualNormBase<ValueType>::ResidualNormBase(
       system_matrix_{args.system_matrix},
       b_{args.b},
       one_{gko::initialize<Vector>({1}, exec)},
-      neg_one_{gko::initialize<Vector>({-1}, exec)}
+      neg_one_{gko::initialize<Vector>({-1}, exec)},
+      reduction_tmp_{exec}
 {
     switch (baseline_) {
     case mode::initial_resnorm: {
@@ -113,7 +114,8 @@ ResidualNormBase<ValueType>::ResidualNormBase(
                 args.system_matrix->apply(neg_one_, args.x, one_, b_clone);
                 norm_dispatch<ValueType>(
                     [&](auto dense_r) {
-                        dense_r->compute_norm2(this->starting_tau_);
+                        dense_r->compute_norm2(this->starting_tau_,
+                                               reduction_tmp_);
                     },
                     b_clone.get());
             }
@@ -122,7 +124,7 @@ ResidualNormBase<ValueType>::ResidualNormBase(
                 exec, dim<2>{1, args.initial_residual->get_size()[1]});
             norm_dispatch<ValueType>(
                 [&](auto dense_r) {
-                    dense_r->compute_norm2(this->starting_tau_);
+                    dense_r->compute_norm2(this->starting_tau_, reduction_tmp_);
                 },
                 args.initial_residual);
         }
@@ -135,7 +137,9 @@ ResidualNormBase<ValueType>::ResidualNormBase(
         this->starting_tau_ =
             NormVector::create(exec, dim<2>{1, args.b->get_size()[1]});
         norm_dispatch<ValueType>(
-            [&](auto dense_r) { dense_r->compute_norm2(this->starting_tau_); },
+            [&](auto dense_r) {
+                dense_r->compute_norm2(this->starting_tau_, reduction_tmp_);
+            },
             args.b.get());
         break;
     }
@@ -169,7 +173,9 @@ bool ResidualNormBase<ValueType>::check_impl(
         return false;
     } else if (updater.residual_ != nullptr) {
         norm_dispatch<ValueType>(
-            [&](auto dense_r) { dense_r->compute_norm2(u_dense_tau_); },
+            [&](auto dense_r) {
+                dense_r->compute_norm2(u_dense_tau_, reduction_tmp_);
+            },
             updater.residual_);
         dense_tau = u_dense_tau_.get();
     } else if (updater.solution_ != nullptr && system_matrix_ != nullptr &&
@@ -179,7 +185,7 @@ bool ResidualNormBase<ValueType>::check_impl(
             [&](auto dense_b, auto dense_x) {
                 auto dense_r = dense_b->clone();
                 system_matrix_->apply(neg_one_, dense_x, one_, dense_r);
-                dense_r->compute_norm2(u_dense_tau_);
+                dense_r->compute_norm2(u_dense_tau_, reduction_tmp_);
             },
             b_.get(), updater.solution_);
         dense_tau = u_dense_tau_.get();
diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp
index 6ee3c843e6a..7ee020207d4 100644
--- a/include/ginkgo/core/stop/residual_norm.hpp
+++ b/include/ginkgo/core/stop/residual_norm.hpp
@@ -82,6 +82,8 @@ class ResidualNormBase
     /* one/neg_one for residual computation */
     std::shared_ptr<const Vector> one_{};
     std::shared_ptr<const Vector> neg_one_{};
+    // workspace for reduction
+    mutable gko::array<char> reduction_tmp_;
 };
 
 

From 244b5a956a74e7ffc9fccc90ca68be7de1fa0765 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 2 Oct 2024 15:31:04 +0200
Subject: [PATCH 182/448] [ci] disable horeka CI jobs

---
 .gitlab-ci.yml    | 91 +++--------------------------------------------
 .gitlab/image.yml |  6 ----
 .gitlab/rules.yml |  5 +++
 3 files changed, 9 insertions(+), 93 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 226a10f4cea..d6ba260f75d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -91,91 +91,6 @@ trigger_pipeline:
       fi
 
 
-# Build jobs
-# Job with example runs.
-# cuda 11.0 and friends on HoreKa with tests
-build/cuda110/mvapich2/gcc/cuda/debug/shared:
-  extends:
-    - .build_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_MPI: "ON"
-    BUILD_TYPE: "Debug"
-    FAST_TESTS: "ON"
-    NONDEFAULT_STREAM: "ON"
-    CUDA_ARCH: 80
-    USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
-    KEEP_CONTAINER: "ON"
-    USE_SLURM: 0
-
-test/cuda110/mvapich2/gcc/cuda/debug/shared:
-  extends:
-    - .horeka_test_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
-    SLURM_PARTITION: "accelerated"
-    SLURM_GRES: "gpu:4"
-    SLURM_TIME: "02:00:00"
-  dependencies: null
-  needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ]
-
-
-build/cuda110/nompi/clang/cuda/release/static:
-  extends:
-    - .build_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    CXX_COMPILER: "clang++"
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OFF"
-    CUDA_ARCH: 80
-    USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}"
-    KEEP_CONTAINER: "ON"
-    USE_SLURM: 0
-
-test/cuda110/nompi/clang/cuda/release/static:
-  extends:
-    - .horeka_test_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}"
-    SLURM_PARTITION: "accelerated"
-    SLURM_GRES: "gpu:4"
-    SLURM_TIME: "01:30:00"
-  dependencies: null
-  needs: [ "build/cuda110/nompi/clang/cuda/release/static" ]
-  
-
-build/cuda110/nompi/clang/cuda/release/shared:
-  extends:
-    - .build_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
-  variables:
-    CXX_COMPILER: "clang++"
-    CUDA_ARCH: 52
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_TYPE: "Release"
-    FAST_TESTS: "ON"
-    # disable spurious unused argument warning
-    EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
-
-
 # cuda 11.4 and friends
 build/cuda114/nompi/gcc/cuda/debug/shared:
   extends:
@@ -764,8 +679,9 @@ benchmark-cuda-spmv-build:
   extends:
     - .build_template
     - .default_variables
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
     - .benchmark-spmv-cuda-rules
+    - .disable_job_condition
   stage: benchmark-build
   variables:
     BUILD_OMP: "ON"
@@ -785,8 +701,9 @@ benchmark-cuda-spmv:
   extends:
     - .benchmark_template
     - .default_variables
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
     - .benchmark-spmv-cuda-rules
+    - .disable_job_condition
   stage: benchmark-cuda
   variables:
     BENCHMARK_REPO: git@github.com:ginkgo-project/ginkgo-data.git
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 60521044d7f..2295f6312ae 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -17,12 +17,6 @@
     - cpu
     - amdci
 
-.use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020:
-  image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
-  tags:
-    - private_ci
-    - horeka
-
 .use_gko_cuda114-openmpi-gnu10-llvm12:
   image: ginkgohub/cuda:114-openmpi-gnu10-llvm12
   tags:
diff --git a/.gitlab/rules.yml b/.gitlab/rules.yml
index 0280017c08b..4afc04799bb 100644
--- a/.gitlab/rules.yml
+++ b/.gitlab/rules.yml
@@ -59,3 +59,8 @@
         # - common/unified/matrix/* # for now no SpMV there?
       when: manual
       allow_failure: true
+
+
+.disable_job_condition:
+  rules:
+    - when: never

From 6b96a374328dbc763568210020be1852ce575961 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 2 Oct 2024 15:37:37 +0200
Subject: [PATCH 183/448] [ci] delete github windows jobs

---
 .github/workflows/windows-mingw.yml     | 65 -------------------------
 .github/workflows/windows-msvc-cuda.yml | 62 -----------------------
 .github/workflows/windows-msvc-ref.yml  | 62 -----------------------
 3 files changed, 189 deletions(-)
 delete mode 100644 .github/workflows/windows-mingw.yml
 delete mode 100644 .github/workflows/windows-msvc-cuda.yml
 delete mode 100644 .github/workflows/windows-msvc-ref.yml

diff --git a/.github/workflows/windows-mingw.yml b/.github/workflows/windows-mingw.yml
deleted file mode 100644
index 1c859661562..00000000000
--- a/.github/workflows/windows-mingw.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-name: Windows-MinGW
-
-on:
-  push:
-    branches:
-      - 'master'
-      - 'develop'
-      - 'release/**'
-    tags:
-      - '**'
-  pull_request:
-    types: [opened,synchronize]
-  workflow_dispatch:
-    inputs:
-      debug_enabled:
-        description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)'
-        required: false
-        default: false
-
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  windows_mingw:
-    if: ${{ false }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-        - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""}
-    name: mingw/${{ matrix.config.name }}
-    runs-on: [windows-latest]
-    steps:
-    - name: Checkout the latest code (shallow clone)
-      uses: actions/checkout@v4
-
-    - name: Debug over SSH (tmate)
-      uses: mxschmitt/action-tmate@v3.5
-      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
-      with:
-        limit-access-to-actor: true
-
-    - name: configure
-    # Use cmd to remove the path easily
-      run: |
-        bcdedit /set IncreaseUserVa 3072
-        editbin /LARGEADDRESSAWARE "C:\Program Files\Git\mingw64\bin\cc1plus.exe"
-        set PATH=C:\Program Files\Git\mingw64\bin;%PATH%
-        set PATH=C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin;%PATH%
-        mkdir build
-        cd build
-        cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DCMAKE_CXX_FLAGS=${{ matrix.config.cflags }} ..
-        cmake --build . -j4
-      shell: cmd
-
-    - name: install
-      run: |
-        set PATH=C:\Program Files\Git\mingw64\bin;%PATH%
-        set PATH=C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin;%PATH%
-        set PATH=C:\Program Files (x86)\Ginkgo\bin;%PATH%
-        cd build
-        cmake --install .
-        cmake --build . --target test_install
-      shell: cmd
diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml
deleted file mode 100644
index efa637b2bf9..00000000000
--- a/.github/workflows/windows-msvc-cuda.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-name: Windows-MSVC-CUDA (compile-only)
-
-on:
-  push:
-    branches:
-      - 'master'
-      - 'develop'
-      - 'release/**'
-    tags:
-      - '**'
-  pull_request:
-    types: [opened,synchronize]
-  workflow_dispatch:
-    inputs:
-      debug_enabled:
-        description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)'
-        required: false
-        default: false
-
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  windows_cuda:
-    if: ${{ false }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-        - {version: "latest", name: "cuda-latest/release/shared", "mixed": "ON"}
-    name: msvc/${{ matrix.config.name }} (only compile)
-    runs-on: [windows-2019]
-
-    steps:
-    - name: Checkout the latest code (shallow clone)
-      uses: actions/checkout@v4
-    - name: setup (versioned)
-      if: matrix.config.version != 'latest'
-      run: |
-        choco install cuda --version=${{ matrix.config.version }} -y
-
-    - name: setup (latest)
-      if: matrix.config.version == 'latest'
-      run: |
-        choco install cuda -y
-
-    - name: Debug over SSH (tmate)
-      uses: mxschmitt/action-tmate@v3.5
-      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
-      with:
-        limit-access-to-actor: true
-
-    - name: configure
-      run: |
-        $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.."
-        Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
-        refreshenv
-        mkdir build
-        cd build
-        cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
-        cmake --build . -j4 --config Release
diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml
deleted file mode 100644
index 60a811bb99b..00000000000
--- a/.github/workflows/windows-msvc-ref.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-name: Windows-MSVC-Reference
-
-on:
-  push:
-    branches:
-      - 'master'
-      - 'develop'
-      - 'release/**'
-    tags:
-      - '**'
-  pull_request:
-    types: [opened,synchronize]
-  workflow_dispatch:
-    inputs:
-      debug_enabled:
-        description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)'
-        required: false
-        default: false
-
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  windows_ref:
-    if: ${{ false }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-        # Debug shared exceeds symbol limit
-        # - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
-        - {shared: "OFF", build_type: "Release", name: "reference/release/static"}
-        - {shared: "ON", build_type: "Release", name: "reference/release/shared"}
-        # Debug static needs too much storage
-        # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
-    name: msvc/${{ matrix.config.name }}
-    runs-on: [windows-latest]
-    steps:
-    - name: Checkout the latest code (shallow clone)
-      uses: actions/checkout@v4
-
-    - name: Debug over SSH (tmate)
-      uses: mxschmitt/action-tmate@v3.5
-      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
-      with:
-        limit-access-to-actor: true
-
-    - name: configure
-      run: |
-        mkdir build
-        cd build
-        cmake  -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
-        cmake --build . -j4 --config ${{ matrix.config.build_type }}
-        ctest . -C ${{ matrix.config.build_type }} --output-on-failure
-
-    - name: install
-      run: |
-        $env:PATH="$env:PATH;C:\Program Files (x86)\Ginkgo\bin"
-        cd build
-        cmake --install . --config ${{ matrix.config.build_type }}
-        cmake --build . --target test_install --config ${{ matrix.config.build_type }}

From 1deb9a29e17e45671c47b756f06beb0bbffc505d Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 7 Oct 2024 11:37:54 +0000
Subject: [PATCH 184/448] [ci] use nla-gpu for QoS jobs

---
 .gitlab-ci.yml | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d6ba260f75d..ef10b92e20d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -433,10 +433,9 @@ warnings:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
     CXX_FLAGS: "-Werror=pedantic -pedantic-errors"
   allow_failure: yes
 
@@ -447,10 +446,9 @@ no-circular-deps:
     - .build_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on'
   allow_failure: no
 
@@ -474,10 +472,9 @@ clang-tidy:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON'
   allow_failure: yes
 
@@ -487,10 +484,9 @@ iwyu:
     - .build_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON'
   allow_failure: yes
 
@@ -502,7 +498,7 @@ sonarqube_cov_:
     - .default_variables
     - .quick_test_short_lived_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
   tags:
     - private_ci
     - controller
@@ -538,7 +534,7 @@ sonarqube_cov:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
   tags:
     - private_ci
     - controller
@@ -601,7 +597,7 @@ threadsanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   script:
     - LD_PRELOAD=/usr/local/lib/libomp.so
       CC=clang CXX=clang++
@@ -616,7 +612,7 @@ leaksanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   script:
     - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN
       -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer
@@ -627,7 +623,7 @@ addresssanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   script:
     - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN
       -DCTEST_MEMORYCHECK_TYPE=AddressSanitizer
@@ -638,7 +634,7 @@ undefinedsanitizer:
     - .default_variables
     - .deploy_condition
     - .before_script_template
-    - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   script:
     # the Gold linker is required because of a linker flag issues given by UBsan
     # in the Ubuntu setup we are using.

From 41b535a2c8cdea24ed2336db17a2f540bdf67a87 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 9 Oct 2024 07:33:40 +0000
Subject: [PATCH 185/448] [ci] fix circular-deps issues

---
 benchmark/CMakeLists.txt                    | 12 ++++++------
 reference/solver/batch_bicgstab_kernels.hpp |  2 ++
 reference/solver/batch_cg_kernels.hpp       |  2 ++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index e2479e02344..55ed76d1613 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -21,7 +21,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     # make the dependency public to catch issues
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
-    target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
+    target_link_libraries(cusparse_linops_${type} PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
     ginkgo_compile_features(cusparse_linops_${type})
 endfunction()
 
@@ -31,7 +31,7 @@ function(ginkgo_benchmark_hipsparse_linops type def)
     target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
-    target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
+    target_link_libraries(hipsparse_linops_${type} PRIVATE Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
     ginkgo_compile_features(hipsparse_linops_${type})
 endfunction()
 
@@ -118,7 +118,7 @@ if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
     ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     add_library(cuda_timer utils/cuda_timer.cpp)
-    target_link_libraries(cuda_timer ginkgo CUDA::cudart)
+    target_link_libraries(cuda_timer PRIVATE ginkgo CUDA::cudart)
     ginkgo_compile_features(cuda_timer)
 endif()
 if (GINKGO_BUILD_HIP)
@@ -128,7 +128,7 @@ if (GINKGO_BUILD_HIP)
     ginkgo_benchmark_hipsparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
     add_library(hip_timer utils/hip_timer.hip.cpp)
-    target_link_libraries(hip_timer ginkgo)
+    target_link_libraries(hip_timer PRIVATE ginkgo)
     ginkgo_compile_features(hip_timer)
 endif()
 
@@ -140,13 +140,13 @@ if (GINKGO_BUILD_SYCL)
     add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp)
     target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
     gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
-    target_link_libraries(dpcpp_timer ginkgo)
+    target_link_libraries(dpcpp_timer PRIVATE ginkgo)
     ginkgo_compile_features(dpcpp_timer)
 endif()
 
 if (GINKGO_BUILD_MPI)
     add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp)
-    target_link_libraries(mpi_timer ginkgo)
+    target_link_libraries(mpi_timer PRIVATE ginkgo)
     ginkgo_compile_features(mpi_timer)
 endif()
 
diff --git a/reference/solver/batch_bicgstab_kernels.hpp b/reference/solver/batch_bicgstab_kernels.hpp
index f91e06d2e44..85b1bed5ccd 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp
+++ b/reference/solver/batch_bicgstab_kernels.hpp
@@ -6,6 +6,8 @@
 #define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
 
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
diff --git a/reference/solver/batch_cg_kernels.hpp b/reference/solver/batch_cg_kernels.hpp
index d4a35e3d01a..2f8e5990931 100644
--- a/reference/solver/batch_cg_kernels.hpp
+++ b/reference/solver/batch_cg_kernels.hpp
@@ -6,6 +6,8 @@
 #define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
 
+#include "core/solver/batch_cg_kernels.hpp"
+
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"

From 9d0f7cf397e566e9f43d86d267cfc19949854edd Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 11 Oct 2024 10:00:02 +0200
Subject: [PATCH 186/448] [doc] remove windows ci badge

AFAIK there is no way to get a gitlab badge for a specific job, so there is no replacement.
---
 README.md | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 74fd6a0f57e..598b17e5b5b 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,8 @@
 |:-:|:-:|:-:|:-:|
 
 
-[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines?page=1&scope=branches&ref=master)|[![OSX-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml)|[![Windows-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/windows-msvc-ref.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/windows-msvc-ref.yml)
-|:-:|:-:|:-:|
-
-
-[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo)|[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![CDash dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](https://my.cdash.org/index.php?project=Ginkgo+Project)
-|:-:|:-:|:-:|:-:|
+[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines?page=1&scope=branches&ref=master)|[![OSX-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml)|[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo)|[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![CDash dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](https://my.cdash.org/index.php?project=Ginkgo+Project)
+|:-:|:-:|:-:|:-:|:-:|:-:|
 
 </div>
 

From 9566b2232cdf11acdbaafeb3c9dd1da0ff870929 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 11 Oct 2024 11:04:43 +0200
Subject: [PATCH 187/448] [ci] fix cuda memcheck

---
 cmake/CTestScript.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake
index 81ff86625d1..c24e4e4529a 100644
--- a/cmake/CTestScript.cmake
+++ b/cmake/CTestScript.cmake
@@ -153,11 +153,11 @@ ctest_submit(PARTS Start)
 
 if (CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
     # generate line number information for CUDA
-    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo")
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=OFF;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo")
 elseif((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
-    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 else()
-    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 endif()
 
 # UBSAN needs gold linker

From b2e39cccc17c26a0c7ba885bbb25ecd6d1e7031a Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 14 Oct 2024 15:07:29 +0200
Subject: [PATCH 188/448] [ci] move the majority of jobs to the full pipeline

---
 .gitlab-ci.yml    | 28 ++++++++++++++--------------
 .gitlab/rules.yml | 12 +++++++-----
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ef10b92e20d..18771d9bc2d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -92,7 +92,7 @@ trigger_pipeline:
 
 
 # cuda 11.4 and friends
-build/cuda114/nompi/gcc/cuda/debug/shared:
+build/cuda114/nompi/gcc/cuda/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
@@ -101,7 +101,7 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_TYPE: "Debug"
+    BUILD_TYPE: "Release"
     FAST_TESTS: "ON"
     # fix gtest issue https://github.com/google/googletest/issues/3514
     CXX_FLAGS: "-Wno-error=maybe-uninitialized"
@@ -114,7 +114,7 @@ build/nvhpc233/cuda120/nompi/nvcpp/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16
   variables:
     CXX_COMPILER: "nvc++"
@@ -133,7 +133,7 @@ build/nvhpc227/cuda117/nompi/nvcpp/debug/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14
   variables:
     CXX_COMPILER: "nvc++"
@@ -178,7 +178,7 @@ build/amd/nompi/clang/rocm45/debug/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-rocm45-nompi-gnu8-llvm8
   variables:
     CXX_COMPILER: "clang++"
@@ -203,7 +203,7 @@ build/amd/nompi/clang/rocm514/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     CXX_COMPILER: "clang++"
@@ -229,7 +229,7 @@ build/nocuda/nompi/gcc/core/debug/static:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     BUILD_TYPE: "Debug"
@@ -241,7 +241,7 @@ build/nocuda/nompi/clang/core/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     CXX_COMPILER: "clang++"
@@ -276,7 +276,7 @@ build/nocuda/openmpi/clang/omp/glibcxx-debug-release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     CXX_COMPILER: "clang++"
@@ -292,7 +292,7 @@ build/nocuda/nompi/gcc/omp/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     BUILD_OMP: "ON"
@@ -316,7 +316,7 @@ build/nocuda-nomixed/openmpi/gcc/omp/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-nocuda-nompi-gnu9-llvm8
   variables:
     BUILD_MPI: "ON"
@@ -410,7 +410,7 @@ build/windows-cuda/release/shared:
 
 build/windows/release/shared:
   extends:
-    - .quick_test_condition
+    - .full_test_condition
   stage: build
   script:
     - if (Test-Path build) { rm -r -fo build }
@@ -445,7 +445,7 @@ no-circular-deps:
   extends:
     - .build_template
     - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
@@ -496,7 +496,7 @@ sonarqube_cov_:
   stage: code_quality
   extends:
     - .default_variables
-    - .quick_test_short_lived_condition
+    - .full_test_short_lived_condition
     - .before_script_template
     - .use_gko_cuda114-openmpi-gnu10-llvm12
   tags:
diff --git a/.gitlab/rules.yml b/.gitlab/rules.yml
index 4afc04799bb..e60aaf7a66c 100644
--- a/.gitlab/rules.yml
+++ b/.gitlab/rules.yml
@@ -30,18 +30,20 @@
   dependencies: []
 
 
-.quick_test_condition:
+.full_test_short_lived_condition:
   rules:
-    - if: $RUN_CI_TAG && $STATUS_CONTEXT == null
+    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
+      when: never
+    - if: $RUN_CI_TAG && $STATUS_CONTEXT == "full"
   dependencies: []
 
-.quick_test_short_lived_condition:
+
+.quick_test_condition:
   rules:
-    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
-      when: never
     - if: $RUN_CI_TAG && $STATUS_CONTEXT == null
   dependencies: []
 
+
 .deploy_condition:
   rules:
     - if: $RUN_CI_TAG && ($CI_COMMIT_BRANCH == "master" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG) && $CI_PIPELINE_SOURCE != "schedule"

From 700737511ebdf2540fe22aebfe357a37de93281c Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 29 Aug 2024 09:51:02 +0200
Subject: [PATCH 189/448] [cmake] remove manipulation of HIP|ROCM_PATH

---
 CMakeLists.txt       |  1 -
 cmake/hip.cmake      | 94 +++++---------------------------------------
 cmake/hip_path.cmake | 13 ------
 3 files changed, 9 insertions(+), 99 deletions(-)
 delete mode 100644 cmake/hip_path.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21832c98592..6d0804b4eed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,6 @@ set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
 # Determine which modules can be compiled
-include(cmake/hip_path.cmake)
 include(cmake/autodetect_executors.cmake)
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/")
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bd834c3ebde..52f377ad6ca 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
 enable_language(HIP)
+
+# We keep using NVCC/HCC for consistency with previous releases even if AMD
+# updated everything to use NVIDIA/AMD in ROCM 4.1
+set(GINKGO_HIP_PLATFORM_NVCC 0)
+set(GINKGO_HIP_PLATFORM_HCC 0)
 if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
     set(GINKGO_HIP_PLATFORM "nvidia")
     set(GINKGO_HIP_PLATFORM_NVIDIA ON)
@@ -12,73 +17,6 @@ else()
     set(GINKGO_HIP_PLATFORM_HCC 1)
 endif()
 
-
-if(NOT DEFINED ROCM_PATH)
-    if(DEFINED ENV{ROCM_PATH})
-        set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
-    elseif(DEFINED ENV{HIP_PATH})
-        set(ROCM_PATH "$ENV{HIP_PATH}/.." CACHE PATH "Path to which ROCM has been installed")
-    else()
-        set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED HIPBLAS_PATH)
-    if(DEFINED ENV{HIPBLAS_PATH})
-        set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed")
-    else()
-        set(HIPBLAS_PATH "${ROCM_PATH}/hipblas" CACHE PATH "Path to which HIPBLAS has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED HIPFFT_PATH)
-    if(DEFINED ENV{HIPFFT_PATH})
-        set(HIPFFT_PATH $ENV{HIPFFT_PATH} CACHE PATH "Path to which HIPFFT has been installed")
-    else()
-        set(HIPFFT_PATH "${ROCM_PATH}/hipfft" CACHE PATH "Path to which HIPFFT has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED HIPRAND_PATH)
-    if(DEFINED ENV{HIPRAND_PATH})
-        set(HIPRAND_PATH $ENV{HIPRAND_PATH} CACHE PATH "Path to which HIPRAND has been installed")
-    else()
-        set(HIPRAND_PATH "${ROCM_PATH}/hiprand" CACHE PATH "Path to which HIPRAND has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED ROCRAND_PATH)
-    if(DEFINED ENV{ROCRAND_PATH})
-        set(ROCRAND_PATH $ENV{ROCRAND_PATH} CACHE PATH "Path to which ROCRAND has been installed")
-    else()
-        set(ROCRAND_PATH "${ROCM_PATH}/rocrand" CACHE PATH "Path to which ROCRAND has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED HIPSPARSE_PATH)
-    if(DEFINED ENV{HIPSPARSE_PATH})
-        set(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH} CACHE PATH "Path to which HIPSPARSE has been installed")
-    else()
-        set(HIPSPARSE_PATH "${ROCM_PATH}/hipsparse" CACHE PATH "Path to which HIPSPARSE has been installed")
-    endif()
-endif()
-
-if(NOT DEFINED HIP_CLANG_PATH)
-    if(NOT DEFINED ENV{HIP_CLANG_PATH})
-        set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed")
-    else()
-        set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed")
-    endif()
-endif()
-
-if(NOT DEFINED ROCTRACER_PATH)
-    if(DEFINED ENV{ROCTRACER_PATH})
-        set(ROCTRACER_PATH $ENV{ROCTRACER_PATH} CACHE PATH "Path to which ROCTRACER has been installed")
-    else()
-        set(ROCTRACER_PATH "${ROCM_PATH}/roctracer" CACHE PATH "Path to which ROCTRACER has been installed")
-    endif()
-endif()
-
 find_program(
     HIP_HIPCONFIG_EXECUTABLE
     NAMES hipconfig
@@ -97,24 +35,10 @@ if(NOT HIP_HIPCONFIG_EXECUTABLE)
 endif()
 
 execute_process(
-            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
-            OUTPUT_VARIABLE GINKGO_HIP_VERSION
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-            ERROR_STRIP_TRAILING_WHITESPACE
-            )
-
-## Setup all CMAKE variables to find HIP and its dependencies
-set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake")
-list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}")
-if (GINKGO_HIP_PLATFORM_AND)
-    list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake")
-endif()
-list(APPEND CMAKE_PREFIX_PATH
-    "${HIPBLAS_PATH}/lib/cmake"
-    "${HIPFFT_PATH}/lib/cmake"
-    "${HIPRAND_PATH}/lib/cmake"
-    "${HIPSPARSE_PATH}/lib/cmake"
-    "${ROCRAND_PATH}/lib/cmake"
+        COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
+        OUTPUT_VARIABLE GINKGO_HIP_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_STRIP_TRAILING_WHITESPACE
 )
 
 find_package(hipblas REQUIRED)
diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake
deleted file mode 100644
index a9f418cb3bd..00000000000
--- a/cmake/hip_path.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-if(NOT DEFINED HIP_PATH)
-    if(NOT DEFINED ENV{HIP_PATH})
-        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
-        set(ENV{HIP_PATH} ${HIP_PATH})
-    else()
-        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
-    endif()
-endif()
-
-# We keep using NVCC/HCC for consistency with previous releases even if AMD
-# updated everything to use NVIDIA/AMD in ROCM 4.1
-set(GINKGO_HIP_PLATFORM_NVCC 0)
-set(GINKGO_HIP_PLATFORM_HCC 0)

From a59e298ac4ca86514a0c35a5f5344b0e1f285e64 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 29 Aug 2024 09:52:13 +0200
Subject: [PATCH 190/448] [cmake] warn on faulty rocm CMake setup

---
 cmake/autodetect_executors.cmake |  3 +++
 cmake/hip.cmake                  | 30 ++++++----------------
 cmake/hip_helpers.cmake          | 43 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 23 deletions(-)
 create mode 100644 cmake/hip_helpers.cmake

diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake
index d3ad2e3a6a1..656e5096fc1 100644
--- a/cmake/autodetect_executors.cmake
+++ b/cmake/autodetect_executors.cmake
@@ -35,6 +35,9 @@ if (NOT DEFINED GINKGO_BUILD_HIP)
     if(CMAKE_HIP_COMPILER)
         message(STATUS "Enabling HIP executor")
         set(GINKGO_HAS_HIP ON)
+    else ()
+        include(cmake/hip_helpers.cmake)
+        ginkgo_check_hip_detection_issue()
     endif()
 endif()
 
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 52f377ad6ca..6a05933377f 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,4 +1,10 @@
 cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+
+include(cmake/hip_helpers.cmake)
+include(CheckLanguage)
+check_language(HIP)
+ginkgo_check_hip_detection_issue()
+
 enable_language(HIP)
 
 # We keep using NVCC/HCC for consistency with previous releases even if AMD
@@ -17,29 +23,7 @@ else()
     set(GINKGO_HIP_PLATFORM_HCC 1)
 endif()
 
-find_program(
-    HIP_HIPCONFIG_EXECUTABLE
-    NAMES hipconfig
-    PATHS
-    "${HIP_ROOT_DIR}"
-    ENV ROCM_PATH
-    ENV HIP_PATH
-    /opt/rocm
-    /opt/rocm/hip
-    PATH_SUFFIXES bin
-    NO_DEFAULT_PATH
-)
-if(NOT HIP_HIPCONFIG_EXECUTABLE)
-    # Now search in default paths
-    find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
-endif()
-
-execute_process(
-        COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
-        OUTPUT_VARIABLE GINKGO_HIP_VERSION
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_STRIP_TRAILING_WHITESPACE
-)
+ginkgo_find_hip_version()
 
 find_package(hipblas REQUIRED)
 find_package(hipfft) # optional dependency
diff --git a/cmake/hip_helpers.cmake b/cmake/hip_helpers.cmake
new file mode 100644
index 00000000000..cf9062bde41
--- /dev/null
+++ b/cmake/hip_helpers.cmake
@@ -0,0 +1,43 @@
+function(ginkgo_find_hip_version)
+    find_program(
+            HIP_HIPCONFIG_EXECUTABLE
+            NAMES hipconfig
+            PATHS
+            "${HIP_ROOT_DIR}"
+            ENV ROCM_PATH
+            ENV HIP_PATH
+            /opt/rocm
+            /opt/rocm/hip
+            PATH_SUFFIXES bin
+            NO_DEFAULT_PATH
+    )
+    if(NOT HIP_HIPCONFIG_EXECUTABLE)
+        # Now search in default paths
+        find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
+    endif()
+
+    execute_process(
+            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
+            OUTPUT_VARIABLE GINKGO_HIP_VERSION
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ERROR_STRIP_TRAILING_WHITESPACE
+    )
+    set(GINKGO_HIP_VERSION ${GINKGO_HIP_VERSION} PARENT_SCOPE)
+endfunction()
+
+# This function checks if ROCm might not be detected correctly.
+# ROCm < 5.7 has a faulty CMake setup that requires setting
+# CMAKE_PREFIX_PATH=$ROCM_PATH/lib/cmake, otherwise HIP will not be detected.
+function(ginkgo_check_hip_detection_issue)
+    if(NOT CMAKE_HIP_COMPILER)
+        ginkgo_find_hip_version()
+        if (GINKGO_HIP_VERSION AND GINKGO_HIP_VERSION VERSION_LESS 5.7)
+            message(WARNING
+                    "Could not find a HIP compiler, but HIP version ${GINKGO_HIP_VERSION} was detected through "
+                    "hipconfig. Try setting the environment variable CMAKE_PREFIX_PATH=$ROCM_PATH/lib/cmake, or "
+                    "update to ROCm >= 5.7."
+            )
+        endif ()
+    endif ()
+endfunction()
+

From a80b551585ff1883922fb9d9a25ba9db9e1a3fea Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 29 Aug 2024 10:33:45 +0200
Subject: [PATCH 191/448] [cmake] use cmakedefine01 for hip platform

---
 include/ginkgo/config.hpp.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 329918399d6..1dfa6bc61bc 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -41,10 +41,10 @@
 
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
-#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
+#cmakedefine01 GINKGO_HIP_PLATFORM_HCC
 
 
-#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@
+#cmakedefine01 GINKGO_HIP_PLATFORM_NVCC
 // clang-format on
 
 

From ad717eae6a9eee975d8011fdd5405e0f17bf112f Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 15 Oct 2024 06:34:51 +0000
Subject: [PATCH 192/448] fixup! [cmake] remove manipulation of HIP|ROCM_PATH

---
 cmake/GinkgoConfig.cmake.in | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index 1f12251f93d..a5ead102c23 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -135,7 +135,7 @@ set(GINKGO_HAVE_VTUNE "@GINKGO_HAVE_VTUNE@")
 set(GINKGO_HAVE_METIS "@GINKGO_HAVE_METIS@")
 set_and_check(VTune_PATH "@VTune_PATH@")
 
-# ensure Threads settings 
+# ensure Threads settings
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
 # NOTE: we do not export benchmarks, examples, tests or devel tools
@@ -176,7 +176,6 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
     find_dependency(hipsparse)
     find_dependency(rocrand)
     find_dependency(rocthrust)
-    set_and_check(ROCTRACER_PATH "@ROCTRACER_PATH@")
     find_dependency(ROCTX)
 endif()
 

From 1bc2ec10460163a0720a30dc3421a3c78a68ae99 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 15 Oct 2024 07:32:31 +0000
Subject: [PATCH 193/448] [cmake] use ROCM_PATH for finding roctx

---
 cmake/Modules/FindROCTX.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/Modules/FindROCTX.cmake b/cmake/Modules/FindROCTX.cmake
index e5647080ca3..1bcb344c6d9 100644
--- a/cmake/Modules/FindROCTX.cmake
+++ b/cmake/Modules/FindROCTX.cmake
@@ -26,11 +26,11 @@
 # ``ROCTX_FOUND``
 #   If false, do not try to use the ROCTX library.
 
-find_path(ROCTX_INCLUDE_DIR NAMES roctx.h HINTS ${ROCTRACER_PATH}/include)
+find_path(ROCTX_INCLUDE_DIR NAMES roctx.h HINTS ${ROCTRACER_PATH}/include ${ROCM_PATH}/include/roctracer)
 mark_as_advanced(ROCTX_INCLUDE_DIR)
 
 if(NOT ROCTX_LIBRARY)
-    find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCTRACER_PATH}/lib)
+    find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCTRACER_PATH}/lib ${ROCM_PATH}/lib)
 endif()
 
 include(FindPackageHandleStandardArgs)

From 4f79e370d6fff56b6c4c68b18fff446e2ed0a2e1 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 15 Oct 2024 07:59:48 +0000
Subject: [PATCH 194/448] [dist] use xstd::void_t again in
 is_matrix_type_builder

This works around a (likely) GCC 7.5 bug when using std::void_t.
---
 include/ginkgo/core/base/std_extensions.hpp | 18 +++++++++++++++++-
 include/ginkgo/core/distributed/matrix.hpp  |  3 ++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp
index 893b2b0d865..a950fcc2003 100644
--- a/include/ginkgo/core/base/std_extensions.hpp
+++ b/include/ginkgo/core/base/std_extensions.hpp
@@ -27,8 +27,24 @@ namespace gko {
  * @ingroup xstd
  */
 namespace xstd {
+namespace detail {
+
+
+template <typename... Ts>
+struct make_void {
+    using type = void;
+};
+
+
+}  // namespace detail
+
+
+/**
+ * Use the custom implementation, since the std::void_t used in
+ * is_matrix_type_builder seems to trigger a compiler bug in GCC 7.5.
+ */
 template <typename... Ts>
-using void_t = std::void_t<Ts...>;
+using void_t = typename detail::make_void<Ts...>::type;
 
 
 GKO_DEPRECATED("use std::uncaught_exceptions")
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 1e5e33581a9..de719bb9315 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -14,6 +14,7 @@
 
 #include <ginkgo/core/base/dense_cache.hpp>
 #include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/distributed/base.hpp>
 #include <ginkgo/core/distributed/index_map.hpp>
 #include <ginkgo/core/distributed/lin_op.hpp>
@@ -55,7 +56,7 @@ struct is_matrix_type_builder : std::false_type {};
 template <typename Builder, typename ValueType, typename IndexType>
 struct is_matrix_type_builder<
     Builder, ValueType, IndexType,
-    std::void_t<
+    xstd::void_t<
         decltype(std::declval<Builder>().template create<ValueType, IndexType>(
             std::declval<std::shared_ptr<const Executor>>()))>>
     : std::true_type {};

From 14c5610983c81251aa8ccdd248bb5976deb9b7a5 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 16 Oct 2024 09:55:30 +0000
Subject: [PATCH 195/448] [test] fix compiler error with nvhpc

---
 test/mpi/matrix.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 1c090b6c43f..f4b8af2fb19 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -207,6 +207,8 @@ TYPED_TEST(MatrixCreation, BuildFromExistingData)
     using Partition = typename TestFixture::Partition;
     using local_index_type = typename TestFixture::local_index_type;
     using matrix_data = gko::matrix_data<value_type, local_index_type>;
+    using input_triple =
+        gko::detail::input_triple<value_type, local_index_type>;
     using dist_mtx_type = typename TestFixture::dist_mtx_type;
     using dist_vec_type = gko::experimental::distributed::Vector<value_type>;
     using comm_index_type = gko::experimental::distributed::comm_index_type;
@@ -214,18 +216,16 @@ TYPED_TEST(MatrixCreation, BuildFromExistingData)
     I<I<value_type>> res_local[] = {{{2, 0}, {0, 0}}, {{0, 5}, {0, 0}}, {{0}}};
     std::array<gko::dim<2>, 3> size_local{{{2, 2}, {2, 2}, {1, 1}}};
     std::array<matrix_data, 3> dist_input_local{
-        {{size_local[0], {{0, 0, 2}}},
-         {size_local[1], {{0, 1, 5}}},
-         {size_local[2],
-          std::initializer_list<
-              gko::detail::input_triple<value_type, local_index_type>>{}}}};
+        {{size_local[0], I<input_triple>{{0, 0, 2}}},
+         {size_local[1], I<input_triple>{{0, 1, 5}}},
+         {size_local[2]}}};
     I<I<value_type>> res_non_local[] = {
         {{1, 0}, {3, 4}}, {{0, 0, 6}, {8, 7, 0}}, {{10, 9}}};
     std::array<gko::dim<2>, 3> size_non_local{{{2, 2}, {2, 3}, {1, 2}}};
     std::array<matrix_data, 3> dist_input_non_local{
-        {{size_non_local[0], {{0, 0, 1}, {1, 0, 3}, {1, 1, 4}}},
-         {size_non_local[1], {{0, 2, 6}, {1, 0, 8}, {1, 1, 7}}},
-         {size_non_local[2], {{0, 0, 10}, {0, 1, 9}}}}};
+        {{size_non_local[0], I<input_triple>{{0, 0, 1}, {1, 0, 3}, {1, 1, 4}}},
+         {size_non_local[1], I<input_triple>{{0, 2, 6}, {1, 0, 8}, {1, 1, 7}}},
+         {size_non_local[2], I<input_triple>{{0, 0, 10}, {0, 1, 9}}}}};
     std::array<std::vector<comm_index_type>, 3> recv_sizes{
         {{0, 1, 1}, {2, 0, 1}, {1, 1, 0}}};
     std::array<std::vector<comm_index_type>, 3> recv_offsets{

From a25c9d64b706bd34356427b208d201e3a864f567 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 15 Oct 2024 07:38:32 +0000
Subject: [PATCH 196/448] [core] use only `constexpr` for math functions

---
 include/ginkgo/core/base/math.hpp | 172 ++++--------------------------
 1 file changed, 22 insertions(+), 150 deletions(-)

diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index f6847743717..33b3a566b37 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -283,7 +283,7 @@ using is_complex_s = detail::is_complex_impl<T>;
  * @return `true` if T is a complex type, `false` otherwise
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex()
+GKO_INLINE constexpr bool is_complex()
 {
     return detail::is_complex_impl<T>::value;
 }
@@ -307,7 +307,7 @@ using is_complex_or_scalar_s = detail::is_complex_or_scalar_impl<T>;
  * @return `true` if T is a complex/scalar type, `false` otherwise
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex_or_scalar()
+GKO_INLINE constexpr bool is_complex_or_scalar()
 {
     return detail::is_complex_or_scalar_impl<T>::value;
 }
@@ -511,7 +511,7 @@ using highest_precision =
  * @return the rounded down value
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr reduce_precision<T> round_down(T val)
+GKO_INLINE constexpr reduce_precision<T> round_down(T val)
 {
     return static_cast<reduce_precision<T>>(val);
 }
@@ -527,7 +527,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr reduce_precision<T> round_down(T val)
  * @return the rounded up value
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr increase_precision<T> round_up(T val)
+GKO_INLINE constexpr increase_precision<T> round_up(T val)
 {
     return static_cast<increase_precision<T>>(val);
 }
@@ -609,141 +609,19 @@ struct default_converter {
  *
  * @return returns the ceiled quotient.
  */
-GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den)
+GKO_INLINE constexpr int64 ceildiv(int64 num, int64 den)
 {
     return (num + den - 1) / den;
 }
 
 
-#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
-
-
-/**
- * Returns the additive identity for T.
- *
- * @return additive identity for T
- */
-template <typename T>
-GKO_INLINE __host__ constexpr T zero()
-{
-    return T{};
-}
-
-
-/**
- * Returns the additive identity for T.
- *
- * @return additive identity for T
- *
- * @note This version takes an unused reference argument to avoid
- *       complicated calls like `zero<decltype(x)>()`. Instead, it allows
- *       `zero(x)`.
- */
-template <typename T>
-GKO_INLINE __host__ constexpr T zero(const T&)
-{
-    return zero<T>();
-}
-
-
-/**
- * Returns the multiplicative identity for T.
- *
- * @return the multiplicative identity for T
- */
-template <typename T>
-GKO_INLINE __host__ constexpr T one()
-{
-    return T(1);
-}
-
-
-/**
- * Returns the multiplicative identity for T.
- *
- * @return the multiplicative identity for T
- *
- * @note This version takes an unused reference argument to avoid
- *       complicated calls like `one<decltype(x)>()`. Instead, it allows
- *       `one(x)`.
- */
-template <typename T>
-GKO_INLINE __host__ constexpr T one(const T&)
-{
-    return one<T>();
-}
-
-
-/**
- * Returns the additive identity for T.
- *
- * @return additive identity for T
- */
-template <typename T>
-GKO_INLINE __device__ constexpr std::enable_if_t<
-    !std::is_same<T, std::complex<remove_complex<T>>>::value, T>
-zero()
-{
-    return T{};
-}
-
-
-/**
- * Returns the additive identity for T.
- *
- * @return additive identity for T
- *
- * @note This version takes an unused reference argument to avoid
- *       complicated calls like `zero<decltype(x)>()`. Instead, it allows
- *       `zero(x)`.
- */
-template <typename T>
-GKO_INLINE __device__ constexpr T zero(const T&)
-{
-    return zero<T>();
-}
-
-
-/**
- * Returns the multiplicative identity for T.
- *
- * @return the multiplicative identity for T
- */
-template <typename T>
-GKO_INLINE __device__ constexpr std::enable_if_t<
-    !std::is_same<T, std::complex<remove_complex<T>>>::value, T>
-one()
-{
-    return T(1);
-}
-
-
-/**
- * Returns the multiplicative identity for T.
- *
- * @return the multiplicative identity for T
- *
- * @note This version takes an unused reference argument to avoid
- *       complicated calls like `one<decltype(x)>()`. Instead, it allows
- *       `one(x)`.
- */
-template <typename T>
-GKO_INLINE __device__ constexpr T one(const T&)
-{
-    return one<T>();
-}
-
-
-#else
-
-
 /**
  * Returns the additive identity for T.
  *
  * @return additive identity for T
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T zero()
+GKO_INLINE constexpr T zero()
 {
     return T{};
 }
@@ -759,7 +637,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero()
  *       `zero(x)`.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&)
+GKO_INLINE constexpr T zero(const T&)
 {
     return zero<T>();
 }
@@ -771,7 +649,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&)
  * @return the multiplicative identity for T
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T one()
+GKO_INLINE constexpr T one()
 {
     return T(1);
 }
@@ -787,15 +665,12 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one()
  *       `one(x)`.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&)
+GKO_INLINE constexpr T one(const T&)
 {
     return one<T>();
 }
 
 
-#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
-
-
 #undef GKO_BIND_ZERO_ONE
 
 
@@ -808,7 +683,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&)
  * @return true iff the given value is zero, i.e. `value == zero<T>()`
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr bool is_zero(T value)
+GKO_INLINE constexpr bool is_zero(T value)
 {
     return value == zero<T>();
 }
@@ -823,7 +698,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_zero(T value)
  * @return true iff the given value is not zero, i.e. `value != zero<T>()`
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr bool is_nonzero(T value)
+GKO_INLINE constexpr bool is_nonzero(T value)
 {
     return value != zero<T>();
 }
@@ -841,7 +716,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_nonzero(T value)
  *
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T max(const T& x, const T& y)
+GKO_INLINE constexpr T max(const T& x, const T& y)
 {
     return x >= y ? x : y;
 }
@@ -859,7 +734,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T max(const T& x, const T& y)
  *
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T min(const T& x, const T& y)
+GKO_INLINE constexpr T min(const T& x, const T& y)
 {
     return x <= y ? x : y;
 }
@@ -1053,7 +928,7 @@ GKO_ATTRIBUTES GKO_INLINE constexpr auto conj(const T& x)
  * @return  The squared norm of the object.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x)
+GKO_INLINE constexpr auto squared_norm(const T& x)
     -> decltype(real(conj(x) * x))
 {
     return real(conj(x) * x);
@@ -1070,16 +945,15 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x)
  * @return x >= zero<T>() ? x : -x;
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<!is_complex_s<T>::value, T>
-abs(const T& x)
+GKO_INLINE constexpr std::enable_if_t<!is_complex_s<T>::value, T> abs(
+    const T& x)
 {
     return x >= zero<T>() ? x : -x;
 }
 
 
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<is_complex_s<T>::value,
-                                                     remove_complex<T>>
+GKO_INLINE constexpr std::enable_if_t<is_complex_s<T>::value, remove_complex<T>>
 abs(const T& x)
 {
     return sqrt(squared_norm(x));
@@ -1092,7 +966,7 @@ abs(const T& x)
  * @tparam T  the value type to return
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr T pi()
+GKO_INLINE constexpr T pi()
 {
     return static_cast<T>(3.1415926535897932384626433);
 }
@@ -1107,8 +981,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T pi()
  * @tparam T  the corresponding real value type.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr std::complex<remove_complex<T>> unit_root(
-    int64 n, int64 k = 1)
+GKO_INLINE constexpr std::complex<remove_complex<T>> unit_root(int64 n,
+                                                               int64 k = 1)
 {
     return std::polar(one<remove_complex<T>>(),
                       remove_complex<T>{2} * pi<remove_complex<T>>() * k / n);
@@ -1259,8 +1133,7 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
  * @return NaN.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<!is_complex_s<T>::value, T>
-nan()
+GKO_INLINE constexpr std::enable_if_t<!is_complex_s<T>::value, T> nan()
 {
     return std::numeric_limits<T>::quiet_NaN();
 }
@@ -1274,8 +1147,7 @@ nan()
  * @return complex{NaN, NaN}.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t<is_complex_s<T>::value, T>
-nan()
+GKO_INLINE constexpr std::enable_if_t<is_complex_s<T>::value, T> nan()
 {
     return T{nan<remove_complex<T>>(), nan<remove_complex<T>>()};
 }

From 8381a24ea5e614f7585063206b0e2850fed0e17f Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 15 Oct 2024 07:38:51 +0000
Subject: [PATCH 197/448] [core] remove unused #undef

---
 include/ginkgo/core/base/math.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 33b3a566b37..cd5e489b95d 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -671,9 +671,6 @@ GKO_INLINE constexpr T one(const T&)
 }
 
 
-#undef GKO_BIND_ZERO_ONE
-
-
 /**
  * Returns true if and only if the given value is zero.
  *

From 4b48ecb18e7f8bbecf69f63c769031d624007d5a Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 21 Oct 2024 14:00:23 +0200
Subject: [PATCH 198/448] temporarily disable oneAPI CI jobs

The runners are down for a short while
---
 .gitlab-ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 18771d9bc2d..cc67883c4b3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -344,6 +344,7 @@ build/icpx20231/igpu/release/shared:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
+    - .disable_job_condition
     - .use_gko-oneapi20231-igpu
   variables:
     CXX_COMPILER: "icpx"
@@ -377,6 +378,7 @@ build/icpx/igpu/release/static:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
+    - .disable_job_condition
     - .use_gko-oneapi-igpu
   variables:
     CXX_COMPILER: "dpcpp"

From 068c9ebac84c25f7fd9b0ae3f790aa083bcb5e6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Mon, 8 Apr 2024 18:34:10 +0200
Subject: [PATCH 199/448] Simplify the testing types

Add template type functions to combine and merge type lists.
As a result, the GINKGO_DPCPP_SINGLE_MODE check only needs to appear once.

Additionally, change the type name ValueAndIndexTypes to ComplexAndPODTypes
(because gko::size_type is not an IndexType).
---
 core/test/base/array.cpp                      |   2 +-
 core/test/base/iterator_factory.cpp           |   2 +-
 core/test/utils.hpp                           | 262 ++++++++++--------
 core/test/utils/CMakeLists.txt                |   1 +
 core/test/utils/utils_test.cpp                | 236 ++++++++++++++++
 cuda/test/base/array.cpp                      |   2 +-
 reference/test/base/array.cpp                 |   2 +-
 .../test/components/fill_array_kernels.cpp    |   2 +-
 .../test/components/reduce_array_kernels.cpp  |   2 +-
 test/components/fill_array_kernels.cpp        |   2 +-
 test/components/reduce_array_kernels.cpp      |   2 +-
 11 files changed, 387 insertions(+), 128 deletions(-)
 create mode 100644 core/test/utils/utils_test.cpp

diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 71816f690ce..f7e03855d06 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -40,7 +40,7 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ValueAndIndexTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor)
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index c4dc30bf219..bbc3bbfd04f 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -366,7 +366,7 @@ class PermuteIterator : public ::testing::Test {
     using value_type = ValueType;
 };
 
-TYPED_TEST_SUITE(PermuteIterator, gko::test::ValueAndIndexTypes,
+TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index 43ded30cde5..d711e6310e3 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -29,154 +29,176 @@
 
 namespace gko {
 namespace test {
+namespace detail {
+
+
+template <typename LeftList, typename RightList, typename... Result>
+struct cartesian_type_product {};
+
+template <template <typename...> class OuterWrapper, typename FirstLeft,
+          typename... RemainingLeftArgs, typename... RightArgs,
+          typename... Result>
+struct cartesian_type_product<OuterWrapper<FirstLeft, RemainingLeftArgs...>,
+                              OuterWrapper<RightArgs...>, Result...>
+    : cartesian_type_product<OuterWrapper<RemainingLeftArgs...>,
+                             OuterWrapper<RightArgs...>, Result...,
+                             std::tuple<FirstLeft, RightArgs>...> {};
+
+template <template <typename...> class OuterWrapper, typename... RightArgs,
+          typename... Result>
+struct cartesian_type_product<OuterWrapper<>, OuterWrapper<RightArgs...>,
+                              Result...> {
+    using type = OuterWrapper<Result...>;
+};
 
+template <typename ExistingCombinationList, typename NewElementList,
+          typename... Result>
+struct add_to_cartesian_type_product {};
+
+template <template <typename...> class OuterWrapper,
+          typename... CurrentCombinationArgs,
+          typename... RemainingOldCombinations, typename... NewElementArgs,
+          typename... Result>
+struct add_to_cartesian_type_product<
+    OuterWrapper<std::tuple<CurrentCombinationArgs...>,
+                 RemainingOldCombinations...>,
+    OuterWrapper<NewElementArgs...>, Result...>
+    : add_to_cartesian_type_product<
+          OuterWrapper<RemainingOldCombinations...>,
+          OuterWrapper<NewElementArgs...>, Result...,
+          std::tuple<CurrentCombinationArgs..., NewElementArgs>...> {};
+
+template <template <typename...> class OuterWrapper, typename... NewElementArgs,
+          typename... Result>
+struct add_to_cartesian_type_product<
+    OuterWrapper<>, OuterWrapper<NewElementArgs...>, Result...> {
+    using type = OuterWrapper<Result...>;
+};
 
-using ValueTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<float, std::complex<float>>;
-#else
-    ::testing::Types<float, double, std::complex<float>, std::complex<double>>;
-#endif
+template <typename NewElementList, typename ExistingCombinationList,
+          typename... Result>
+struct add_to_cartesian_type_product_left {};
+
+template <template <typename...> class OuterWrapper, typename... NewElementArgs,
+          typename... CurrentCombinationArgs,
+          typename... RemainingOldCombinations, typename... Result>
+struct add_to_cartesian_type_product_left<
+    OuterWrapper<NewElementArgs...>,
+    OuterWrapper<std::tuple<CurrentCombinationArgs...>,
+                 RemainingOldCombinations...>,
+    Result...>
+    : add_to_cartesian_type_product_left<
+          OuterWrapper<NewElementArgs...>,
+          OuterWrapper<RemainingOldCombinations...>, Result...,
+          std::tuple<NewElementArgs, CurrentCombinationArgs...>...> {};
+
+template <template <typename...> class OuterWrapper, typename... NewElementArgs,
+          typename... Result>
+struct add_to_cartesian_type_product_left<OuterWrapper<NewElementArgs...>,
+                                          OuterWrapper<>, Result...> {
+    using type = OuterWrapper<Result...>;
+};
 
-using ComplexValueTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<std::complex<float>>;
-#else
-    ::testing::Types<std::complex<float>, std::complex<double>>;
-#endif
+template <typename FirstList, typename SecondList>
+struct merge_type_list {};
 
-using RealValueTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<float>;
-#else
-    ::testing::Types<float, double>;
-#endif
+template <template <typename...> class OuterWrapper, typename... Args1,
+          typename... Args2>
+struct merge_type_list<OuterWrapper<Args1...>, OuterWrapper<Args2...>> {
+    using type = OuterWrapper<Args1..., Args2...>;
+};
 
 
-using IndexTypes = ::testing::Types<gko::int32, gko::int64>;
+template <template <typename...> class NewOuterWrapper,
+          typename OldOuterWrapper>
+struct change_outer_wrapper {};
 
+template <template <typename...> class NewOuterWrapper,
+          template <typename...> class OldOuterWrapper, typename... Args>
+struct change_outer_wrapper<NewOuterWrapper, OldOuterWrapper<Args...>> {
+    using type = NewOuterWrapper<Args...>;
+};
 
-using LocalGlobalIndexTypes =
-    ::testing::Types<std::tuple<int32, int32>, std::tuple<int32, int64>,
-                     std::tuple<int64, int64>>;
 
+template <template <typename...> class NewInnerWrapper, typename ListType>
+struct add_internal_wrapper {};
 
-using PODTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<float, gko::int32, gko::int64>;
-#else
-    ::testing::Types<float, double, gko::int32, gko::int64>;
-#endif
+template <template <typename...> class NewInnerWrapper,
+          template <typename...> class OuterWrapper, typename... Args>
+struct add_internal_wrapper<NewInnerWrapper, OuterWrapper<Args...>> {
+    using type = OuterWrapper<NewInnerWrapper<Args>...>;
+};
 
 
-using ValueAndIndexTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<float, std::complex<float>, gko::int32, gko::int64,
-                     gko::size_type>;
-#else
-    ::testing::Types<float, double, std::complex<float>, std::complex<double>,
-                     gko::int32, gko::int64, gko::size_type>;
-#endif
+}  // namespace detail
 
 
-using RealValueAndIndexTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<float, gko::int32, gko::int64, gko::size_type>;
-#else
-    ::testing::Types<float, double, gko::int32, gko::int64, gko::size_type>;
-#endif
+template <typename LeftList, typename RightList>
+using cartesian_type_product_t =
+    typename detail::cartesian_type_product<LeftList, RightList>::type;
 
+template <typename ExistingCombinationList, typename NewElementList>
+using add_to_cartesian_type_product_t =
+    typename detail::add_to_cartesian_type_product<ExistingCombinationList,
+                                                   NewElementList>::type;
 
-using ValueIndexTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<std::tuple<float, gko::int32>,
-                     std::tuple<std::complex<float>, gko::int32>,
-                     std::tuple<float, gko::int64>,
-                     std::tuple<std::complex<float>, gko::int64>>;
-#else
-    ::testing::Types<
-        std::tuple<float, gko::int32>, std::tuple<double, gko::int32>,
-        std::tuple<std::complex<float>, gko::int32>,
-        std::tuple<std::complex<double>, gko::int32>,
-        std::tuple<float, gko::int64>, std::tuple<double, gko::int64>,
-        std::tuple<std::complex<float>, gko::int64>,
-        std::tuple<std::complex<double>, gko::int64>>;
-#endif
+template <typename NewElementList, typename ExistingCombinationList>
+using add_to_cartesian_type_product_left_t =
+    typename detail::add_to_cartesian_type_product_left<
+        NewElementList, ExistingCombinationList>::type;
 
+template <typename FirstList, typename SecondList>
+using merge_type_list_t =
+    typename detail::merge_type_list<FirstList, SecondList>::type;
 
-using RealValueIndexTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<std::tuple<float, gko::int32>,
-                     std::tuple<float, gko::int64>>;
-#else
-    ::testing::Types<
-        std::tuple<float, gko::int32>, std::tuple<double, gko::int32>,
-        std::tuple<float, gko::int64>, std::tuple<double, gko::int64>>;
-#endif
+template <template <typename...> class NewInnerWrapper, typename ListType>
+using add_internal_wrapper_t =
+    typename detail::add_internal_wrapper<NewInnerWrapper, ListType>::type;
 
+template <template <typename...> class NewOuterWrapper, typename ListType>
+using change_outer_wrapper_t =
+    typename detail::change_outer_wrapper<NewOuterWrapper, ListType>::type;
 
-using ComplexValueIndexTypes =
+
+using RealValueTypes =
 #if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<std::tuple<std::complex<float>, gko::int32>,
-                     std::tuple<std::complex<float>, gko::int64>>;
+    ::testing::Types<float>;
 #else
-    ::testing::Types<std::tuple<std::complex<float>, gko::int32>,
-                     std::tuple<std::complex<double>, gko::int32>,
-                     std::tuple<std::complex<float>, gko::int64>,
-                     std::tuple<std::complex<double>, gko::int64>>;
+    ::testing::Types<float, double>;
 #endif
 
+using ComplexValueTypes = add_internal_wrapper_t<std::complex, RealValueTypes>;
 
-using TwoValueIndexType =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<
-        std::tuple<float, float, gko::int32>,
-        std::tuple<std::complex<float>, std::complex<float>, gko::int32>,
-        std::tuple<float, float, gko::int64>,
-        std::tuple<std::complex<float>, std::complex<float>, gko::int64>>;
-#else
-    ::testing::Types<
-        std::tuple<float, float, gko::int32>,
-        std::tuple<float, double, gko::int32>,
-        std::tuple<double, double, gko::int32>,
-        std::tuple<double, float, gko::int32>,
-        std::tuple<std::complex<float>, std::complex<float>, gko::int32>,
-        std::tuple<std::complex<float>, std::complex<double>, gko::int32>,
-        std::tuple<std::complex<double>, std::complex<double>, gko::int32>,
-        std::tuple<std::complex<double>, std::complex<float>, gko::int32>,
-        std::tuple<float, float, gko::int64>,
-        std::tuple<float, double, gko::int64>,
-        std::tuple<double, double, gko::int64>,
-        std::tuple<double, float, gko::int64>,
-        std::tuple<std::complex<float>, std::complex<float>, gko::int64>,
-        std::tuple<std::complex<float>, std::complex<double>, gko::int64>,
-        std::tuple<std::complex<double>, std::complex<double>, gko::int64>,
-        std::tuple<std::complex<double>, std::complex<float>, gko::int64>>;
-#endif
+using ValueTypes = merge_type_list_t<RealValueTypes, ComplexValueTypes>;
 
+using IndexTypes = ::testing::Types<int32, int64>;
+
+using LocalGlobalIndexTypes =
+    ::testing::Types<std::tuple<int32, int32>, std::tuple<int32, int64>,
+                     std::tuple<int64, int64>>;
+
+using PODTypes =
+    merge_type_list_t<merge_type_list_t<RealValueTypes, IndexTypes>,
+                      ::testing::Types<size_type>>;
+
+using ComplexAndPODTypes = merge_type_list_t<ComplexValueTypes, PODTypes>;
+
+using ValueIndexTypes = cartesian_type_product_t<ValueTypes, IndexTypes>;
+
+using RealValueIndexTypes =
+    cartesian_type_product_t<RealValueTypes, IndexTypes>;
+
+using ComplexValueIndexTypes =
+    cartesian_type_product_t<ComplexValueTypes, IndexTypes>;
+
+using TwoValueIndexType = add_to_cartesian_type_product_t<
+    merge_type_list_t<
+        cartesian_type_product_t<RealValueTypes, RealValueTypes>,
+        cartesian_type_product_t<ComplexValueTypes, ComplexValueTypes>>,
+    IndexTypes>;
 
 using ValueLocalGlobalIndexTypes =
-#if GINKGO_DPCPP_SINGLE_MODE
-    ::testing::Types<std::tuple<float, gko::int32, int32>,
-                     std::tuple<float, gko::int32, int64>,
-                     std::tuple<float, gko::int64, int64>,
-                     std::tuple<std::complex<float>, gko::int32, int32>,
-                     std::tuple<std::complex<float>, gko::int32, int64>,
-                     std::tuple<std::complex<float>, gko::int64, int64>>;
-#else
-    ::testing::Types<std::tuple<float, gko::int32, int32>,
-                     std::tuple<float, gko::int32, int64>,
-                     std::tuple<float, gko::int64, int64>,
-                     std::tuple<double, gko::int32, int32>,
-                     std::tuple<double, gko::int32, int64>,
-                     std::tuple<double, gko::int64, int64>,
-                     std::tuple<std::complex<float>, gko::int32, int32>,
-                     std::tuple<std::complex<float>, gko::int32, int64>,
-                     std::tuple<std::complex<float>, gko::int64, int64>,
-                     std::tuple<std::complex<double>, gko::int32, int32>,
-                     std::tuple<std::complex<double>, gko::int32, int64>,
-                     std::tuple<std::complex<double>, gko::int64, int64>>;
-#endif
+    add_to_cartesian_type_product_left_t<ValueTypes, LocalGlobalIndexTypes>;
 
 
 template <typename Precision, typename OutputType>
diff --git a/core/test/utils/CMakeLists.txt b/core/test/utils/CMakeLists.txt
index 88c61456280..af6de8ad1a3 100644
--- a/core/test/utils/CMakeLists.txt
+++ b/core/test/utils/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(utils_test)
 ginkgo_create_test(array_generator_test)
 ginkgo_create_test(assertions_test)
 ginkgo_create_test(matrix_generator_test)
diff --git a/core/test/utils/utils_test.cpp b/core/test/utils/utils_test.cpp
new file mode 100644
index 00000000000..9184e56b4e2
--- /dev/null
+++ b/core/test/utils/utils_test.cpp
@@ -0,0 +1,236 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <complex>
+#include <tuple>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+using i_type = std::integral_constant<int, 42>;
+using t_type = std::tuple<int>;
+
+using testing_types1 = testing::Types<double>;
+using testing_types2 = testing::Types<t_type, int>;
+using testing_types3 = testing::Types<i_type, short, float>;
+
+using tuple_types1 = std::tuple<double>;
+using tuple_types2 = std::tuple<t_type, int>;
+using tuple_types3 = std::tuple<i_type, short, float>;
+
+template <typename... Args>
+struct type_list {};
+
+
+TEST(TypeListHelper, ChangeOuterWrapper1)
+{
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<std::tuple, testing_types1>,
+        tuple_types1>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<std::tuple, testing_types2>,
+        tuple_types2>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<std::tuple, testing_types3>,
+        tuple_types3>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<testing::Types, tuple_types1>,
+        testing_types1>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<testing::Types, tuple_types2>,
+        testing_types2>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<testing::Types, tuple_types3>,
+        testing_types3>();
+}
+
+
+TEST(TypeListHelper, ChangeOuterWrapper2)
+{
+    using alternative_list1 = type_list<i_type, t_type, double>;
+    using expected_ow2 = testing::Types<i_type, t_type, double>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<testing::Types, alternative_list1>,
+        expected_ow2>();
+}
+
+
+TEST(TypeListHelper, AddInternalWrapperTuple)
+{
+    using expected_iw1 = testing::Types<std::tuple<i_type>, std::tuple<short>,
+                                        std::tuple<float>>;
+    testing::StaticAssertTypeEq<
+        gko::test::add_internal_wrapper_t<std::tuple, testing_types3>,
+        expected_iw1>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_internal_wrapper_t<std::tuple, tuple_types3>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_iw1>>();
+}
+
+
+TEST(TypeListHelper, AddInternalWrapperComplex)
+{
+    using expected_iw2 = testing::Types<std::complex<double>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_internal_wrapper_t<std::complex, testing_types1>,
+        expected_iw2>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_internal_wrapper_t<std::complex, tuple_types1>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_iw2>>();
+}
+
+
+TEST(TypeListHelper, MergeTypeListLarge)
+{
+    using expected_m1 = testing::Types<i_type, short, float, t_type, int>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<testing_types3, testing_types2>,
+        expected_m1>();
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<tuple_types3, tuple_types2>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_m1>>();
+}
+
+
+TEST(TypeListHelper, MergeTypeListEmpty)
+{
+    using expected_m2 = testing::Types<double>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<testing_types1, testing::Types<>>,
+        expected_m2>();
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<tuple_types1, std::tuple<>>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_m2>>();
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<testing::Types<>, testing_types1>,
+        expected_m2>();
+    testing::StaticAssertTypeEq<
+        gko::test::merge_type_list_t<std::tuple<>, tuple_types1>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_m2>>();
+}
+
+
+TEST(TypeListHelper, CartesianTypeProductLarge)
+{
+    using expected_c1 =
+        testing::Types<std::tuple<t_type, i_type>, std::tuple<t_type, short>,
+                       std::tuple<t_type, float>, std::tuple<int, i_type>,
+                       std::tuple<int, short>, std::tuple<int, float>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<testing_types2, testing_types3>,
+        expected_c1>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<tuple_types2, tuple_types3>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_c1>>();
+}
+
+
+TEST(TypeListHelper, CartesianTypeProductSmall)
+{
+    using expected_c2 =
+        testing::Types<std::tuple<double, t_type>, std::tuple<double, int>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<testing_types1, testing_types2>,
+        expected_c2>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<tuple_types1, tuple_types2>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_c2>>();
+}
+
+
+TEST(TypeListHelper, AddToCartesianTypeProductLarge)
+{
+    using list1 =
+        testing::Types<std::tuple<double, int>, std::tuple<double, short>,
+                       std::tuple<float, int>, std::tuple<float, short>>;
+    using list2 = testing::Types<long, char>;
+    using tlist1 = gko::test::change_outer_wrapper_t<std::tuple, list1>;
+    using tlist2 = std::tuple<long, char>;
+    using expected_a1 = testing::Types<
+        std::tuple<double, int, long>, std::tuple<double, int, char>,
+        std::tuple<double, short, long>, std::tuple<double, short, char>,
+        std::tuple<float, int, long>, std::tuple<float, int, char>,
+        std::tuple<float, short, long>, std::tuple<float, short, char>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<list1, list2>,
+        expected_a1>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<tlist1, tlist2>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_a1>>();
+}
+
+
+TEST(TypeListHelper, AddToCartesianTypeProductSmall)
+{
+    using list3 = testing::Types<std::tuple<long>>;
+    using list4 = testing::Types<double>;
+    using tlist3 = std::tuple<std::tuple<long>>;
+    using tlist4 = std::tuple<double>;
+    using expected_a2 = testing::Types<std::tuple<long, double>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<list3, list4>,
+        expected_a2>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<tlist3, tlist4>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_a2>>();
+}
+
+
+TEST(TypeListHelper, AddToCartesianTypeProductLeftLarge)
+{
+    using list1 = testing::Types<long, char>;
+    using list2 =
+        testing::Types<std::tuple<double, int>, std::tuple<double, short>,
+                       std::tuple<float, int>, std::tuple<float, short>>;
+    using tlist1 = std::tuple<long, char>;
+    using tlist2 = gko::test::change_outer_wrapper_t<std::tuple, list2>;
+    using expected_a1 = testing::Types<
+        std::tuple<long, double, int>, std::tuple<char, double, int>,
+        std::tuple<long, double, short>, std::tuple<char, double, short>,
+        std::tuple<long, float, int>, std::tuple<char, float, int>,
+        std::tuple<long, float, short>, std::tuple<char, float, short>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<list1, list2>,
+        expected_a1>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<tlist1, tlist2>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_a1>>();
+}
+
+
+TEST(TypeListHelper, AddToCartesianTypeProductLeftSmall)
+{
+    using list3 = testing::Types<double>;
+    using list4 = testing::Types<std::tuple<long>>;
+    using tlist3 = std::tuple<double>;
+    using tlist4 = std::tuple<std::tuple<long>>;
+    using expected_a2 = testing::Types<std::tuple<double, long>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<list3, list4>,
+        expected_a2>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<tlist3, tlist4>,
+        gko::test::change_outer_wrapper_t<std::tuple, expected_a2>>();
+}
+
+
+}  // namespace
diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp
index edb6b71676a..db7d4c54536 100644
--- a/cuda/test/base/array.cpp
+++ b/cuda/test/base/array.cpp
@@ -32,7 +32,7 @@ class Array : public CudaTestFixture {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ValueAndIndexTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor)
diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp
index be0396383e1..666ab13063c 100644
--- a/reference/test/base/array.cpp
+++ b/reference/test/base/array.cpp
@@ -28,7 +28,7 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ValueAndIndexTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeFilledWithValue)
diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp
index d087c833c96..3c7520c6847 100644
--- a/reference/test/components/fill_array_kernels.cpp
+++ b/reference/test/components/fill_array_kernels.cpp
@@ -40,7 +40,7 @@ class FillArray : public ::testing::Test {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp
index b88ec181261..8286817c853 100644
--- a/reference/test/components/reduce_array_kernels.cpp
+++ b/reference/test/components/reduce_array_kernels.cpp
@@ -31,7 +31,7 @@ class ReduceArray : public ::testing::Test {
     gko::array<value_type> vals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ValueAndIndexTypes,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 4756180f896..3d494b3f5f0 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -36,7 +36,7 @@ class FillArray : public CommonTestFixture {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index 182928412f2..b7407801a32 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -38,7 +38,7 @@ class ReduceArray : public CommonTestFixture {
     gko::array<value_type> dvals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ValueAndIndexTypes,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 

From 007f39420892a2c37385d21eefc2bd6dca8a89bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Tue, 9 Apr 2024 15:07:37 +0200
Subject: [PATCH 200/448] Use new type aliases in relevant tests

Additionally, introduce `IntegerTypes`, which is IndexTypes combined
with gko::size_type (used for the prefix sum tests).
---
 core/test/solver/cb_gmres.cpp                    | 16 +++++-----------
 core/test/utils.hpp                              |  6 +++---
 reference/test/components/prefix_sum_kernels.cpp |  5 +----
 reference/test/solver/cb_gmres_kernels.cpp       | 16 +++++-----------
 test/components/prefix_sum_kernels.cpp           |  5 +----
 test/factorization/cholesky_kernels.cpp          | 10 ++++------
 test/factorization/lu_kernels.cpp                | 10 ++++------
 test/solver/direct.cpp                           | 10 ++++------
 8 files changed, 27 insertions(+), 51 deletions(-)

diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp
index 21600ed2b70..e014e5f8acc 100644
--- a/core/test/solver/cb_gmres.cpp
+++ b/core/test/solver/cb_gmres.cpp
@@ -85,17 +85,11 @@ using st_i = st_helper_type<st_enum::integer>;
 using st_ir1 = st_helper_type<st_enum::ireduce1>;
 using st_ir2 = st_helper_type<st_enum::ireduce2>;
 
-using TestTypes =
-    ::testing::Types<std::tuple<double, st_keep>, std::tuple<double, st_r1>,
-                     std::tuple<double, st_r2>, std::tuple<double, st_i>,
-                     std::tuple<double, st_ir1>, std::tuple<double, st_ir2>,
-                     std::tuple<float, st_keep>, std::tuple<float, st_r1>,
-                     std::tuple<float, st_r2>, std::tuple<float, st_i>,
-                     std::tuple<float, st_ir1>, std::tuple<float, st_ir2>,
-                     std::tuple<std::complex<double>, st_keep>,
-                     std::tuple<std::complex<double>, st_r1>,
-                     std::tuple<std::complex<double>, st_r2>,
-                     std::tuple<std::complex<float>, st_keep>>;
+using TestTypes = gko::test::merge_type_list_t<
+    gko::test::cartesian_type_product_t<
+        gko::test::ValueTypes, ::testing::Types<st_keep, st_r1, st_r2>>,
+    gko::test::cartesian_type_product_t<
+        gko::test::RealValueTypes, ::testing::Types<st_i, st_ir1, st_ir2>>>;
 
 TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator);
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index d711e6310e3..58081f3fa16 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -173,13 +173,13 @@ using ValueTypes = merge_type_list_t<RealValueTypes, ComplexValueTypes>;
 
 using IndexTypes = ::testing::Types<int32, int64>;
 
+using IntegerTypes = merge_type_list_t<IndexTypes, ::testing::Types<size_type>>;
+
 using LocalGlobalIndexTypes =
     ::testing::Types<std::tuple<int32, int32>, std::tuple<int32, int64>,
                      std::tuple<int64, int64>>;
 
-using PODTypes =
-    merge_type_list_t<merge_type_list_t<RealValueTypes, IndexTypes>,
-                      ::testing::Types<size_type>>;
+using PODTypes = merge_type_list_t<RealValueTypes, IntegerTypes>;
 
 using ComplexAndPODTypes = merge_type_list_t<ComplexValueTypes, PODTypes>;
 
diff --git a/reference/test/components/prefix_sum_kernels.cpp b/reference/test/components/prefix_sum_kernels.cpp
index 00265442cce..8d0993cf547 100644
--- a/reference/test/components/prefix_sum_kernels.cpp
+++ b/reference/test/components/prefix_sum_kernels.cpp
@@ -35,10 +35,7 @@ class PrefixSum : public ::testing::Test {
     std::vector<index_type> expected;
 };
 
-using PrefixSumIndexTypes =
-    ::testing::Types<gko::int32, gko::int64, gko::size_type>;
-
-TYPED_TEST_SUITE(PrefixSum, PrefixSumIndexTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(PrefixSum, gko::test::IntegerTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(PrefixSum, Works)
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index eeeca82494c..a027c02705b 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -134,17 +134,11 @@ using st_i = st_helper_type<st_enum::integer>;
 using st_ir1 = st_helper_type<st_enum::ireduce1>;
 using st_ir2 = st_helper_type<st_enum::ireduce2>;
 
-using TestTypes =
-    ::testing::Types<std::tuple<double, st_keep>, std::tuple<double, st_r1>,
-                     std::tuple<double, st_r2>, std::tuple<double, st_i>,
-                     std::tuple<double, st_ir1>, std::tuple<double, st_ir2>,
-                     std::tuple<float, st_keep>, std::tuple<float, st_r1>,
-                     std::tuple<float, st_r2>, std::tuple<float, st_i>,
-                     std::tuple<float, st_ir1>, std::tuple<float, st_ir2>,
-                     std::tuple<std::complex<double>, st_keep>,
-                     std::tuple<std::complex<double>, st_r1>,
-                     std::tuple<std::complex<double>, st_r2>,
-                     std::tuple<std::complex<float>, st_keep>>;
+using TestTypes = gko::test::merge_type_list_t<
+    gko::test::cartesian_type_product_t<
+        gko::test::ValueTypes, ::testing::Types<st_keep, st_r1, st_r2>>,
+    gko::test::cartesian_type_product_t<
+        gko::test::RealValueTypes, ::testing::Types<st_i, st_ir1, st_ir2>>>;
 
 TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator);
 
diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp
index 4a1c950855a..deb8a8aa280 100644
--- a/test/components/prefix_sum_kernels.cpp
+++ b/test/components/prefix_sum_kernels.cpp
@@ -39,10 +39,7 @@ class PrefixSum : public CommonTestFixture {
     gko::array<index_type> dvals;
 };
 
-using PrefixSumIndexTypes =
-    ::testing::Types<gko::int32, gko::int64, gko::size_type>;
-
-TYPED_TEST_SUITE(PrefixSum, PrefixSumIndexTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(PrefixSum, gko::test::IntegerTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(PrefixSum, EqualsReference)
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index 94d31fe33db..aff2abed6c5 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -115,14 +115,12 @@ using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA doesn't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>,
-                               std::tuple<std::complex<float>, gko::int32>,
-                               std::tuple<std::complex<double>, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #endif
 
 TYPED_TEST_SUITE(CholeskySymbolic, Types, PairTypenameNameGenerator);
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 830ba6ddd5f..d38b6346cd8 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -131,14 +131,12 @@ using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>,
-                               std::tuple<std::complex<float>, gko::int32>,
-                               std::tuple<std::complex<double>, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #endif
 
 TYPED_TEST_SUITE(Lu, Types, PairTypenameNameGenerator);
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index a58d3d46f3f..da77682bcdd 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -106,14 +106,12 @@ using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>,
-                               std::tuple<std::complex<float>, gko::int32>,
-                               std::tuple<std::complex<double>, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = ::testing::Types<std::tuple<float, gko::int32>,
-                               std::tuple<double, gko::int32>>;
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+                                                  ::testing::Types<gko::int32>>;
 #endif
 
 TYPED_TEST_SUITE(Direct, Types, PairTypenameNameGenerator);

From 381a3690129758c7afad4bd0fd601a226fd07aa6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Thu, 11 Apr 2024 19:01:49 +0200
Subject: [PATCH 201/448] Add documentation for the new type alias

---
 core/test/utils.hpp | 220 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 216 insertions(+), 4 deletions(-)

diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index 58081f3fa16..c01f1df23c1 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -32,6 +32,40 @@ namespace test {
 namespace detail {
 
 
+/**
+ * This structure creates a cartesian product of the types in the left list with
+ * the types of the right list and stores the combination in a std::tuple. The
+ * resulting type is a list (it will be the same type wrapper as the left and
+ * right list) of `std::tuple`.
+ * The wrapper / list type needs to be a structure that can take an arbitrary
+ * amount of types. An example for this wrapper is std::tuple, but a simple
+ * `template<typename... Args> struct wrapper {};` also works.
+ * Both the left and right list need to use the same wrapper, otherwise, the
+ * type specialization fails.
+ *
+ * This structure uses partial specialization to:
+ * - remove the OuterWrapper of both the left and right list;
+ * - extracts a single element of the left list and combines it with all
+ *   elements of the right list;
+ * - after no elements remain in the left list, put all generated combinations
+ *   together and wrap them in an OuterWrapper again
+ *
+ * This structure uses inheritance to store the combinations it creates in the
+ * parameter pack Result, which will be wrapped in the original OuterWrapper
+ * after all parameters from the LeftList have been processed (in the last
+ * specialization).
+ *
+ * Example:
+ * ```
+ * // Here, we use std::tuple as the outer type wrapper.
+ * using left_list = std::tuple<a1, a2, a3>;
+ * using right_list = std::tuple<b1, b2>;
+ * using result = typename cartesian_type_product<left_list, right_list>::type;
+ * // result = std::tuple<std::tuple<a1, b1>, std::tuple<a1, b2>,
+ * //                     std::tuple<a2, b1>, std::tuple<a2, b2>,
+ * //                     std::tuple<a3, b1>, std::tuple<a3, b2>>;
+ * ```
+ */
 template <typename LeftList, typename RightList, typename... Result>
 struct cartesian_type_product {};
 
@@ -51,6 +85,28 @@ struct cartesian_type_product<OuterWrapper<>, OuterWrapper<RightArgs...>,
     using type = OuterWrapper<Result...>;
 };
 
+
+/**
+ * This structure expects the left list to have all elements of the type
+ * std::tuple and the right list of elements you want to add to those tuples. It
+ * creates a new list where it adds all combinations of the std::tuple with the
+ * new element list as a new member of the std::tuple to the right side.
+ *
+ * It can be used to create a cartesian product with more than two lists by
+ * using the cartesian_type_product initially for the left argument, followed by
+ * this structure for each additional list.
+ * Example:
+ * ```
+ * template<typename... Args>
+ * using t = std::tuple<Args>;  // use this alias to increase readability
+ * using left_combinations = t<t<a1, b1>, t<a1, b2>>;
+ * using right_new = t<n1, n2>;
+ * using new_list =
+ *     typename add_to_cartesian_type_product<left_combinations,
+ *                                            right_new>::type;
+ * // new_list = t<t<a1, b1, n1>, t<a1, b1, n2>, t<a1, b2, n1>, t<a1, b2, n2>>;
+ * ```
+ */
 template <typename ExistingCombinationList, typename NewElementList,
           typename... Result>
 struct add_to_cartesian_type_product {};
@@ -75,6 +131,22 @@ struct add_to_cartesian_type_product<
     using type = OuterWrapper<Result...>;
 };
 
+
+/**
+ * Does the same as add_to_cartesian_type_product with the only difference that
+ * the new element will be added to the left side. The template parameter order
+ * is also flipped, so the new list is now on the left side.
+ * Example:
+ * ```
+ * template<typename... Args> using t = std::tuple<Args>;
+ * using right_combinations = t<t<a1, b1>, t<a1, b2>>;
+ * using left_new = t<n1, n2>;
+ * using new_list =
+ *     typename add_to_cartesian_type_product_left<left_new,
+ *                                                 right_combinations>::type;
+ * // new_list = t<t<n1, a1, b1>, t<n2, a1, b1>, t<n1, a1, b2>, t<n2, a1, b2>>;
+ * ```
+ */
 template <typename NewElementList, typename ExistingCombinationList,
           typename... Result>
 struct add_to_cartesian_type_product_left {};
@@ -99,6 +171,14 @@ struct add_to_cartesian_type_product_left<OuterWrapper<NewElementArgs...>,
     using type = OuterWrapper<Result...>;
 };
 
+
+/**
+ * Merges two lists into a single list.
+ * The left and right list need to use the same type wrapper, which will also be
+ * the resulting wrapper containing elements of both lists. The order of the
+ * left and right list are preserved. The resulting list will have all elements
+ * of the left list, followed by all elements of the right list.
+ */
 template <typename FirstList, typename SecondList>
 struct merge_type_list {};
 
@@ -109,6 +189,17 @@ struct merge_type_list<OuterWrapper<Args1...>, OuterWrapper<Args2...>> {
 };
 
 
+/**
+ * This structure can change the outer type wrapper to the new, given one.
+ * Example:
+ * ```
+ * template <typename... Args>
+ * struct type_wrapper {};
+ * using old_list = std::tuple<int, double, short>;
+ * using new_list = typename change_outer_wrapper<type_wrapper, old_list>::type;
+ * // new_list = type_wrapper<int, double, short>;
+ * ```
+ */
 template <template <typename...> class NewOuterWrapper,
           typename OldOuterWrapper>
 struct change_outer_wrapper {};
@@ -120,6 +211,15 @@ struct change_outer_wrapper<NewOuterWrapper, OldOuterWrapper<Args...>> {
 };
 
 
+/**
+ * Creates a type list (the outer wrapper stays the same) where each original
+ * type is wrapped into the given NewInnerWrapper. Example:
+ * ```
+ * using new_type =
+ *     typename add_internal_wrapper<std::complex,
+ *                                   std::tuple<float, double>>::type;
+ * // new_type = std::tuple<std::complex<float>, std::complex<double>>;
+ */
 template <template <typename...> class NewInnerWrapper, typename ListType>
 struct add_internal_wrapper {};
 
@@ -133,32 +233,144 @@ struct add_internal_wrapper<NewInnerWrapper, OuterWrapper<Args...>> {
 }  // namespace detail
 
 
+/**
+ * This type alias creates a cartesian product of the types in the left list
+ * with the types of the right list and stores the combination in a std::tuple.
+ * The resulting type is a list (it will be the same type wrapper as the left
+ * and right list) of `std::tuple`.
+ * Example:
+ * ```
+ * // Here, we use std::tuple as the outer type wrapper.
+ * using left_list = std::tuple<a1, a2, a3>;
+ * using right_list = std::tuple<b1, b2>;
+ * using result = cartesian_type_product_t<left_list, right_list>;
+ * // result = std::tuple<std::tuple<a1, b1>, std::tuple<a1, b2>,
+ * //                     std::tuple<a2, b1>, std::tuple<a2, b2>,
+ * //                     std::tuple<a3, b1>, std::tuple<a3, b2>>;
+ * ```
+ *
+ * @tparam LeftList  A wrapper type (like std::tuple) containing the list of
+ *                   types that you want to create the cartesian product with.
+ *                   The parameters of this list will be the left type in the
+ *                   resulting `std::tuple`
+ * @tparam RightList  Similar to the LeftList. Must use the same outer wrapper
+ *                    as the LeftList.
+ */
 template <typename LeftList, typename RightList>
 using cartesian_type_product_t =
     typename detail::cartesian_type_product<LeftList, RightList>::type;
 
+/**
+ * This type alias is intended to be used with cartesian_type_product_t in order
+ * to create a more than two dimensional cartesian product by adding one element
+ * to the result per call.
+ * This structure expects the left list to have all elements of the type
+ * std::tuple (as it is returned from cartesian_type_product_t) and the right
+ * list of elements you want to add to those tuples.
+ * It creates a new list that adds all combinations of the std::tuple with the
+ * new element list as a new member of the std::tuple to the right side.
+ * Example:
+ * ```
+ * template<typename... Args>
+ * using t = std::tuple<Args...>;  // use this alias to increase readability
+ * using left_combinations = t<t<a1, b1>, t<a1, b2>>;
+ * using right_new = t<n1, n2>;
+ * using new_list =
+ *     add_to_cartesian_type_product_t<left_combinations, right_new>;
+ * // new_list = t<t<a1, b1, n1>, t<a1, b1, n2>, t<a1, b2, n1>, t<a1, b2, n2>>;
+ * ```
+ *
+ * @tparam ExistingCombinationList  An outer type wrapper containing different
+ *                                  std::tuples that you want to add elements to
+ * @tparam NewElementList  The list of new elements (using the same outer
+ *                         wrapper as ExistingCombinationList) you want to
+ *                         create all possible combinations with. These elements
+ *                         will be added to the right of each std::tuple
+ */
 template <typename ExistingCombinationList, typename NewElementList>
 using add_to_cartesian_type_product_t =
     typename detail::add_to_cartesian_type_product<ExistingCombinationList,
                                                    NewElementList>::type;
 
+/**
+ * This type alias is very similar to add_to_cartesian_type_product_t. It only
+ * differs in where the new element is added to the `std::tuple`, which is to
+ * the left here, and the order of the parameters.
+ * Example:
+ * ```
+ * template<typename... Args> using t = std::tuple<Args...>;
+ * using right_combinations = t<t<a1, b1>, t<a1, b2>>;
+ * using left_new = t<n1, n2>;
+ * using new_list =
+ *     add_to_cartesian_type_product_left_t<left_new, right_combinations>;
+ * // new_list = t<t<n1, a1, b1>, t<n2, a1, b1>, t<n1, a1, b2>, t<n2, a1, b2>>;
+ * ```
+ *
+ * @tparam NewElementList  The list of new elements (using the same outer
+ *                         wrapper as ExistingCombinationList) you want to
+ *                         create all possible combinations with. These elements
+ *                         will be added to the left of each std::tuple
+ * @tparam ExistingCombinationList  An outer type wrapper containing different
+ *                                  std::tuples that you want to add elements to
+ */
 template <typename NewElementList, typename ExistingCombinationList>
 using add_to_cartesian_type_product_left_t =
     typename detail::add_to_cartesian_type_product_left<
         NewElementList, ExistingCombinationList>::type;
 
+/**
+ * Merges two lists into a single list.
+ * The left and right list need to use the same type wrapper, which will also be
+ * the resulting wrapper containing elements of both lists. The order of the
+ * left and right list are preserved. The resulting list will have all elements
+ * of the left list, followed by all elements of the right list.
+ *
+ * @tparam FirstList  The first list of types
+ * @tparam SecondList  The second list of types. The type wrapper needs to be
+ *                     the same as for FirstList.
+ */
 template <typename FirstList, typename SecondList>
 using merge_type_list_t =
     typename detail::merge_type_list<FirstList, SecondList>::type;
 
-template <template <typename...> class NewInnerWrapper, typename ListType>
-using add_internal_wrapper_t =
-    typename detail::add_internal_wrapper<NewInnerWrapper, ListType>::type;
-
+/**
+ * This type alias can change the outer type wrapper to the new, given one.
+ * Example:
+ * ```
+ * template <typename... Args>
+ * struct type_wrapper {};
+ * using old_list = std::tuple<int, double, short>;
+ * using new_list = change_outer_wrapper_t<type_wrapper, old_list>;
+ * // new_list = type_wrapper<int, double, short>;
+ * ```
+ *
+ * @tparam NewOuterWrapper  the new wrapper you want to use as the new outer
+ *                          wrapper
+ * @tparam ListType  The list of types where you want to replace the outer
+ *                   wrapper.
+ */
 template <template <typename...> class NewOuterWrapper, typename ListType>
 using change_outer_wrapper_t =
     typename detail::change_outer_wrapper<NewOuterWrapper, ListType>::type;
 
+/**
+ * Creates a type list (the outer wrapper stays the same) where each original
+ * type is wrapped into the given NewInnerWrapper.
+ * Example:
+ * ```
+ * using new_type =
+ *     add_internal_wrapper_t<std::complex, std::tuple<float, double>>;
+ * // new_type = std::tuple<std::complex<float>, std::complex<double>>;
+ * ```
+ *
+ * @tparam NewInnerWrapper  the new wrapper you want to use to wrap each type
+ *                          in the list
+ * @tparam ListType  The list of types where you want to add a wrapper to each
+ */
+template <template <typename...> class NewInnerWrapper, typename ListType>
+using add_internal_wrapper_t =
+    typename detail::add_internal_wrapper<NewInnerWrapper, ListType>::type;
+
 
 using RealValueTypes =
 #if GINKGO_DPCPP_SINGLE_MODE

From 89d2d004937d2015a3523674da4d55a1af6fda4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Fri, 19 Apr 2024 16:25:38 +0200
Subject: [PATCH 202/448] Update documentation for type list helper

Additionally, add tests for empty inputs.
---
 core/test/utils.hpp            |  95 ++++++--------------------
 core/test/utils/utils_test.cpp | 120 ++++++++++++++++++++++++++++-----
 2 files changed, 123 insertions(+), 92 deletions(-)

diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index c01f1df23c1..6b9b4a8537c 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -33,10 +33,8 @@ namespace detail {
 
 
 /**
- * This structure creates a cartesian product of the types in the left list with
- * the types of the right list and stores the combination in a std::tuple. The
- * resulting type is a list (it will be the same type wrapper as the left and
- * right list) of `std::tuple`.
+ * @see cartesian_type_product_t for details.
+ *
  * The wrapper / list type needs to be a structure that can take an arbitrary
  * amount of types. An example for this wrapper is std::tuple, but a simple
  * `template<typename... Args> struct wrapper {};` also works.
@@ -54,17 +52,6 @@ namespace detail {
  * parameter pack Result, which will be wrapped in the original OuterWrapper
  * after all parameters from the LeftList have been processed (in the last
  * specialization).
- *
- * Example:
- * ```
- * // Here, we use std::tuple as the outer type wrapper.
- * using left_list = std::tuple<a1, a2, a3>;
- * using right_list = std::tuple<b1, b2>;
- * using result = typename cartesian_type_product<left_list, right_list>::type;
- * // result = std::tuple<std::tuple<a1, b1>, std::tuple<a1, b2>,
- * //                     std::tuple<a2, b1>, std::tuple<a2, b2>,
- * //                     std::tuple<a3, b1>, std::tuple<a3, b2>>;
- * ```
  */
 template <typename LeftList, typename RightList, typename... Result>
 struct cartesian_type_product {};
@@ -87,25 +74,11 @@ struct cartesian_type_product<OuterWrapper<>, OuterWrapper<RightArgs...>,
 
 
 /**
- * This structure expects the left list to have all elements of the type
- * std::tuple and the right list of elements you want to add to those tuples. It
- * creates a new list where it adds all combinations of the std::tuple with the
- * new element list as a new member of the std::tuple to the right side.
+ * @see add_to_cartesian_type_product_t for details.
  *
- * It can be used to create a cartesian product with more than two lists by
- * using the cartesian_type_product initially for the left argument, followed by
- * this structure for each additional list.
- * Example:
- * ```
- * template<typename... Args>
- * using t = std::tuple<Args>;  // use this alias to increase readability
- * using left_combinations = t<t<a1, b1>, t<a1, b2>>;
- * using right_new = t<n1, n2>;
- * using new_list =
- *     typename add_to_cartesian_type_product<left_combinations,
- *                                            right_new>::type;
- * // new_list = t<t<a1, b1, n1>, t<a1, b1, n2>, t<a1, b2, n1>, t<a1, b2, n2>>;
- * ```
+ * Uses a similar technique to cartesian_type_product.
+ * It also uses the parameter pack Result to store the interim results, which
+ * will be put in the OuterWrapper after all inputs have been processed.
  */
 template <typename ExistingCombinationList, typename NewElementList,
           typename... Result>
@@ -133,19 +106,7 @@ struct add_to_cartesian_type_product<
 
 
 /**
- * Does the same as add_to_cartesian_type_product with the only difference that
- * the new element will be added to the left side. The template parameter order
- * is also flipped, so the new list is now on the left side.
- * Example:
- * ```
- * template<typename... Args> using t = std::tuple<Args>;
- * using right_combinations = t<t<a1, b1>, t<a1, b2>>;
- * using left_new = t<n1, n2>;
- * using new_list =
- *     typename add_to_cartesian_type_product_left<left_new,
- *                                                 right_combinations>::type;
- * // new_list = t<t<n1, a1, b1>, t<n2, a1, b1>, t<n1, a1, b2>, t<n2, a1, b2>>;
- * ```
+ * @see add_to_cartesian_type_product_left_t for details.
  */
 template <typename NewElementList, typename ExistingCombinationList,
           typename... Result>
@@ -173,11 +134,7 @@ struct add_to_cartesian_type_product_left<OuterWrapper<NewElementArgs...>,
 
 
 /**
- * Merges two lists into a single list.
- * The left and right list need to use the same type wrapper, which will also be
- * the resulting wrapper containing elements of both lists. The order of the
- * left and right list are preserved. The resulting list will have all elements
- * of the left list, followed by all elements of the right list.
+ * @see merge_type_lists_t for details.
  */
 template <typename FirstList, typename SecondList>
 struct merge_type_list {};
@@ -190,15 +147,7 @@ struct merge_type_list<OuterWrapper<Args1...>, OuterWrapper<Args2...>> {
 
 
 /**
- * This structure can change the outer type wrapper to the new, given one.
- * Example:
- * ```
- * template <typename... Args>
- * struct type_wrapper {};
- * using old_list = std::tuple<int, double, short>;
- * using new_list = typename change_outer_wrapper<type_wrapper, old_list>::type;
- * // new_list = type_wrapper<int, double, short>;
- * ```
+ * @see change_outer_wrapper_t for details.
  */
 template <template <typename...> class NewOuterWrapper,
           typename OldOuterWrapper>
@@ -212,20 +161,14 @@ struct change_outer_wrapper<NewOuterWrapper, OldOuterWrapper<Args...>> {
 
 
 /**
- * Creates a type list (the outer wrapper stays the same) where each original
- * type is wrapped into the given NewInnerWrapper. Example:
- * ```
- * using new_type =
- *     typename add_internal_wrapper<std::complex,
- *                                   std::tuple<float, double>>::type;
- * // new_type = std::tuple<std::complex<float>, std::complex<double>>;
+ * @see add_inner_wrapper_t for details.
  */
 template <template <typename...> class NewInnerWrapper, typename ListType>
-struct add_internal_wrapper {};
+struct add_inner_wrapper {};
 
 template <template <typename...> class NewInnerWrapper,
           template <typename...> class OuterWrapper, typename... Args>
-struct add_internal_wrapper<NewInnerWrapper, OuterWrapper<Args...>> {
+struct add_inner_wrapper<NewInnerWrapper, OuterWrapper<Args...>> {
     using type = OuterWrapper<NewInnerWrapper<Args>...>;
 };
 
@@ -251,7 +194,7 @@ struct add_internal_wrapper<NewInnerWrapper, OuterWrapper<Args...>> {
  *
  * @tparam LeftList  A wrapper type (like std::tuple) containing the list of
  *                   types that you want to create the cartesian product with.
- *                   The paremeters of this list will be the left type in the
+ *                   The parameters of this list will be the left type in the
  *                   resulting `std::tuple`
  * @tparam RightList  Similar to the LeftList. Must use the same outer wrapper
  *                    as the LeftList.
@@ -267,8 +210,8 @@ using cartesian_type_product_t =
  * This structure expects the left list to have all elements of the type
  * std::tuple (as it is returned from cartesian_type_product_t) and the right
  * list of elements you want to add to those tuples.
- * creates a new list where it adds all combinations of the std::tuple with the
- * new element list as a new member of the std::tuple to the right side.
+ * It creates a new list where it adds all combinations of the std::tuple with
+ * the new element list as a new member of the std::tuple to the right side.
  * Example:
  * ```
  * template<typename... Args>
@@ -359,7 +302,7 @@ using change_outer_wrapper_t =
  * Example:
  * ```
  * using new_type =
- *     add_internal_wrapper<std::complex, std::tuple<float, double>>;
+ *     add_inner_wrapper_t<std::complex, std::tuple<float, double>>;
  * // new_type = std::tuple<std::complex<float>, std::complex<double>>;
  * ```
  *
@@ -368,8 +311,8 @@ using change_outer_wrapper_t =
  * @tparam ListType  The list of types where you want to add a wrapper to each
  */
 template <template <typename...> class NewInnerWrapper, typename ListType>
-using add_internal_wrapper_t =
-    typename detail::add_internal_wrapper<NewInnerWrapper, ListType>::type;
+using add_inner_wrapper_t =
+    typename detail::add_inner_wrapper<NewInnerWrapper, ListType>::type;
 
 
 using RealValueTypes =
@@ -379,7 +322,7 @@ using RealValueTypes =
     ::testing::Types<float, double>;
 #endif
 
-using ComplexValueTypes = add_internal_wrapper_t<std::complex, RealValueTypes>;
+using ComplexValueTypes = add_inner_wrapper_t<std::complex, RealValueTypes>;
 
 using ValueTypes = merge_type_list_t<RealValueTypes, ComplexValueTypes>;
 
diff --git a/core/test/utils/utils_test.cpp b/core/test/utils/utils_test.cpp
index 9184e56b4e2..b7e00642c5c 100644
--- a/core/test/utils/utils_test.cpp
+++ b/core/test/utils/utils_test.cpp
@@ -22,16 +22,18 @@ using t_type = std::tuple<int>;
 using testing_types1 = testing::Types<double>;
 using testing_types2 = testing::Types<t_type, int>;
 using testing_types3 = testing::Types<i_type, short, float>;
+using testing_empty = testing::Types<>;
 
 using tuple_types1 = std::tuple<double>;
 using tuple_types2 = std::tuple<t_type, int>;
 using tuple_types3 = std::tuple<i_type, short, float>;
+using tuple_empty = std::tuple<>;
 
 template <typename... Args>
 struct type_list {};
 
 
-TEST(TypeListHelper, ChangeOuterWrapper1)
+TEST(TypeListHelper, ChangeOuterWrapperPredefined)
 {
     testing::StaticAssertTypeEq<
         gko::test::change_outer_wrapper_t<std::tuple, testing_types1>,
@@ -54,14 +56,28 @@ TEST(TypeListHelper, ChangeOuterWrapper1)
 }
 
 
-TEST(TypeListHelper, ChangeOuterWrapper2)
+TEST(TypeListHelper, ChangeOuterWrapperCustomType)
 {
-    using alternative_list1 = type_list<i_type, t_type, double>;
-    using expected_ow2 = testing::Types<i_type, t_type, double>;
+    using type_list1 = type_list<i_type, t_type, double>;
+    using testing_list1 = testing::Types<i_type, t_type, double>;
 
     testing::StaticAssertTypeEq<
-        gko::test::change_outer_wrapper_t<testing::Types, alternative_list1>,
-        expected_ow2>();
+        gko::test::change_outer_wrapper_t<testing::Types, type_list1>,
+        testing_list1>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<type_list, testing_list1>,
+        type_list1>();
+}
+
+
+TEST(TypeListHelper, ChangeOuterWrapperEmpty)
+{
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<testing::Types, tuple_empty>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::change_outer_wrapper_t<std::tuple, testing_empty>,
+        tuple_empty>();
 }
 
 
@@ -69,11 +85,12 @@ TEST(TypeListHelper, AddInternalWrapperTuple)
 {
     using expected_iw1 = testing::Types<std::tuple<i_type>, std::tuple<short>,
                                         std::tuple<float>>;
+
     testing::StaticAssertTypeEq<
-        gko::test::add_internal_wrapper_t<std::tuple, testing_types3>,
+        gko::test::add_inner_wrapper_t<std::tuple, testing_types3>,
         expected_iw1>();
     testing::StaticAssertTypeEq<
-        gko::test::add_internal_wrapper_t<std::tuple, tuple_types3>,
+        gko::test::add_inner_wrapper_t<std::tuple, tuple_types3>,
         gko::test::change_outer_wrapper_t<std::tuple, expected_iw1>>();
 }
 
@@ -83,14 +100,24 @@ TEST(TypeListHelper, AddInternalWrapperComplex)
     using expected_iw2 = testing::Types<std::complex<double>>;
 
     testing::StaticAssertTypeEq<
-        gko::test::add_internal_wrapper_t<std::complex, testing_types1>,
+        gko::test::add_inner_wrapper_t<std::complex, testing_types1>,
         expected_iw2>();
     testing::StaticAssertTypeEq<
-        gko::test::add_internal_wrapper_t<std::complex, tuple_types1>,
+        gko::test::add_inner_wrapper_t<std::complex, tuple_types1>,
         gko::test::change_outer_wrapper_t<std::tuple, expected_iw2>>();
 }
 
 
+TEST(TypeListHelper, AddInternalWrapperEmpty)
+{
+    testing::StaticAssertTypeEq<
+        gko::test::add_inner_wrapper_t<std::tuple, testing_empty>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_inner_wrapper_t<std::tuple, tuple_empty>, tuple_empty>();
+}
+
+
 TEST(TypeListHelper, MergeTypeListLarge)
 {
     using expected_m1 = testing::Types<i_type, short, float, t_type, int>;
@@ -106,20 +133,18 @@ TEST(TypeListHelper, MergeTypeListLarge)
 
 TEST(TypeListHelper, MergeTypeListEmpty)
 {
-    using expected_m2 = testing::Types<double>;
-
     testing::StaticAssertTypeEq<
         gko::test::merge_type_list_t<testing_types1, testing::Types<>>,
-        expected_m2>();
+        testing_types1>();
     testing::StaticAssertTypeEq<
         gko::test::merge_type_list_t<tuple_types1, std::tuple<>>,
-        gko::test::change_outer_wrapper_t<std::tuple, expected_m2>>();
+        tuple_types1>();
     testing::StaticAssertTypeEq<
         gko::test::merge_type_list_t<testing::Types<>, testing_types1>,
-        expected_m2>();
+        testing_types1>();
     testing::StaticAssertTypeEq<
         gko::test::merge_type_list_t<std::tuple<>, tuple_types1>,
-        gko::test::change_outer_wrapper_t<std::tuple, expected_m2>>();
+        tuple_types1>();
 }
 
 
@@ -153,6 +178,29 @@ TEST(TypeListHelper, CartesianTypeProductSmall)
 }
 
 
+TEST(TypeListHelper, CartesianTypeProductEmpty)
+{
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<testing_empty, testing_types2>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<testing_types1, testing_empty>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<testing_empty, testing_empty>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<tuple_empty, tuple_types2>,
+        tuple_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<tuple_types1, tuple_empty>,
+        tuple_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::cartesian_type_product_t<tuple_empty, tuple_empty>,
+        tuple_empty>();
+}
+
+
 TEST(TypeListHelper, AddToCartesianTypeProductLarge)
 {
     using list1 =
@@ -193,6 +241,26 @@ TEST(TypeListHelper, AddToCartesianTypeProductSmall)
 }
 
 
+TEST(TypeListHelper, AddToCartesianTypeProductEmpty)
+{
+    using list3 = testing::Types<std::tuple<long>>;
+    using tlist3 = std::tuple<std::tuple<long>>;
+
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<list3, testing_empty>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<gko::test::add_to_cartesian_type_product_t<
+                                    testing_empty, testing_types1>,
+                                testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<tlist3, tuple_empty>,
+        tuple_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_t<tuple_empty, tuple_types1>,
+        tuple_empty>();
+}
+
+
 TEST(TypeListHelper, AddToCartesianTypeProductLeftLarge)
 {
     using list1 = testing::Types<long, char>;
@@ -233,4 +301,24 @@ TEST(TypeListHelper, AddToCartesianTypeProductLeftSmall)
 }
 
 
+TEST(TypeListHelper, AddToCartesianTypeProductLeftEmpty)
+{
+    using list3 = testing::Types<std::tuple<long>>;
+    using tlist3 = std::tuple<std::tuple<long>>;
+
+    testing::StaticAssertTypeEq<gko::test::add_to_cartesian_type_product_left_t<
+                                    testing_types1, testing_empty>,
+                                testing_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<testing_empty, list3>,
+        testing_empty>();
+    testing::StaticAssertTypeEq<gko::test::add_to_cartesian_type_product_left_t<
+                                    tuple_types1, tuple_empty>,
+                                tuple_empty>();
+    testing::StaticAssertTypeEq<
+        gko::test::add_to_cartesian_type_product_left_t<tuple_empty, tlist3>,
+        tuple_empty>();
+}
+
+
 }  // namespace

From e0b4be48fe3a99017f884c256208ab5df318b688 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Thu, 11 Jul 2024 18:47:51 +0200
Subject: [PATCH 203/448] Remove unsigned type from MPI bindings tests

---
 core/test/mpi/base/bindings.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp
index d3ecf359908..ddcbb1777df 100644
--- a/core/test/mpi/base/bindings.cpp
+++ b/core/test/mpi/base/bindings.cpp
@@ -24,7 +24,10 @@ class MpiBindings : public ::testing::Test {
     std::shared_ptr<gko::Executor> ref;
 };
 
-TYPED_TEST_SUITE(MpiBindings, gko::test::PODTypes, TypenameNameGenerator);
+using TestTypes = gko::test::merge_type_list_t<gko::test::RealValueTypes,
+                                               gko::test::IndexTypes>;
+
+TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MpiBindings, CanSetADefaultwindow)

From 6dbac08a0925d067b8363610dcd639c8875d28e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Fri, 18 Oct 2024 12:00:22 +0200
Subject: [PATCH 204/448] Properly format files

---
 core/test/utils/utils_test.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/core/test/utils/utils_test.cpp b/core/test/utils/utils_test.cpp
index b7e00642c5c..810a10ee0c5 100644
--- a/core/test/utils/utils_test.cpp
+++ b/core/test/utils/utils_test.cpp
@@ -2,17 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/test/utils.hpp"
+
 #include <complex>
 #include <tuple>
 #include <type_traits>
 
-
 #include <gtest/gtest.h>
 
 
-#include "core/test/utils.hpp"
-
-
 namespace {
 
 

From c05984ba5f77a673954d4dbb525905351e2d0351 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@tum.de>
Date: Fri, 18 Oct 2024 18:06:25 +0200
Subject: [PATCH 205/448] Review updates

Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 core/test/utils.hpp            | 5 +++++
 core/test/utils/utils_test.cpp | 6 ------
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index 6b9b4a8537c..cacc7191bbf 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -203,6 +203,7 @@ template <typename LeftList, typename RightList>
 using cartesian_type_product_t =
     typename detail::cartesian_type_product<LeftList, RightList>::type;
 
+
 /**
  * This type alias is intended to be used with cartesian_type_product_t in order
  * to create a more than two dimensional cartesian product by adding one element
@@ -235,6 +236,7 @@ using add_to_cartesian_type_product_t =
     typename detail::add_to_cartesian_type_product<ExistingCombinationList,
                                                    NewElementList>::type;
 
+
 /**
  * This type alias is very similar to add_to_cartesian_type_product_t. It only
  * differs in where the new element is added to the `std::tuple`, which is to
@@ -261,6 +263,7 @@ using add_to_cartesian_type_product_left_t =
     typename detail::add_to_cartesian_type_product_left<
         NewElementList, ExistingCombinationList>::type;
 
+
 /**
  * Merges two lists into a single list.
  * The left and right list need to use the same type wrapper, which will also be
@@ -276,6 +279,7 @@ template <typename FirstList, typename SecondList>
 using merge_type_list_t =
     typename detail::merge_type_list<FirstList, SecondList>::type;
 
+
 /**
  * This type alias can change the outer type wrapper to the new, given one.
  * Example:
@@ -296,6 +300,7 @@ template <template <typename...> class NewOuterWrapper, typename ListType>
 using change_outer_wrapper_t =
     typename detail::change_outer_wrapper<NewOuterWrapper, ListType>::type;
 
+
 /**
  * Creates a type list (the outer wrapper stays the same) where each original
  * type is wrapped into the given NewInnerWrapper.
diff --git a/core/test/utils/utils_test.cpp b/core/test/utils/utils_test.cpp
index 810a10ee0c5..f84bc16d549 100644
--- a/core/test/utils/utils_test.cpp
+++ b/core/test/utils/utils_test.cpp
@@ -11,9 +11,6 @@
 #include <gtest/gtest.h>
 
 
-namespace {
-
-
 using i_type = std::integral_constant<int, 42>;
 using t_type = std::tuple<int>;
 
@@ -317,6 +314,3 @@ TEST(TypeListHelper, AddToCartesianTypeProductLeftEmpty)
         gko::test::add_to_cartesian_type_product_left_t<tuple_empty, tlist3>,
         tuple_empty>();
 }
-
-
-}  // namespace

From 004eda038fb3fb7d623f626ad11bc3e34beb3d78 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 21 Oct 2024 11:08:44 +0200
Subject: [PATCH 206/448] squeeze memory dispatch

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 .../cuda_hip/components/memory.nvidia.hpp.inc | 766 ++++++++----------
 dev_tools/scripts/generate_cuda_memory_ptx.py |  54 +-
 2 files changed, 380 insertions(+), 440 deletions(-)

diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc
index 49c9ae7601c..a695904e82a 100644
--- a/common/cuda_hip/components/memory.nvidia.hpp.inc
+++ b/common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -68,17 +68,15 @@ __device__ __forceinline__ void membar_acq_rel_local()
 __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -86,34 +84,30 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.volatile.shared.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.s32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -121,34 +115,30 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.volatile.shared.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.s64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed_shared(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -156,36 +146,30 @@ __device__ __forceinline__ float load_relaxed_shared(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.volatile.shared.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.f32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed_shared(const double* ptr)
 {
     double result;
-#if __CUDA_ARCH__ < 700
     asm volatile(
+#if __CUDA_ARCH__ < 700
         "ld.volatile.shared.f64 %0, [%1];"
-        : "=d"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
-        : "memory");
 #else
-    asm volatile(
         "ld.relaxed.cta.shared.f64 %0, [%1];"
+#endif
         : "=d"(result)
         : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
         : "memory");
-#endif
 
     return result;
 }
@@ -193,34 +177,30 @@ __device__ __forceinline__ double load_relaxed_shared(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.volatile.shared.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.f64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -229,34 +209,30 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
 __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.volatile.shared.s32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.release.cta.shared.s32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -265,34 +241,30 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
 __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.volatile.shared.s64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.release.cta.shared.s64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire_shared(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -301,36 +273,30 @@ __device__ __forceinline__ float load_acquire_shared(const float* ptr)
 __device__ __forceinline__ void store_release_shared(float* ptr, float result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.volatile.shared.f32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.release.cta.shared.f32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire_shared(const double* ptr)
 {
     double result;
-#if __CUDA_ARCH__ < 700
     asm volatile(
+#if __CUDA_ARCH__ < 700
         "ld.volatile.shared.f64 %0, [%1];"
-        : "=d"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
-        : "memory");
 #else
-    asm volatile(
         "ld.acquire.cta.shared.f64 %0, [%1];"
+#endif
         : "=d"(result)
         : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
         : "memory");
-#endif
     membar_acq_rel_shared();
     return result;
 }
@@ -339,34 +305,30 @@ __device__ __forceinline__ double load_acquire_shared(const double* ptr)
 __device__ __forceinline__ void store_release_shared(double* ptr, double result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.volatile.shared.f64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.release.cta.shared.f64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -374,30 +336,30 @@ __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.relaxed.cta.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -405,30 +367,30 @@ __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.relaxed.cta.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed_local(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -436,30 +398,30 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.relaxed.cta.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed_local(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -467,30 +429,30 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.relaxed.cta.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -499,30 +461,30 @@ __device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
 __device__ __forceinline__ void store_release_local(int32* ptr, int32 result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.release.cta.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire_local(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -531,30 +493,30 @@ __device__ __forceinline__ int64 load_acquire_local(const int64* ptr)
 __device__ __forceinline__ void store_release_local(int64* ptr, int64 result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.release.cta.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire_local(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -563,30 +525,30 @@ __device__ __forceinline__ float load_acquire_local(const float* ptr)
 __device__ __forceinline__ void store_release_local(float* ptr, float result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.release.cta.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire_local(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -595,30 +557,30 @@ __device__ __forceinline__ double load_acquire_local(const double* ptr)
 __device__ __forceinline__ void store_release_local(double* ptr, double result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.release.cta.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -626,30 +588,30 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.relaxed.gpu.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -657,30 +619,30 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.relaxed.gpu.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -688,30 +650,30 @@ __device__ __forceinline__ float load_relaxed(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.relaxed.gpu.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -719,30 +681,30 @@ __device__ __forceinline__ double load_relaxed(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.relaxed.gpu.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -751,30 +713,30 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr)
 __device__ __forceinline__ void store_release(int32* ptr, int32 result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.release.gpu.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -783,30 +745,30 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr)
 __device__ __forceinline__ void store_release(int64* ptr, int64 result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.release.gpu.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -815,30 +777,30 @@ __device__ __forceinline__ float load_acquire(const float* ptr)
 __device__ __forceinline__ void store_release(float* ptr, float result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.release.gpu.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -847,13 +809,15 @@ __device__ __forceinline__ double load_acquire(const double* ptr)
 __device__ __forceinline__ void store_release(double* ptr, double result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.release.gpu.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
@@ -862,19 +826,16 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed_shared(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<float>*>(ptr)))
-                 : "memory");
-#else
-    asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<float>*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.v2.f32 {%0, %1}, [%2];"
+#else
+        "ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "r"(convert_generic_ptr_to_smem_ptr(
+            const_cast<thrust::complex<float>*>(ptr)))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -884,17 +845,15 @@ __device__ __forceinline__ void store_relaxed_shared(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.shared.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -903,19 +862,16 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed_shared(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<double>*>(ptr)))
-                 : "memory");
-#else
-    asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<double>*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.v2.f64 {%0, %1}, [%2];"
+#else
+        "ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "r"(convert_generic_ptr_to_smem_ptr(
+            const_cast<thrust::complex<double>*>(ptr)))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -925,17 +881,15 @@ __device__ __forceinline__ void store_relaxed_shared(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.shared.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
 
 
@@ -944,17 +898,15 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed_local(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f32 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.cta.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<float>*>(ptr))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -964,15 +916,15 @@ __device__ __forceinline__ void store_relaxed_local(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -981,17 +933,15 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed_local(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f64 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.cta.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "l"(const_cast<thrust::complex<double>*>(ptr))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -1001,15 +951,15 @@ __device__ __forceinline__ void store_relaxed_local(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
 
 
@@ -1018,17 +968,15 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f32 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<float>*>(ptr))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -1038,15 +986,15 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<float>* ptr,
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.gpu.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -1055,17 +1003,15 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f64 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "l"(const_cast<thrust::complex<double>*>(ptr))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -1075,13 +1021,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.gpu.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
index 9dec14d2394..49f99d4d96f 100755
--- a/dev_tools/scripts/generate_cuda_memory_ptx.py
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -67,7 +67,7 @@ class type_desc:
 // for reasoning behind this implementation
 #if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11)
     return static_cast<uint32>(__cvta_generic_to_shared(ptr));
-#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \
+#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \\
        __CUDACC_VER_MINOR__ >= 2)
     return __nvvm_get_smem_pointer(ptr);
 #else
@@ -123,17 +123,15 @@ class type_desc:
 __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
 {{
     {t.name} result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
-                 : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
 #else
-    asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
-                 : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
 #endif
+        : "={t.val_constraint}"(result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
     {membar_expression}
     return result;
 }}
@@ -142,15 +140,14 @@ class type_desc:
 __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result)
 {{
     {membar_expression}
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
-                 : "memory");
+        "st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
 #else
-    asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
-                 : "memory");
+        "st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
 #endif
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
+        : "memory");
 }}
 """)
 
@@ -167,17 +164,15 @@ class type_desc:
 {{
     {t.name} real_result;
     {t.name} imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
-                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
 #else
-    asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
-                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
-#endif
+        "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
+#endif                 
+        : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
     return thrust::complex<{t.name}>{{real_result, imag_result}};
 }}
 
@@ -186,14 +181,13 @@ class type_desc:
 {{
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
-                 : "memory");
+        "st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
 #else
-    asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
-                 : "memory");
+        "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
 #endif
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+        : "memory");
 }}
 """)

From e955c6204fcc46101ba4305cac6c5ba7d0b6a4b2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 18 Oct 2024 10:33:39 +0200
Subject: [PATCH 207/448] clear unused version switch macro and change the
 badge

---
 dpcpp/base/executor.dp.cpp                 | 12 ------------
 dpcpp/components/cooperative_groups.dp.hpp |  6 ------
 include/ginkgo/core/base/fwd_decls.hpp     | 15 ---------------
 3 files changed, 33 deletions(-)

diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 8a7460f6bcd..863b8aec211 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -254,15 +254,8 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
     }
     this->get_exec_info().max_workgroup_size = static_cast<int>(
         device.get_info<sycl::info::device::max_work_group_size>());
-// They change the max_work_item_size with template parameter Dimension after
-// major version 6 and adding the default = 3 is not in the same release.
-#if GINKGO_DPCPP_MAJOR_VERSION >= 6
     auto max_workitem_sizes =
         device.get_info<sycl::info::device::max_work_item_sizes<3>>();
-#else
-    auto max_workitem_sizes =
-        device.get_info<sycl::info::device::max_work_item_sizes>();
-#endif
     // Get the max dimension of a sycl::id object
     auto max_work_item_dimensions =
         device.get_info<sycl::info::device::max_work_item_dimensions>();
@@ -273,13 +266,8 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
 
     // Get the hardware threads per eu
     if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-#if GINKGO_DPCPP_MAJOR_VERSION >= 6
         this->get_exec_info().num_pu_per_cu = device.get_info<
             sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
-#else
-        this->get_exec_info().num_pu_per_cu = device.get_info<
-            sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
-#endif
     } else {
         // To make the usage still valid.
         // TODO: check the value for other vendor gpu or cpu.
diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp
index 33a107ef3f5..034bf4baf28 100644
--- a/dpcpp/components/cooperative_groups.dp.hpp
+++ b/dpcpp/components/cooperative_groups.dp.hpp
@@ -451,9 +451,6 @@ __dpct_inline__ grid_group this_grid(sycl::nd_item<3>& group)
 
 
 // Enable group can directly use group function
-#if GINKGO_DPCPP_MAJOR_VERSION < 6
-inline namespace cl {
-#endif
 namespace sycl {
 namespace detail {
 
@@ -480,9 +477,6 @@ struct group_scope<
 }  // namespace spirv
 }  // namespace detail
 }  // namespace sycl
-#if GINKGO_DPCPP_MAJOR_VERSION < 6
-}  // namespace cl
-#endif
 
 
 #endif  // GKO_DPCPP_COMPONENTS_COOPERATIVE_GROUPS_DP_HPP_
diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp
index f7e446d7bf2..84e579058c4 100644
--- a/include/ginkgo/core/base/fwd_decls.hpp
+++ b/include/ginkgo/core/base/fwd_decls.hpp
@@ -32,9 +32,6 @@ struct ihipEvent_t;
 #endif
 
 
-// after intel/llvm September'22 release, which uses major version 6, they
-// introduce another inline namespace _V1.
-#if GINKGO_DPCPP_MAJOR_VERSION >= 6
 namespace sycl {
 inline namespace _V1 {
 
@@ -45,18 +42,6 @@ class event;
 
 }  // namespace _V1
 }  // namespace sycl
-#else  // GINKGO_DPCPP_MAJOR_VERSION < 6
-inline namespace cl {
-namespace sycl {
-
-
-class queue;
-class event;
-
-
-}  // namespace sycl
-}  // namespace cl
-#endif
 
 
 #endif  // GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_

From 1249e17c49d7b54349f997449766bdb07669bffd Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 6 Aug 2024 11:30:48 +0200
Subject: [PATCH 208/448] add file config example with the configs

---
 examples/CMakeLists.txt                       |   1 +
 examples/file-config-solver/CMakeLists.txt    |  21 +++
 examples/file-config-solver/build.sh          |  16 ++
 .../config/blockjacobi-cg.json                |  17 ++
 examples/file-config-solver/config/cg.json    |  13 ++
 examples/file-config-solver/config/ir.json    |  22 +++
 .../config/mixed-pgm-multigrid-cg.json        |  84 ++++++++++
 .../config/parilu-gmres.json                  |  21 +++
 .../config/pgm-multigrid-cg.json              |  30 ++++
 examples/file-config-solver/data/A.mtx        | 114 +++++++++++++
 examples/file-config-solver/doc/builds-on     |   1 +
 examples/file-config-solver/doc/intro.dox     |   5 +
 examples/file-config-solver/doc/kind          |   1 +
 examples/file-config-solver/doc/results.dox   |  35 ++++
 examples/file-config-solver/doc/short-intro   |   1 +
 examples/file-config-solver/doc/tooltip       |   1 +
 .../file-config-solver/file-config-solver.cpp | 150 ++++++++++++++++++
 17 files changed, 533 insertions(+)
 create mode 100644 examples/file-config-solver/CMakeLists.txt
 create mode 100755 examples/file-config-solver/build.sh
 create mode 100644 examples/file-config-solver/config/blockjacobi-cg.json
 create mode 100644 examples/file-config-solver/config/cg.json
 create mode 100644 examples/file-config-solver/config/ir.json
 create mode 100644 examples/file-config-solver/config/mixed-pgm-multigrid-cg.json
 create mode 100644 examples/file-config-solver/config/parilu-gmres.json
 create mode 100644 examples/file-config-solver/config/pgm-multigrid-cg.json
 create mode 100644 examples/file-config-solver/data/A.mtx
 create mode 100644 examples/file-config-solver/doc/builds-on
 create mode 100644 examples/file-config-solver/doc/intro.dox
 create mode 100644 examples/file-config-solver/doc/kind
 create mode 100644 examples/file-config-solver/doc/results.dox
 create mode 100644 examples/file-config-solver/doc/short-intro
 create mode 100644 examples/file-config-solver/doc/tooltip
 create mode 100644 examples/file-config-solver/file-config-solver.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 653d52a1e88..229f8763fcc 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ set(EXAMPLES_EXEC_LIST
 set(EXAMPLES_LIST
     ${EXAMPLES_EXEC_LIST}
     custom-stopping-criterion
+    file-config-solver
     ginkgo-overhead
     minimal-cuda-solver
     mixed-spmv
diff --git a/examples/file-config-solver/CMakeLists.txt b/examples/file-config-solver/CMakeLists.txt
new file mode 100644
index 00000000000..743519f6ce1
--- /dev/null
+++ b/examples/file-config-solver/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.16)
+project(file-config-solver)
+
+# We only need to find Ginkgo/nlohmann_json if we build this example stand-alone
+if (NOT GINKGO_BUILD_EXAMPLES)
+    find_package(Ginkgo 1.9.0 REQUIRED)
+    find_package(nlohmann_json 3.9.1 REQUIRED)
+endif()
+
+add_executable(file-config-solver file-config-solver.cpp)
+target_link_libraries(file-config-solver Ginkgo::ginkgo nlohmann_json::nlohmann_json)
+
+# Copy the data files to the execution directory
+configure_file(data/A.mtx data/A.mtx COPYONLY)
+# Copy the config files to the execution directory
+configure_file(config/cg.json config/cg.json COPYONLY)
+configure_file(config/blockjacobi-cg.json config/blockjacobi-cg.json COPYONLY)
+configure_file(config/ir.json config/ir.json COPYONLY)
+configure_file(config/mixed-pgm-multigrid-cg.json config/mixed-pgm-multigrid-cg.json COPYONLY)
+configure_file(config/parilu-gmres.json config/parilu-gmres.json COPYONLY)
+configure_file(config/pgm-multigrid-cg.json config/pgm-multigrid-cg.json COPYONLY)
diff --git a/examples/file-config-solver/build.sh b/examples/file-config-solver/build.sh
new file mode 100755
index 00000000000..c3143d5634f
--- /dev/null
+++ b/examples/file-config-solver/build.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# set up script
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR=$1
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+source ${THIS_DIR}/../build-setup.sh
+
+# build
+${CXX} -std=c++14 -o ${THIS_DIR}/file-config-solver ${THIS_DIR}/file-config-solver.cpp \
+       -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
+       -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/file-config-solver/config/blockjacobi-cg.json b/examples/file-config-solver/config/blockjacobi-cg.json
new file mode 100644
index 00000000000..d6c434d430c
--- /dev/null
+++ b/examples/file-config-solver/config/blockjacobi-cg.json
@@ -0,0 +1,17 @@
+{
+    "type": "solver::Cg",
+    "preconditioner": {
+        "type": "preconditioner::Jacobi",
+        "max_block_size": 8
+    },
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 20
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-7
+        }
+    ]
+}
diff --git a/examples/file-config-solver/config/cg.json b/examples/file-config-solver/config/cg.json
new file mode 100644
index 00000000000..eaf06470827
--- /dev/null
+++ b/examples/file-config-solver/config/cg.json
@@ -0,0 +1,13 @@
+{
+    "type": "solver::Cg",
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 20
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-7
+        }
+    ]
+}
diff --git a/examples/file-config-solver/config/ir.json b/examples/file-config-solver/config/ir.json
new file mode 100644
index 00000000000..d85fe878c80
--- /dev/null
+++ b/examples/file-config-solver/config/ir.json
@@ -0,0 +1,22 @@
+{
+    "type": "solver::Ir",
+    "solver": {
+        "type": "solver::Cg",
+        "criteria": [
+            {
+                "type": "ResidualNorm",
+                "reduction_factor": 1e-2
+            }
+        ]
+    },
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 10000
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-12
+        }
+    ]
+}
diff --git a/examples/file-config-solver/config/mixed-pgm-multigrid-cg.json b/examples/file-config-solver/config/mixed-pgm-multigrid-cg.json
new file mode 100644
index 00000000000..6ff281731b5
--- /dev/null
+++ b/examples/file-config-solver/config/mixed-pgm-multigrid-cg.json
@@ -0,0 +1,84 @@
+{
+    "type": "solver::Cg",
+    "preconditioner": {
+        "type": "solver::Multigrid",
+        "max_levels": 10,
+        "min_coarse_rows": 2,
+        "pre_smoother": [
+            {
+                "type": "solver::Ir",
+                "relaxation_factor": 0.9,
+                "solver": {
+                    "type": "preconditioner::Jacobi",
+                    "max_block_size": 1
+                },
+                "criteria": [
+                    {
+                        "type": "Iteration",
+                        "max_iters": 1
+                    }
+                ]
+            },
+            {
+                "type": "solver::Ir",
+                "value_type": "float32",
+                "relaxation_factor": 0.9,
+                "solver": {
+                    "type": "preconditioner::Jacobi",
+                    "max_block_size": 1
+                },
+                "criteria": [
+                    {
+                        "type": "Iteration",
+                        "max_iters": 1
+                    }
+                ]
+            }
+        ],
+        "post_uses_pre": true,
+        "mg_level": [
+            {
+                "type": "multigrid::Pgm",
+                "deterministic": true
+            },
+            {
+                "type": "multigrid::Pgm",
+                "value_type": "float32",
+                "deterministic": true
+            }
+        ],
+        "coarsest_solver": {
+            "type": "solver::Ir",
+            "value_type": "float32",
+            "relaxation_factor": 0.9,
+            "solver": {
+                    "type": "preconditioner::Jacobi",
+                    "max_block_size": 1
+                },
+            "criteria": [
+                {
+                    "type": "Iteration",
+                    "max_iters": 4
+                }
+            ]
+        },
+        "default_initial_guess": "zero",
+        "criteria": [
+            {
+                "type": "Iteration",
+                "max_iters": 1
+            }
+        ]
+    },
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 100
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-8,
+            "baseline": "absolute"
+        }
+    ]
+}
diff --git a/examples/file-config-solver/config/parilu-gmres.json b/examples/file-config-solver/config/parilu-gmres.json
new file mode 100644
index 00000000000..57ebb8fea52
--- /dev/null
+++ b/examples/file-config-solver/config/parilu-gmres.json
@@ -0,0 +1,21 @@
+{
+    "type": "solver::Gmres",
+    "preconditioner": {
+        "type": "preconditioner::Ilu",
+        "l_solver_type": "solver::LowerTrs",
+        "reverse_apply": false,
+        "factorization": {
+            "type": "factorization::ParIlu"
+        }
+    },
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 1000
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-7
+        }
+    ]
+}
diff --git a/examples/file-config-solver/config/pgm-multigrid-cg.json b/examples/file-config-solver/config/pgm-multigrid-cg.json
new file mode 100644
index 00000000000..8b48aaec03f
--- /dev/null
+++ b/examples/file-config-solver/config/pgm-multigrid-cg.json
@@ -0,0 +1,30 @@
+{
+    "type": "solver::Cg",
+    "preconditioner": {
+        "type": "solver::Multigrid",
+        "min_coarse_rows": 2,
+        "mg_level": [
+            {
+                "type": "multigrid::Pgm",
+                "deterministic": true
+            }
+        ],
+        "criteria": [
+            {
+                "type": "Iteration",
+                "max_iters": 1
+            }
+        ]
+    },
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 100
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-8,
+            "baseline": "absolute"
+        }
+    ]
+}
diff --git a/examples/file-config-solver/data/A.mtx b/examples/file-config-solver/data/A.mtx
new file mode 100644
index 00000000000..c67437da567
--- /dev/null
+++ b/examples/file-config-solver/data/A.mtx
@@ -0,0 +1,114 @@
+%%MatrixMarket matrix coordinate integer symmetric
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
+% name: JGD_Trefethen/Trefethen_20b
+% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
+% id: 2203
+% date: 2008
+% author: N. Trefethen
+% ed: J.-G. Dumas
+% fields: name title A id date author ed kind notes
+% kind: combinatorial problem
+%-------------------------------------------------------------------------------
+% notes:
+% Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
+% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
+%                                                                      
+% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
+% SIAM News, vol 35, no. 1.                                            
+%                                                                      
+% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
+% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
+% main diagonal and the number 1 in all the positions A(i,j) with      
+% |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
+%                                                                      
+% http://www.siam.org/news/news.php?id=388                             
+%                                                                      
+% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
+%-------------------------------------------------------------------------------
+19 19 83
+1 1 3
+2 1 1
+3 1 1
+5 1 1
+9 1 1
+17 1 1
+2 2 5
+3 2 1
+4 2 1
+6 2 1
+10 2 1
+18 2 1
+3 3 7
+4 3 1
+5 3 1
+7 3 1
+11 3 1
+19 3 1
+4 4 11
+5 4 1
+6 4 1
+8 4 1
+12 4 1
+5 5 13
+6 5 1
+7 5 1
+9 5 1
+13 5 1
+6 6 17
+7 6 1
+8 6 1
+10 6 1
+14 6 1
+7 7 19
+8 7 1
+9 7 1
+11 7 1
+15 7 1
+8 8 23
+9 8 1
+10 8 1
+12 8 1
+16 8 1
+9 9 29
+10 9 1
+11 9 1
+13 9 1
+17 9 1
+10 10 31
+11 10 1
+12 10 1
+14 10 1
+18 10 1
+11 11 37
+12 11 1
+13 11 1
+15 11 1
+19 11 1
+12 12 41
+13 12 1
+14 12 1
+16 12 1
+13 13 43
+14 13 1
+15 13 1
+17 13 1
+14 14 47
+15 14 1
+16 14 1
+18 14 1
+15 15 53
+16 15 1
+17 15 1
+19 15 1
+16 16 59
+17 16 1
+18 16 1
+17 17 61
+18 17 1
+19 17 1
+18 18 67
+19 18 1
+19 19 71
diff --git a/examples/file-config-solver/doc/builds-on b/examples/file-config-solver/doc/builds-on
new file mode 100644
index 00000000000..369aa997770
--- /dev/null
+++ b/examples/file-config-solver/doc/builds-on
@@ -0,0 +1 @@
+simple-solver
diff --git a/examples/file-config-solver/doc/intro.dox b/examples/file-config-solver/doc/intro.dox
new file mode 100644
index 00000000000..44b93ec2ce6
--- /dev/null
+++ b/examples/file-config-solver/doc/intro.dox
@@ -0,0 +1,5 @@
+<a name="File Config Solver"></a>
+<h1>This example shows how to configure a solver from a file.</h1>
+
+<h3> In this example, we first read in a matrix from a file. We then read a JSON configuration file to set up the solver. The example reports the generation time and the execution time of the solver.</h3>
+
diff --git a/examples/file-config-solver/doc/kind b/examples/file-config-solver/doc/kind
new file mode 100644
index 00000000000..c1d9154931a
--- /dev/null
+++ b/examples/file-config-solver/doc/kind
@@ -0,0 +1 @@
+techniques
diff --git a/examples/file-config-solver/doc/results.dox b/examples/file-config-solver/doc/results.dox
new file mode 100644
index 00000000000..6698dd59660
--- /dev/null
+++ b/examples/file-config-solver/doc/results.dox
@@ -0,0 +1,35 @@
+<h1>Results</h1>
+This is the expected output:
+
+@code{.cpp}
+
+Config file: config/cg.json
+{
+    "type": "solver::Cg",
+    "criteria": [
+        {
+            "type": "Iteration",
+            "max_iters": 20
+        },
+        {
+            "type": "ResidualNorm",
+            "reduction_factor": 1e-7
+        }
+    ]
+}
+Initial residual norm sqrt(r^T r):
+%%MatrixMarket matrix array real general
+1 1
+25.9808
+Final residual norm sqrt(r^T r):
+%%MatrixMarket matrix array real general
+1 1
+58.79
+Solver iteration count:     20
+Solver generation time [ms]: 0.065244
+Solver execution time [ms]: 0.793764
+Solver execution time per iteration[ms]: 0.0396882
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/file-config-solver/doc/short-intro b/examples/file-config-solver/doc/short-intro
new file mode 100644
index 00000000000..3abf58fe547
--- /dev/null
+++ b/examples/file-config-solver/doc/short-intro
@@ -0,0 +1 @@
+The file config solver example.
diff --git a/examples/file-config-solver/doc/tooltip b/examples/file-config-solver/doc/tooltip
new file mode 100644
index 00000000000..32b332803aa
--- /dev/null
+++ b/examples/file-config-solver/doc/tooltip
@@ -0,0 +1 @@
+Use a solver described by a configuration file in Ginkgo to solve a linear system.
diff --git a/examples/file-config-solver/file-config-solver.cpp b/examples/file-config-solver/file-config-solver.cpp
new file mode 100644
index 00000000000..9e76d606c59
--- /dev/null
+++ b/examples/file-config-solver/file-config-solver.cpp
@@ -0,0 +1,150 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <string>
+
+#include <ginkgo/ginkgo.hpp>
+
+// the header in extensions is not shipped with ginkgo.hpp
+#include <ginkgo/extensions/config/json_config.hpp>
+
+
+int main(int argc, char* argv[])
+{
+    // Some shortcuts
+    using ValueType = double;
+    using IndexType = int;
+    using vec = gko::matrix::Dense<ValueType>;
+    using mtx = gko::matrix::Csr<ValueType, IndexType>;
+
+    // Print version information
+    std::cout << gko::version_info::get() << std::endl;
+    // Print usage (both arguments are optional, see the defaults below)
+    std::cout << argv[0] << " executor configfile" << std::endl;
+
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    const auto configfile = argc >= 3 ? argv[2] : "config/cg.json";
+    // Figure out where to run the code
+    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
+        exec_map{
+            {"omp", [] { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [] {
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
+             }},
+            {"hip",
+             [] {
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
+             }},
+            {"dpcpp",
+             [] {
+                 return gko::DpcppExecutor::create(
+                     0, gko::ReferenceExecutor::create());
+             }},
+            {"reference", [] { return gko::ReferenceExecutor::create(); }}};
+
+    // executor where Ginkgo will perform the computation
+    const auto exec = exec_map.at(executor_string)();  // throws if not valid
+
+    // Read data
+    auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    // Create RHS as 1 and initial guess as 0
+    gko::size_type size = A->get_size()[0];
+    auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
+    auto host_b = vec::create(exec->get_master(), gko::dim<2>(size, 1));
+    for (gko::size_type i = 0; i < size; i++) {
+        host_x->at(i, 0) = 0.;
+        host_b->at(i, 0) = 1.;
+    }
+    auto x = vec::create(exec);
+    auto b = vec::create(exec);
+    x->copy_from(host_x);
+    b->copy_from(host_b);
+
+    // Calculate initial residual by overwriting b
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto neg_one = gko::initialize<vec>({-1.0}, exec);
+    auto initres = gko::initialize<vec>({0.0}, exec);
+    A->apply(one, x, neg_one, b);
+    b->compute_norm2(initres);
+
+    // Copy b again
+    b->copy_from(host_b);
+
+    // Read the json file into ginkgo structure
+    // cg.json: simple-solver
+    // blockjacobi-cg.json: preconditioned-solver
+    // ir.json: iterative-refinement
+    // parilu-gmres.json: ilu-preconditioned-solver (by using factorization
+    //                    parameter directly)
+    // pgm-multigrid-cg.json: multigrid-preconditioned-solver (set
+    //                        min_coarse_rows additionally due to this small
+    //                        example matrix)
+    // mixed-pgm-multigrid-cg.json: mixed-multigrid-preconditioned-solver
+    //                              (assuming there are always more than one
+    //                              level)
+    auto config = gko::ext::config::parse_json_file(configfile);
+    // Create the registry, which allows passing the existing data into config
+    // This example does not show the usage
+    auto reg = gko::config::registry();
+    // Create the default type descriptor, which gives the default common type
+    // (value/index) for solver generation. If the solver does not specify value
+    // type, the solver will use these types.
+    auto td = gko::config::make_type_descriptor<ValueType, IndexType>();
+    // generate the linopfactory on the given executors
+    auto solver_gen = gko::config::parse(config, reg, td).on(exec);
+
+    // Create solver
+    std::chrono::nanoseconds gen_time(0);
+    auto gen_tic = std::chrono::steady_clock::now();
+    auto solver = solver_gen->generate(A);
+    exec->synchronize();
+    auto gen_toc = std::chrono::steady_clock::now();
+    gen_time +=
+        std::chrono::duration_cast<std::chrono::nanoseconds>(gen_toc - gen_tic);
+
+    // Add logger
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create();
+    solver->add_logger(logger);
+
+    // Solve system
+    exec->synchronize();
+    std::chrono::nanoseconds time(0);
+    auto tic = std::chrono::steady_clock::now();
+    solver->apply(b, x);
+    exec->synchronize();
+    auto toc = std::chrono::steady_clock::now();
+    time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+
+    // Print out the solver config
+    std::cout << "Config file: " << configfile << std::endl;
+    std::ifstream f(configfile);
+    std::cout << f.rdbuf() << std::endl;
+
+    // Get the final residual norm recorded by the convergence logger
+    auto res = gko::as<vec>(logger->get_residual_norm());
+
+    std::cout << "Initial residual norm sqrt(r^T r): \n";
+    write(std::cout, initres);
+    std::cout << "Final residual norm sqrt(r^T r): \n";
+    write(std::cout, res);
+
+    // Print solver statistics
+    std::cout << "Solver iteration count:     " << logger->get_num_iterations()
+              << std::endl;
+    std::cout << "Solver generation time [ms]: "
+              << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
+    std::cout << "Solver execution time [ms]: "
+              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
+    std::cout << "Solver execution time per iteration[ms]: "
+              << static_cast<double>(time.count()) / 1000000.0 /
+                     logger->get_num_iterations()
+              << std::endl;
+}

From 1e878e262e3a76aed21e2610f77b20a4e5e1683e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 9 Sep 2024 20:47:57 +0200
Subject: [PATCH 209/448] update build.sh to c++17 in example

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 examples/adaptiveprecision-blockjacobi/build.sh              | 2 +-
 examples/cb-gmres/build.sh                                   | 2 +-
 examples/custom-logger/build.sh                              | 2 +-
 examples/custom-matrix-format/build.sh                       | 2 +-
 examples/custom-stopping-criterion/build.sh                  | 2 +-
 examples/file-config-solver/build.sh                         | 2 +-
 examples/ginkgo-overhead/build.sh                            | 2 +-
 examples/ginkgo-ranges/build.sh                              | 2 +-
 examples/heat-equation/build.sh                              | 2 +-
 examples/ilu-preconditioned-solver/build.sh                  | 2 +-
 examples/inverse-iteration/build.sh                          | 2 +-
 examples/ir-ilu-preconditioned-solver/build.sh               | 2 +-
 examples/iterative-refinement/build.sh                       | 2 +-
 examples/minimal-cuda-solver/build.sh                        | 2 +-
 examples/mixed-multigrid-preconditioned-solver/build.sh      | 2 +-
 examples/mixed-multigrid-solver/build.sh                     | 2 +-
 examples/mixed-precision-ir/build.sh                         | 2 +-
 examples/mixed-spmv/build.sh                                 | 2 +-
 examples/multigrid-preconditioned-solver-customized/build.sh | 2 +-
 examples/multigrid-preconditioned-solver/build.sh            | 2 +-
 examples/nine-pt-stencil-solver/build.sh                     | 2 +-
 examples/papi-logging/build.sh                               | 2 +-
 examples/par-ilu-convergence/build.sh                        | 2 +-
 examples/performance-debugging/build.sh                      | 2 +-
 examples/poisson-solver/build.sh                             | 2 +-
 examples/preconditioned-solver/build.sh                      | 2 +-
 examples/preconditioner-export/build.sh                      | 2 +-
 examples/reordered-preconditioned-solver/build.sh            | 2 +-
 examples/schroedinger-splitting/build.sh                     | 2 +-
 examples/simple-solver-logging/build.sh                      | 2 +-
 examples/simple-solver/build.sh                              | 2 +-
 examples/three-pt-stencil-solver/build.sh                    | 2 +-
 32 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/examples/adaptiveprecision-blockjacobi/build.sh b/examples/adaptiveprecision-blockjacobi/build.sh
index 4561f46c145..2513838d31a 100755
--- a/examples/adaptiveprecision-blockjacobi/build.sh
+++ b/examples/adaptiveprecision-blockjacobi/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/adaptiveprecision-blockjacobi \
+${CXX} -std=c++17 -o ${THIS_DIR}/adaptiveprecision-blockjacobi \
     ${THIS_DIR}/adaptiveprecision-blockjacobi.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/cb-gmres/build.sh b/examples/cb-gmres/build.sh
index da7e54829b6..413473cf6be 100755
--- a/examples/cb-gmres/build.sh
+++ b/examples/cb-gmres/build.sh
@@ -14,6 +14,6 @@ mkdir -p data
 cp ${THIS_DIR}/../../matrices/test/ani1.mtx data/A.mtx
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/cb-gmres ${THIS_DIR}/cb-gmres.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/cb-gmres ${THIS_DIR}/cb-gmres.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/custom-logger/build.sh b/examples/custom-logger/build.sh
index 8b2f684c5c4..fe76bacc22c 100755
--- a/examples/custom-logger/build.sh
+++ b/examples/custom-logger/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/custom-logger ${THIS_DIR}/custom-logger.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/custom-logger ${THIS_DIR}/custom-logger.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include -L${THIS_DIR} \
        ${LINK_FLAGS}
diff --git a/examples/custom-matrix-format/build.sh b/examples/custom-matrix-format/build.sh
index 10c2d188666..063b390cd95 100755
--- a/examples/custom-matrix-format/build.sh
+++ b/examples/custom-matrix-format/build.sh
@@ -16,7 +16,7 @@ CXX="nvcc"
 LINK_FLAGS="${LINK_FLAGS/-Wl,-rpath,/-Xlinker -rpath -Xlinker }"
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/custom-matrix-format \
+${CXX} -std=c++17 -o ${THIS_DIR}/custom-matrix-format \
        ${THIS_DIR}/custom-matrix-format.cpp ${THIS_DIR}/stencil_kernel.cu \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/custom-stopping-criterion/build.sh b/examples/custom-stopping-criterion/build.sh
index 8fd3a1d226c..8bd05f9fd02 100755
--- a/examples/custom-stopping-criterion/build.sh
+++ b/examples/custom-stopping-criterion/build.sh
@@ -13,7 +13,7 @@ source ${THIS_DIR}/../build-setup.sh
 LINK_FLAGS="${LINK_FLAGS} -lpthread"
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/custom-stopping-criterion \
+${CXX} -std=c++17 -o ${THIS_DIR}/custom-stopping-criterion \
        ${THIS_DIR}/custom-stopping-criterion.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/file-config-solver/build.sh b/examples/file-config-solver/build.sh
index c3143d5634f..f1852573723 100755
--- a/examples/file-config-solver/build.sh
+++ b/examples/file-config-solver/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/file-config-solver ${THIS_DIR}/file-config-solver.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/file-config-solver ${THIS_DIR}/file-config-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/ginkgo-overhead/build.sh b/examples/ginkgo-overhead/build.sh
index 644b0e7f71d..2356d2669fa 100755
--- a/examples/ginkgo-overhead/build.sh
+++ b/examples/ginkgo-overhead/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -O3 -o ${THIS_DIR}/ginkgo-overhead \
+${CXX} -std=c++17 -O3 -o ${THIS_DIR}/ginkgo-overhead \
        ${THIS_DIR}/ginkgo-overhead.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/ginkgo-ranges/build.sh b/examples/ginkgo-ranges/build.sh
index def15b577f0..10d90375d09 100755
--- a/examples/ginkgo-ranges/build.sh
+++ b/examples/ginkgo-ranges/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/ginkgo-ranges ${THIS_DIR}/ginkgo-ranges.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/ginkgo-ranges ${THIS_DIR}/ginkgo-ranges.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/heat-equation/build.sh b/examples/heat-equation/build.sh
index 495d054d477..63453ec5b9f 100755
--- a/examples/heat-equation/build.sh
+++ b/examples/heat-equation/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/heat-equation \
+${CXX} -std=c++17 -o ${THIS_DIR}/heat-equation \
        ${THIS_DIR}/heat-equation.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        `pkg-config --cflags opencv4` \
diff --git a/examples/ilu-preconditioned-solver/build.sh b/examples/ilu-preconditioned-solver/build.sh
index dd4b024a233..c1498913e0d 100755
--- a/examples/ilu-preconditioned-solver/build.sh
+++ b/examples/ilu-preconditioned-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/ilu-preconditioned-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/ilu-preconditioned-solver \
     ${THIS_DIR}/ilu-preconditioned-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/inverse-iteration/build.sh b/examples/inverse-iteration/build.sh
index 221e5dfcdc7..3eee28595f5 100755
--- a/examples/inverse-iteration/build.sh
+++ b/examples/inverse-iteration/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o \
+${CXX} -std=c++17 -o \
     ${THIS_DIR}/inverse-iteration ${THIS_DIR}/inverse-iteration.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/ir-ilu-preconditioned-solver/build.sh b/examples/ir-ilu-preconditioned-solver/build.sh
index 1b19e4d92f7..7e2a652c36c 100755
--- a/examples/ir-ilu-preconditioned-solver/build.sh
+++ b/examples/ir-ilu-preconditioned-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/ir-ilu-preconditioned-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/ir-ilu-preconditioned-solver \
     ${THIS_DIR}/ir-ilu-preconditioned-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/iterative-refinement/build.sh b/examples/iterative-refinement/build.sh
index 1e3db6e5924..f38796c587b 100755
--- a/examples/iterative-refinement/build.sh
+++ b/examples/iterative-refinement/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/iterative-refinement \
+${CXX} -std=c++17 -o ${THIS_DIR}/iterative-refinement \
     ${THIS_DIR}/iterative-refinement.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/minimal-cuda-solver/build.sh b/examples/minimal-cuda-solver/build.sh
index 69a8a51df51..5244496c3bf 100755
--- a/examples/minimal-cuda-solver/build.sh
+++ b/examples/minimal-cuda-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/minimal-cuda-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/minimal-cuda-solver \
     ${THIS_DIR}/minimal-cuda-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/mixed-multigrid-preconditioned-solver/build.sh b/examples/mixed-multigrid-preconditioned-solver/build.sh
index 86fb4ca3107..3f9b0bf71b4 100755
--- a/examples/mixed-multigrid-preconditioned-solver/build.sh
+++ b/examples/mixed-multigrid-preconditioned-solver/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/mixed-multigrid-preconditioned-solver ${THIS_DIR}/mixed-multigrid-preconditioned-solver.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/mixed-multigrid-preconditioned-solver ${THIS_DIR}/mixed-multigrid-preconditioned-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/mixed-multigrid-solver/build.sh b/examples/mixed-multigrid-solver/build.sh
index eecb3c16806..29d30491236 100755
--- a/examples/mixed-multigrid-solver/build.sh
+++ b/examples/mixed-multigrid-solver/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/mixed-multigrid-solver ${THIS_DIR}/mixed-multigrid-solver.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/mixed-multigrid-solver ${THIS_DIR}/mixed-multigrid-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/mixed-precision-ir/build.sh b/examples/mixed-precision-ir/build.sh
index cf18895de52..8fb0aaa8cc1 100755
--- a/examples/mixed-precision-ir/build.sh
+++ b/examples/mixed-precision-ir/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/mixed-precision-ir \
+${CXX} -std=c++17 -o ${THIS_DIR}/mixed-precision-ir \
     ${THIS_DIR}/mixed-precision-ir.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/mixed-spmv/build.sh b/examples/mixed-spmv/build.sh
index 3137d5656c1..1caa4e692f0 100755
--- a/examples/mixed-spmv/build.sh
+++ b/examples/mixed-spmv/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/mixed-spmv ${THIS_DIR}/mixed-spmv.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/mixed-spmv ${THIS_DIR}/mixed-spmv.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/multigrid-preconditioned-solver-customized/build.sh b/examples/multigrid-preconditioned-solver-customized/build.sh
index 16a8115c430..b790fa5cd11 100755
--- a/examples/multigrid-preconditioned-solver-customized/build.sh
+++ b/examples/multigrid-preconditioned-solver-customized/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/multigrid-preconditioned-solver-customized ${THIS_DIR}/multigrid-preconditioned-solver-customized.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/multigrid-preconditioned-solver-customized ${THIS_DIR}/multigrid-preconditioned-solver-customized.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/multigrid-preconditioned-solver/build.sh b/examples/multigrid-preconditioned-solver/build.sh
index 01ffd99d725..d86a5adb5fb 100755
--- a/examples/multigrid-preconditioned-solver/build.sh
+++ b/examples/multigrid-preconditioned-solver/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/multigrid-preconditioned-solver ${THIS_DIR}/multigrid-preconditioned-solver.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/multigrid-preconditioned-solver ${THIS_DIR}/multigrid-preconditioned-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/nine-pt-stencil-solver/build.sh b/examples/nine-pt-stencil-solver/build.sh
index 5fa790ac5f7..c4887047b4f 100755
--- a/examples/nine-pt-stencil-solver/build.sh
+++ b/examples/nine-pt-stencil-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/nine-pt-stencil-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/nine-pt-stencil-solver \
        ${THIS_DIR}/nine-pt-stencil-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/papi-logging/build.sh b/examples/papi-logging/build.sh
index 25355fb7c00..37a693ba9bc 100755
--- a/examples/papi-logging/build.sh
+++ b/examples/papi-logging/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/papi-logging ${THIS_DIR}/papi-logging.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/papi-logging ${THIS_DIR}/papi-logging.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/par-ilu-convergence/build.sh b/examples/par-ilu-convergence/build.sh
index 6af53f00387..5ad355f97e2 100755
--- a/examples/par-ilu-convergence/build.sh
+++ b/examples/par-ilu-convergence/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/par-ilu-convergence \
+${CXX} -std=c++17 -o ${THIS_DIR}/par-ilu-convergence \
     ${THIS_DIR}/par-ilu-convergence.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/performance-debugging/build.sh b/examples/performance-debugging/build.sh
index d79373ccef5..32c297aaf7d 100755
--- a/examples/performance-debugging/build.sh
+++ b/examples/performance-debugging/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/performance-debugging \
+${CXX} -std=c++17 -o ${THIS_DIR}/performance-debugging \
        ${THIS_DIR}/performance-debugging.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/poisson-solver/build.sh b/examples/poisson-solver/build.sh
index ee8a8fc9f8a..a1a67ba8687 100755
--- a/examples/poisson-solver/build.sh
+++ b/examples/poisson-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/poisson-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/poisson-solver \
     ${THIS_DIR}/poisson-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/preconditioned-solver/build.sh b/examples/preconditioned-solver/build.sh
index 205e648e054..0c45ff53194 100755
--- a/examples/preconditioned-solver/build.sh
+++ b/examples/preconditioned-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/preconditioned-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/preconditioned-solver \
     ${THIS_DIR}/preconditioned-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/preconditioner-export/build.sh b/examples/preconditioner-export/build.sh
index 642e4f94636..4479f7ae86b 100755
--- a/examples/preconditioner-export/build.sh
+++ b/examples/preconditioner-export/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/preconditioner-export \
+${CXX} -std=c++17 -o ${THIS_DIR}/preconditioner-export \
     ${THIS_DIR}/preconditioner-export.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/reordered-preconditioned-solver/build.sh b/examples/reordered-preconditioned-solver/build.sh
index cd35af13dc9..39bf8d56a47 100755
--- a/examples/reordered-preconditioned-solver/build.sh
+++ b/examples/reordered-preconditioned-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/preconditioned-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/preconditioned-solver \
     ${THIS_DIR}/reordered-preconditioned-solver.cpp \
     -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
     -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/schroedinger-splitting/build.sh b/examples/schroedinger-splitting/build.sh
index 95fa17efd77..ee318c33541 100755
--- a/examples/schroedinger-splitting/build.sh
+++ b/examples/schroedinger-splitting/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/schroedinger-splitting \
+${CXX} -std=c++17 -o ${THIS_DIR}/schroedinger-splitting \
        ${THIS_DIR}/schroedinger-splitting.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        `pkg-config --cflags opencv4` \
diff --git a/examples/simple-solver-logging/build.sh b/examples/simple-solver-logging/build.sh
index 5a9b3edd96d..8cf47d37e14 100755
--- a/examples/simple-solver-logging/build.sh
+++ b/examples/simple-solver-logging/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/simple-solver-logging \
+${CXX} -std=c++17 -o ${THIS_DIR}/simple-solver-logging \
        ${THIS_DIR}/simple-solver-logging.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/simple-solver/build.sh b/examples/simple-solver/build.sh
index 4430ad58765..03f2a999bf1 100755
--- a/examples/simple-solver/build.sh
+++ b/examples/simple-solver/build.sh
@@ -11,6 +11,6 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/simple-solver ${THIS_DIR}/simple-solver.cpp \
+${CXX} -std=c++17 -o ${THIS_DIR}/simple-solver ${THIS_DIR}/simple-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/three-pt-stencil-solver/build.sh b/examples/three-pt-stencil-solver/build.sh
index efe1675ecae..85be91a9276 100755
--- a/examples/three-pt-stencil-solver/build.sh
+++ b/examples/three-pt-stencil-solver/build.sh
@@ -11,7 +11,7 @@ THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 source ${THIS_DIR}/../build-setup.sh
 
 # build
-${CXX} -std=c++14 -o ${THIS_DIR}/three-pt-stencil-solver \
+${CXX} -std=c++17 -o ${THIS_DIR}/three-pt-stencil-solver \
        ${THIS_DIR}/three-pt-stencil-solver.cpp \
        -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
        -L${THIS_DIR} ${LINK_FLAGS}

From ee76a5538fc65a3028a31fafbe6ac51b771467ef Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 9 Sep 2024 22:25:08 +0200
Subject: [PATCH 210/448] add test and update documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
---
 examples/CMakeLists.txt                        | 14 ++++++++++++++
 examples/file-config-solver/CMakeLists.txt     | 10 ++++------
 examples/file-config-solver/doc/tooltip        |  2 +-
 .../file-config-solver/file-config-solver.cpp  | 18 +++++++++---------
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 229f8763fcc..5908caff3eb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -103,4 +103,18 @@ if(GINKGO_BUILD_TESTS)
                      "${CMAKE_CURRENT_SOURCE_DIR}/${example}")
         endforeach()
     endforeach()
+
+    file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" file-config-solver/config/*.json)
+    foreach(config IN LISTS config_list)
+        get_filename_component(config_name "${config}" NAME_WE)
+        foreach(executor IN LISTS executors)
+            add_test(NAME example_file-config-solver_${config_name}_${executor} 
+                     COMMAND 
+                     "$<TARGET_FILE:file-config-solver>"
+                     "${executor}" "config/${config_name}.json"
+                     WORKING_DIRECTORY
+                     "${CMAKE_CURRENT_BINARY_DIR}/file-config-solver")
+        endforeach()    
+    endforeach()
+        
 endif()
diff --git a/examples/file-config-solver/CMakeLists.txt b/examples/file-config-solver/CMakeLists.txt
index 743519f6ce1..c2cc3d7ef29 100644
--- a/examples/file-config-solver/CMakeLists.txt
+++ b/examples/file-config-solver/CMakeLists.txt
@@ -13,9 +13,7 @@ target_link_libraries(file-config-solver Ginkgo::ginkgo nlohmann_json::nlohmann_
 # Copy the data files to the execution directory
 configure_file(data/A.mtx data/A.mtx COPYONLY)
 # Copy the config files to the execution directory
-configure_file(config/cg.json config/cg.json COPYONLY)
-configure_file(config/blockjacobi-cg.json config/blockjacobi-cg.json COPYONLY)
-configure_file(config/ir.json config/ir.json COPYONLY)
-configure_file(config/mixed-pgm-multigrid-cg.json config/mixed-pgm-multigrid-cg.json COPYONLY)
-configure_file(config/parilu-gmres.json config/parilu-gmres.json COPYONLY)
-configure_file(config/pgm-multigrid-cg.json config/pgm-multigrid-cg.json COPYONLY)
+file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" config/*.json)
+foreach(config IN LISTS config_list)
+    configure_file("${config}" "${config}" COPYONLY)
+endforeach()
\ No newline at end of file
diff --git a/examples/file-config-solver/doc/tooltip b/examples/file-config-solver/doc/tooltip
index 32b332803aa..be7b9637329 100644
--- a/examples/file-config-solver/doc/tooltip
+++ b/examples/file-config-solver/doc/tooltip
@@ -1 +1 @@
-Use a solver from config file in Ginkgo to solve a linear system.
+Use a solver from a config file in Ginkgo to solve a linear system.
diff --git a/examples/file-config-solver/file-config-solver.cpp b/examples/file-config-solver/file-config-solver.cpp
index 9e76d606c59..39c09d6f212 100644
--- a/examples/file-config-solver/file-config-solver.cpp
+++ b/examples/file-config-solver/file-config-solver.cpp
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <chrono>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
@@ -91,7 +92,7 @@ int main(int argc, char* argv[])
     //                              level)
     auto config = gko::ext::config::parse_json_file(configfile);
     // Create the registry, which allows passing the existing data into config
-    // This example does not show the usage
+    // This example does not use existing data.
     auto reg = gko::config::registry();
     // Create the default type descriptor, which gives the default common type
     // (value/index) for solver generation. If the solver does not specify value
@@ -101,12 +102,11 @@ int main(int argc, char* argv[])
     auto solver_gen = gko::config::parse(config, reg, td).on(exec);
 
     // Create solver
-    std::chrono::nanoseconds gen_time(0);
-    auto gen_tic = std::chrono::steady_clock::now();
+    const auto gen_tic = std::chrono::steady_clock::now();
     auto solver = solver_gen->generate(A);
     exec->synchronize();
-    auto gen_toc = std::chrono::steady_clock::now();
-    gen_time +=
+    const auto gen_toc = std::chrono::steady_clock::now();
+    const auto gen_time =
         std::chrono::duration_cast<std::chrono::nanoseconds>(gen_toc - gen_tic);
 
     // Add logger
@@ -116,12 +116,12 @@ int main(int argc, char* argv[])
 
     // Solve system
     exec->synchronize();
-    std::chrono::nanoseconds time(0);
-    auto tic = std::chrono::steady_clock::now();
+    const auto tic = std::chrono::steady_clock::now();
     solver->apply(b, x);
     exec->synchronize();
-    auto toc = std::chrono::steady_clock::now();
-    time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+    const auto toc = std::chrono::steady_clock::now();
+    const auto time =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
 
     // Print out the solver config
     std::cout << "Config file: " << configfile << std::endl;

From 9be6dc1042cf68777b94774acdea3958ca74abe1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 10 Sep 2024 15:16:23 +0200
Subject: [PATCH 211/448] add missing example link

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 doc/examples/examples.hpp.in | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in
index 16bcc3a8823..45cbd34f348 100644
--- a/doc/examples/examples.hpp.in
+++ b/doc/examples/examples.hpp.in
@@ -382,7 +382,14 @@
  *         @ref mixed_multigrid_solver
  *     </td>
  *   </tr>
-
+ * 
+ *   <tr valign="top">
+ *     <td> Configure a solver from a config file
+ *     </td>
+ *     <td>@ref file_config_solver
+ *     </td>
+ *   </tr>
+ * 
  *   <tr valign="top">
  *     <td> Distributed
  *     </td>

From 48552826c336c3b9083f81cefb045bc353e38845 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 18 Oct 2024 13:00:14 +0200
Subject: [PATCH 212/448] use milliseconds directly and update documentation

Co-authored-by: Gregor Olenik <gregor.olenik@web.de>
---
 examples/file-config-solver/CMakeLists.txt        |  2 +-
 examples/file-config-solver/doc/intro.dox         |  2 +-
 .../file-config-solver/file-config-solver.cpp     | 15 ++++++++-------
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/examples/file-config-solver/CMakeLists.txt b/examples/file-config-solver/CMakeLists.txt
index c2cc3d7ef29..a8f4665d6a8 100644
--- a/examples/file-config-solver/CMakeLists.txt
+++ b/examples/file-config-solver/CMakeLists.txt
@@ -16,4 +16,4 @@ configure_file(data/A.mtx data/A.mtx COPYONLY)
 file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" config/*.json)
 foreach(config IN LISTS config_list)
     configure_file("${config}" "${config}" COPYONLY)
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/examples/file-config-solver/doc/intro.dox b/examples/file-config-solver/doc/intro.dox
index 44b93ec2ce6..de2232e4a8f 100644
--- a/examples/file-config-solver/doc/intro.dox
+++ b/examples/file-config-solver/doc/intro.dox
@@ -1,5 +1,5 @@
 <a name="File Config Solver"></a>
 <h1>This example shows how to use file to configure solver.</h1>
 
-<h3> In this example, we first read in a matrix from a file. We read the file to configure the solver. The example features the generating time and runtime of the solver.</h3>
+<h3>In this example, we first read in a matrix from a file. Then we read the config file to configure the solver. The example application reports the generation and run time of the solver.</h3>
 
diff --git a/examples/file-config-solver/file-config-solver.cpp b/examples/file-config-solver/file-config-solver.cpp
index 39c09d6f212..e19dd2873c2 100644
--- a/examples/file-config-solver/file-config-solver.cpp
+++ b/examples/file-config-solver/file-config-solver.cpp
@@ -78,7 +78,8 @@ int main(int argc, char* argv[])
     // Copy b again
     b->copy_from(host_b);
 
-    // Read the json file into ginkgo structure
+    // Read the json config file to configure the ginkgo solver. The following
+    // files, which are mapped to corresponding examples, are available
     // cg.json: simple-solver
     // blockjacobi-cg.json: preconditioned-solver
     // ir.json: iterative-refinement
@@ -106,8 +107,8 @@ int main(int argc, char* argv[])
     auto solver = solver_gen->generate(A);
     exec->synchronize();
     const auto gen_toc = std::chrono::steady_clock::now();
-    const auto gen_time =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(gen_toc - gen_tic);
+    const auto gen_time = std::chrono::duration_cast<std::chrono::milliseconds>(
+        gen_toc - gen_tic);
 
     // Add logger
     std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
@@ -121,7 +122,7 @@ int main(int argc, char* argv[])
     exec->synchronize();
     const auto toc = std::chrono::steady_clock::now();
     const auto time =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
+        std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic);
 
     // Print out the solver config
     std::cout << "Config file: " << configfile << std::endl;
@@ -140,11 +141,11 @@ int main(int argc, char* argv[])
     std::cout << "Solver iteration count:     " << logger->get_num_iterations()
               << std::endl;
     std::cout << "Solver generation time [ms]: "
-              << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
+              << static_cast<double>(gen_time.count()) << std::endl;
     std::cout << "Solver execution time [ms]: "
-              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
+              << static_cast<double>(time.count()) << std::endl;
     std::cout << "Solver execution time per iteration[ms]: "
-              << static_cast<double>(time.count()) / 1000000.0 /
+              << static_cast<double>(time.count()) /
                      logger->get_num_iterations()
               << std::endl;
 }

From db942145a4c9434c9ee257c3385d98022f75a4a6 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 22 Oct 2024 23:02:41 +0200
Subject: [PATCH 213/448] fix windows execution test by passing explicit file paths

---
 examples/CMakeLists.txt                            | 5 +++--
 examples/file-config-solver/file-config-solver.cpp | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5908caff3eb..762a3e33208 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -111,9 +111,10 @@ if(GINKGO_BUILD_TESTS)
             add_test(NAME example_file-config-solver_${config_name}_${executor} 
                      COMMAND 
                      "$<TARGET_FILE:file-config-solver>"
-                     "${executor}" "config/${config_name}.json"
+                     "${executor}" "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/config/${config_name}.json"
+                     "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
                      WORKING_DIRECTORY
-                     "${CMAKE_CURRENT_BINARY_DIR}/file-config-solver")
+                     "$<TARGET_FILE_DIR:ginkgo>")
         endforeach()    
     endforeach()
         
diff --git a/examples/file-config-solver/file-config-solver.cpp b/examples/file-config-solver/file-config-solver.cpp
index e19dd2873c2..9f5a2c1f3e1 100644
--- a/examples/file-config-solver/file-config-solver.cpp
+++ b/examples/file-config-solver/file-config-solver.cpp
@@ -26,10 +26,12 @@ int main(int argc, char* argv[])
     // Print version information
     std::cout << gko::version_info::get() << std::endl;
     // Print usage
-    std::cout << argv[0] << " executor configfile" << std::endl;
+    std::cout << argv[0] << " executor configfile matrix" << std::endl;
 
     const auto executor_string = argc >= 2 ? argv[1] : "reference";
     const auto configfile = argc >= 3 ? argv[2] : "config/cg.json";
+    const std::string matrix_path = argc >= 4 ? argv[3] : "data/A.mtx";
+
     // Figure out where to run the code
     std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
         exec_map{
@@ -54,7 +56,7 @@ int main(int argc, char* argv[])
     const auto exec = exec_map.at(executor_string)();  // throws if not valid
 
     // Read data
-    auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
+    auto A = share(gko::read<mtx>(std::ifstream(matrix_path), exec));
     // Create RHS as 1 and initial guess as 0
     gko::size_type size = A->get_size()[0];
     auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));

From 1e57673afb3bcedc5e16451e3539821d2c73adc3 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 13:28:08 +0200
Subject: [PATCH 214/448] [core] allow filling the device_matrix_data

The main use case is in combination with `sum_duplicates` and `remove_zeros` to simplify the assembly setup.
---
 core/base/device_matrix_data.cpp              | 18 +++++++-
 .../ginkgo/core/base/device_matrix_data.hpp   | 17 ++++++-
 test/base/device_matrix_data_kernels.cpp      | 46 +++++++++++++++++++
 3 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index a2e5d6e7044..11d2536268f 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -29,12 +29,17 @@ GKO_REGISTER_OPERATION(sort_row_major, components::sort_row_major);
 
 template <typename ValueType, typename IndexType>
 device_matrix_data<ValueType, IndexType>::device_matrix_data(
-    std::shared_ptr<const Executor> exec, dim<2> size, size_type num_entries)
+    std::shared_ptr<const Executor> exec, dim<2> size, size_type num_entries,
+    fill_mode fm)
     : size_{size},
       row_idxs_{exec, num_entries},
       col_idxs_{exec, num_entries},
       values_{exec, num_entries}
-{}
+{
+    if (fm == fill_mode::zero) {
+        fill_zero();
+    }
+}
 
 
 template <typename ValueType, typename IndexType>
@@ -93,6 +98,15 @@ device_matrix_data<ValueType, IndexType>::create_from_host(
 }
 
 
+template <typename ValueType, typename IndexType>
+void device_matrix_data<ValueType, IndexType>::fill_zero()
+{
+    row_idxs_.fill(0);
+    col_idxs_.fill(0);
+    values_.fill(ValueType{0});
+}
+
+
 template <typename ValueType, typename IndexType>
 void device_matrix_data<ValueType, IndexType>::sort_row_major()
 {
diff --git a/include/ginkgo/core/base/device_matrix_data.hpp b/include/ginkgo/core/base/device_matrix_data.hpp
index 35e3f300954..dfdd08b261c 100644
--- a/include/ginkgo/core/base/device_matrix_data.hpp
+++ b/include/ginkgo/core/base/device_matrix_data.hpp
@@ -16,6 +16,14 @@
 
 namespace gko {
 
+/**
+ * Enum that describes how allocated data is filled
+ */
+enum class fill_mode {
+    uninitialized,  //!< no fill operation is done
+    zero            //!< fill with zeros
+};
+
 
 /**
  * This type is a device-side equivalent to matrix_data.
@@ -48,9 +56,11 @@ class device_matrix_data {
      * @param exec  the executor to be used to store the matrix entries
      * @param size  the matrix dimensions
      * @param num_entries  the number of entries to be stored
+     * @param fm  describes how the data is filled
      */
     explicit device_matrix_data(std::shared_ptr<const Executor> exec,
-                                dim<2> size = {}, size_type num_entries = 0);
+                                dim<2> size = {}, size_type num_entries = 0,
+                                fill_mode fm = fill_mode::uninitialized);
 
     /**
      * Initializes a device_matrix_data object by copying an existing object on
@@ -114,6 +124,11 @@ class device_matrix_data {
     static device_matrix_data create_from_host(
         std::shared_ptr<const Executor> exec, const host_type& data);
 
+    /**
+     * Fills the matrix entries with zeros
+     */
+    void fill_zero();
+
     /**
      * Sorts the matrix entries in row-major order
      * This means that they will be sorted by row index first, and then by
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index ffadbcfb245..039cc9eac20 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -116,6 +116,30 @@ TYPED_TEST(DeviceMatrixData, ConstructsCorrectly)
 }
 
 
+TYPED_TEST(DeviceMatrixData, ConstructsWithZerosCorrectly)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+
+    gko::device_matrix_data<value_type, index_type> local_data{
+        this->exec, gko::dim<2>{4, 3}, 10, gko::fill_mode::zero};
+
+    ASSERT_EQ((gko::dim<2>{4, 3}), local_data.get_size());
+    ASSERT_EQ(this->exec, local_data.get_executor());
+    ASSERT_EQ(local_data.get_num_stored_elements(), 10);
+    auto arrays = local_data.empty_out();
+    auto expected_row_idxs = gko::array<index_type>(this->exec, 10);
+    auto expected_col_idxs = gko::array<index_type>(this->exec, 10);
+    auto expected_values = gko::array<value_type>(this->exec, 10);
+    expected_row_idxs.fill(0);
+    expected_col_idxs.fill(0);
+    expected_values.fill(0.0);
+    GKO_ASSERT_ARRAY_EQ(arrays.row_idxs, expected_row_idxs);
+    GKO_ASSERT_ARRAY_EQ(arrays.col_idxs, expected_col_idxs);
+    GKO_ASSERT_ARRAY_EQ(arrays.values, expected_values);
+}
+
+
 TYPED_TEST(DeviceMatrixData, CopyConstructsOnOtherExecutorCorrectly)
 {
     using value_type = typename TestFixture::value_type;
@@ -241,6 +265,28 @@ TYPED_TEST(DeviceMatrixData, CopiesToHost)
 }
 
 
+TYPED_TEST(DeviceMatrixData, CanFillEntriesWithZeros)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using device_matrix_data = gko::device_matrix_data<value_type, index_type>;
+    auto device_data = device_matrix_data{this->exec, gko::dim<2>{4, 3}, 10};
+
+    device_data.fill_zero();
+
+    auto arrays = device_data.empty_out();
+    auto expected_row_idxs = gko::array<index_type>(this->exec, 10);
+    auto expected_col_idxs = gko::array<index_type>(this->exec, 10);
+    auto expected_values = gko::array<value_type>(this->exec, 10);
+    expected_row_idxs.fill(0);
+    expected_col_idxs.fill(0);
+    expected_values.fill(0.0);
+    GKO_ASSERT_ARRAY_EQ(arrays.row_idxs, expected_row_idxs);
+    GKO_ASSERT_ARRAY_EQ(arrays.col_idxs, expected_col_idxs);
+    GKO_ASSERT_ARRAY_EQ(arrays.values, expected_values);
+}
+
+
 TYPED_TEST(DeviceMatrixData, SortsRowMajor)
 {
     using value_type = typename TestFixture::value_type;

From bb4f7329f06276b82ea0485ad982b2e77bd5fe98 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 8 Oct 2024 08:21:37 +0000
Subject: [PATCH 215/448] [core] remove zero initialization

---
 core/base/device_matrix_data.cpp              |  9 ++-----
 .../ginkgo/core/base/device_matrix_data.hpp   | 12 +---------
 test/base/device_matrix_data_kernels.cpp      | 24 -------------------
 3 files changed, 3 insertions(+), 42 deletions(-)

diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index 11d2536268f..adbd5af8e60 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -29,17 +29,12 @@ GKO_REGISTER_OPERATION(sort_row_major, components::sort_row_major);
 
 template <typename ValueType, typename IndexType>
 device_matrix_data<ValueType, IndexType>::device_matrix_data(
-    std::shared_ptr<const Executor> exec, dim<2> size, size_type num_entries,
-    fill_mode fm)
+    std::shared_ptr<const Executor> exec, dim<2> size, size_type num_entries)
     : size_{size},
       row_idxs_{exec, num_entries},
       col_idxs_{exec, num_entries},
       values_{exec, num_entries}
-{
-    if (fm == fill_mode::zero) {
-        fill_zero();
-    }
-}
+{}
 
 
 template <typename ValueType, typename IndexType>
diff --git a/include/ginkgo/core/base/device_matrix_data.hpp b/include/ginkgo/core/base/device_matrix_data.hpp
index dfdd08b261c..16a68517b2a 100644
--- a/include/ginkgo/core/base/device_matrix_data.hpp
+++ b/include/ginkgo/core/base/device_matrix_data.hpp
@@ -16,14 +16,6 @@
 
 namespace gko {
 
-/**
- * Enum that describes how allocated data is filled
- */
-enum class fill_mode {
-    uninitialized,  //!< no fill operation is done
-    zero            //!< fill with zeros
-};
-
 
 /**
  * This type is a device-side equivalent to matrix_data.
@@ -56,11 +48,9 @@ class device_matrix_data {
      * @param exec  the executor to be used to store the matrix entries
      * @param size  the matrix dimensions
      * @param num_entries  the number of entries to be stored
-     * @param fm  describes how the data is filled
      */
     explicit device_matrix_data(std::shared_ptr<const Executor> exec,
-                                dim<2> size = {}, size_type num_entries = 0,
-                                fill_mode fm = fill_mode::uninitialized);
+                                dim<2> size = {}, size_type num_entries = 0);
 
     /**
      * Initializes a device_matrix_data object by copying an existing object on
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index 039cc9eac20..6ddc926b76c 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -116,30 +116,6 @@ TYPED_TEST(DeviceMatrixData, ConstructsCorrectly)
 }
 
 
-TYPED_TEST(DeviceMatrixData, ConstructsWithZerosCorrectly)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-
-    gko::device_matrix_data<value_type, index_type> local_data{
-        this->exec, gko::dim<2>{4, 3}, 10, gko::fill_mode::zero};
-
-    ASSERT_EQ((gko::dim<2>{4, 3}), local_data.get_size());
-    ASSERT_EQ(this->exec, local_data.get_executor());
-    ASSERT_EQ(local_data.get_num_stored_elements(), 10);
-    auto arrays = local_data.empty_out();
-    auto expected_row_idxs = gko::array<index_type>(this->exec, 10);
-    auto expected_col_idxs = gko::array<index_type>(this->exec, 10);
-    auto expected_values = gko::array<value_type>(this->exec, 10);
-    expected_row_idxs.fill(0);
-    expected_col_idxs.fill(0);
-    expected_values.fill(0.0);
-    GKO_ASSERT_ARRAY_EQ(arrays.row_idxs, expected_row_idxs);
-    GKO_ASSERT_ARRAY_EQ(arrays.col_idxs, expected_col_idxs);
-    GKO_ASSERT_ARRAY_EQ(arrays.values, expected_values);
-}
-
-
 TYPED_TEST(DeviceMatrixData, CopyConstructsOnOtherExecutorCorrectly)
 {
     using value_type = typename TestFixture::value_type;

From db1a49240f5537834ebae7fb5881f89c48a01908 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 17 Oct 2024 10:29:17 +0200
Subject: [PATCH 216/448] [batch] fix parameter ordering/naming

---
 reference/solver/batch_cg_kernels.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/reference/solver/batch_cg_kernels.hpp b/reference/solver/batch_cg_kernels.hpp
index 2f8e5990931..6208a048972 100644
--- a/reference/solver/batch_cg_kernels.hpp
+++ b/reference/solver/batch_cg_kernels.hpp
@@ -32,8 +32,8 @@ inline void initialize(
     const BatchMatrixType_entry& A_entry,
     const gko::batch::multi_vector::batch_item<const ValueType>& b_entry,
     const gko::batch::multi_vector::batch_item<const ValueType>& x_entry,
-    const gko::batch::multi_vector::batch_item<ValueType>& rho_new_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& rho_old_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& rho_new_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& r_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& p_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& z_entry,
@@ -86,7 +86,7 @@ inline void update_p(
 
 template <typename ValueType>
 inline void update_x_and_r(
-    const gko::batch::multi_vector::batch_item<const ValueType>& rho_old_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& rho_new_entry,
     const gko::batch::multi_vector::batch_item<const ValueType>& p_entry,
     const gko::batch::multi_vector::batch_item<const ValueType>& Ap_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry,
@@ -96,7 +96,7 @@ inline void update_x_and_r(
     batch_single_kernels::compute_conj_dot_product_kernel<ValueType>(
         p_entry, Ap_entry, alpha_entry);
 
-    const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0];
+    const ValueType temp = rho_new_entry.values[0] / alpha_entry.values[0];
     for (int row = 0; row < r_entry.num_rows; row++) {
         x_entry.values[row * x_entry.stride] +=
             temp * p_entry.values[row * p_entry.stride];

From e4b80aecb8b3b921200b07d3201f6b8ac5811e17 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 25 Jun 2024 13:46:55 +0200
Subject: [PATCH 217/448] [fact] extract l/u initialization

---
 core/factorization/factorization_helpers.hpp  |  56 +++++++++
 .../factorization/factorization_helpers.hpp   | 112 ++++++++++++++++++
 .../factorization/factorization_kernels.cpp   |  97 +++------------
 3 files changed, 187 insertions(+), 78 deletions(-)
 create mode 100644 core/factorization/factorization_helpers.hpp
 create mode 100644 reference/factorization/factorization_helpers.hpp

diff --git a/core/factorization/factorization_helpers.hpp b/core/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..16ead3a198d
--- /dev/null
+++ b/core/factorization/factorization_helpers.hpp
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GINKGO_CORE_FACTORIZATION_FACTORIZATION_HELPERS_HPP
+#define GINKGO_CORE_FACTORIZATION_FACTORIZATION_HELPERS_HPP
+
+
+#include <utility>
+
+
+namespace gko {
+namespace factorization {
+
+
+struct identity {
+    template <typename T>
+    constexpr T operator()(T value)
+    {
+        return value;
+    }
+};
+
+
+template <typename DiagClosure, typename OffDiagClosure>
+class triangular_mtx_closure {
+public:
+    constexpr triangular_mtx_closure(DiagClosure diag_closure,
+                                     OffDiagClosure off_diag_closure)
+        : diag_closure_(std::move(diag_closure)),
+          off_diag_closure_(std::move(off_diag_closure))
+    {}
+
+    template <typename T>
+    constexpr T map_diag(T value)
+    {
+        return diag_closure_(value);
+    }
+
+    template <typename T>
+    constexpr T map_off_diag(T value)
+    {
+        return off_diag_closure_(value);
+    }
+
+private:
+    DiagClosure diag_closure_;
+    OffDiagClosure off_diag_closure_;
+};
+
+
+}  // namespace factorization
+}  // namespace gko
+
+
+#endif  // GINKGO_CORE_FACTORIZATION_FACTORIZATION_HELPERS_HPP
diff --git a/reference/factorization/factorization_helpers.hpp b/reference/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..9b5afb518af
--- /dev/null
+++ b/reference/factorization/factorization_helpers.hpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+void initialize_l_u(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                    matrix::Csr<ValueType, IndexType>* csr_l,
+                    matrix::Csr<ValueType, IndexType>* csr_u,
+                    LClosure&& l_closure, UClosure&& u_closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
+    auto col_idxs_u = csr_u->get_col_idxs();
+    auto vals_u = csr_u->get_values();
+
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        size_type current_index_u =
+            row_ptrs_u[row] + 1;  // we treat the diagonal separately
+        // if there is no diagonal value, set it to 1 by default
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = l_closure.map_off_diag(val);
+                ++current_index_l;
+            } else if (col == row) {
+                // save diagonal value
+                diag_val = val;
+            } else {  // col > row
+                col_idxs_u[current_index_u] = col;
+                vals_u[current_index_u] = u_closure.map_off_diag(val);
+                ++current_index_u;
+            }
+        }
+        // store diagonal values separately
+        auto l_diag_idx = row_ptrs_l[row + 1] - 1;
+        auto u_diag_idx = row_ptrs_u[row];
+        col_idxs_l[l_diag_idx] = row;
+        col_idxs_u[u_diag_idx] = row;
+        vals_l[l_diag_idx] = l_closure.map_diag(diag_val);
+        vals_u[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename Closure>
+void initialize_l(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                  matrix::Csr<ValueType, IndexType>* csr_l, Closure&& closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        // if there is no diagonal value, set it to 1 by default
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = closure.map_off_diag(val);
+                ++current_index_l;
+            } else if (col == row) {
+                // save diagonal value
+                diag_val = val;
+            }
+        }
+        // store diagonal values separately
+        auto l_diag_idx = row_ptrs_l[row + 1] - 1;
+        col_idxs_l[l_diag_idx] = row;
+        vals_l[l_diag_idx] = closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp
index 085e2f62ecc..99b522ffba9 100644
--- a/reference/factorization/factorization_kernels.cpp
+++ b/reference/factorization/factorization_kernels.cpp
@@ -12,6 +12,7 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "reference/factorization/factorization_helpers.hpp"
 
 
 namespace gko {
@@ -168,48 +169,12 @@ void initialize_l_u(std::shared_ptr<const ReferenceExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_l,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
-    auto col_idxs_u = csr_u->get_col_idxs();
-    auto vals_u = csr_u->get_values();
-
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        size_type current_index_u =
-            row_ptrs_u[row] + 1;  // we treat the diagonal separately
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save diagonal value
-                diag_val = val;
-            } else {  // col > row
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            }
-        }
-        // store diagonal values separately
-        auto l_diag_idx = row_ptrs_l[row + 1] - 1;
-        auto u_diag_idx = row_ptrs_u[row];
-        col_idxs_l[l_diag_idx] = row;
-        col_idxs_u[u_diag_idx] = row;
-        vals_l[l_diag_idx] = one<ValueType>();
-        vals_u[u_diag_idx] = diag_val;
-    }
+    helpers::initialize_l_u(
+        system_matrix, csr_l, csr_u,
+        helpers::triangular_mtx_closure([](auto) { return one<ValueType>(); },
+                                        helpers::identity{}),
+        helpers::triangular_mtx_closure(helpers::identity{},
+                                        helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -248,42 +213,18 @@ void initialize_l(std::shared_ptr<const ReferenceExecutor> exec,
                   const matrix::Csr<ValueType, IndexType>* system_matrix,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save diagonal value
-                diag_val = val;
-            }
-        }
-        // store diagonal values separately
-        auto l_diag_idx = row_ptrs_l[row + 1] - 1;
-        col_idxs_l[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (diag_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        vals_l[l_diag_idx] = diag_val;
-    }
+    helpers::initialize_l(system_matrix, csr_l,
+                          helpers::triangular_mtx_closure(
+                              [diag_sqrt](auto val) {
+                                  if (diag_sqrt) {
+                                      val = sqrt(val);
+                                      if (!is_finite(val)) {
+                                          val = one<ValueType>();
+                                      }
+                                  }
+                                  return val;
+                              },
+                              helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

From 601263df943fc686e8556f7225ade2ebbd91e966 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 25 Jun 2024 16:07:46 +0200
Subject: [PATCH 218/448] [prec] implement SOR preconditioner reference kernels

Only the reference kernels are implemented; the kernels added in
common/unified are declared as GKO_NOT_IMPLEMENTED.
---
 common/unified/CMakeLists.txt                 |   1 +
 common/unified/preconditioner/sor_kernels.cpp |  44 ++++++
 core/CMakeLists.txt                           |   1 +
 core/device_hooks/common_kernels.inc.cpp      |  11 ++
 core/preconditioner/sor.cpp                   | 133 ++++++++++++++++++
 core/preconditioner/sor_kernels.hpp           |  50 +++++++
 include/ginkgo/core/preconditioner/sor.hpp    | 125 ++++++++++++++++
 include/ginkgo/ginkgo.hpp                     |   1 +
 reference/CMakeLists.txt                      |   1 +
 reference/preconditioner/sor_kernels.cpp      |  70 +++++++++
 10 files changed, 437 insertions(+)
 create mode 100644 common/unified/preconditioner/sor_kernels.cpp
 create mode 100644 core/preconditioner/sor.cpp
 create mode 100644 core/preconditioner/sor_kernels.hpp
 create mode 100644 include/ginkgo/core/preconditioner/sor.hpp
 create mode 100644 reference/preconditioner/sor_kernels.cpp

diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
index 00bc21df0c6..132e04c5d9a 100644
--- a/common/unified/CMakeLists.txt
+++ b/common/unified/CMakeLists.txt
@@ -19,6 +19,7 @@ set(UNIFIED_SOURCES
     matrix/diagonal_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/jacobi_kernels.cpp
+    preconditioner/sor_kernels.cpp
     solver/bicg_kernels.cpp
     solver/bicgstab_kernels.cpp
     solver/cg_kernels.cpp
diff --git a/common/unified/preconditioner/sor_kernels.cpp b/common/unified/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..8932c1df562
--- /dev/null
+++ b/common/unified/preconditioner/sor_kernels.cpp
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight,
+    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 8c802b2eca5..ef07359e8b4 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -88,6 +88,7 @@ target_sources(${ginkgo_core}
     multigrid/pgm.cpp
     multigrid/fixed_coarsening.cpp
     preconditioner/batch_jacobi.cpp
+        preconditioner/sor.cpp
     preconditioner/ic.cpp
     preconditioner/ilu.cpp
     preconditioner/isai.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 1ba925e94e3..26de8531741 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -48,6 +48,7 @@
 #include "core/preconditioner/batch_jacobi_kernels.hpp"
 #include "core/preconditioner/isai_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
+#include "core/preconditioner/sor_kernels.hpp"
 #include "core/reorder/rcm_kernels.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
@@ -819,6 +820,16 @@ GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL);
 }  // namespace jacobi
 
 
+namespace sor {
+
+
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+
+
 namespace isai {
 
 
diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp
new file mode 100644
index 00000000000..30a2539a0cc
--- /dev/null
+++ b/core/preconditioner/sor.cpp
@@ -0,0 +1,133 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/precision_dispatch.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+#include <ginkgo/core/solver/triangular.hpp>
+
+#include "core/base/array_access.hpp"
+#include "core/base/utils.hpp"
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/preconditioner/sor_kernels.hpp"
+
+namespace gko {
+namespace preconditioner {
+namespace {
+
+
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
+                       factorization::initialize_row_ptrs_l);
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
+                       factorization::initialize_row_ptrs_l_u);
+GKO_REGISTER_OPERATION(initialize_weighted_l, sor::initialize_weighted_l);
+GKO_REGISTER_OPERATION(initialize_weighted_l_u, sor::initialize_weighted_l_u);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<typename Sor<ValueType, IndexType>::composition_type>
+Sor<ValueType, IndexType>::generate(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    auto product =
+        std::unique_ptr<composition_type>(static_cast<composition_type*>(
+            this->LinOpFactory::generate(std::move(system_matrix)).release()));
+    return product;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Sor<ValueType, IndexType>::generate_impl(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    using Csr = matrix::Csr<ValueType, IndexType>;
+    using LTrs = solver::LowerTrs<value_type, index_type>;
+    using UTrs = solver::UpperTrs<value_type, index_type>;
+
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
+
+    auto exec = this->get_executor();
+    auto size = system_matrix->get_size();
+
+    auto csr_matrix = convert_to_with_sorting<Csr>(exec, system_matrix,
+                                                   parameters_.skip_sorting);
+
+    auto l_trs_factory =
+        parameters_.l_solver ? parameters_.l_solver : LTrs::build().on(exec);
+    auto u_trs_factory =
+        parameters_.u_solver ? parameters_.u_solver : UTrs::build().on(exec);
+
+    if (parameters_.symmetric) {
+        array<index_type> l_row_ptrs{exec, size[0] + 1};
+        array<index_type> u_row_ptrs{exec, size[0] + 1};
+        exec->run(make_initialize_row_ptrs_l_u(
+            csr_matrix.get(), l_row_ptrs.get_data(), u_row_ptrs.get_data()));
+        const auto l_nnz =
+            static_cast<size_type>(get_element(l_row_ptrs, size[0]));
+        const auto u_nnz =
+            static_cast<size_type>(get_element(u_row_ptrs, size[0]));
+
+        // create matrices
+        auto l_mtx =
+            Csr::create(exec, size, array<value_type>{exec, l_nnz},
+                        array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+        auto u_mtx =
+            Csr::create(exec, size, array<value_type>{exec, u_nnz},
+                        array<index_type>{exec, u_nnz}, std::move(u_row_ptrs));
+
+        // fill l_mtx with 1/w (D + wL)
+        // fill u_mtx with 1/(2-w) (D + wU)
+        exec->run(make_initialize_weighted_l_u(csr_matrix.get(),
+                                               parameters_.relaxation_factor,
+                                               l_mtx.get(), u_mtx.get()));
+
+        // scale u_mtx with D^-1
+        auto diag = csr_matrix->extract_diagonal();
+        diag->inverse_apply(u_mtx, u_mtx);
+
+        // invert the triangular matrices with triangular solvers
+        auto l_trs = l_trs_factory->generate(std::move(l_mtx));
+        auto u_trs = u_trs_factory->generate(std::move(u_mtx));
+
+        // return (1/(w * (2 - w)) (D + wL) D^-1 (D + wU))^-1
+        // because of the inversion, the factor order is switched
+        return composition_type::create(std::move(u_trs), std::move(l_trs));
+    } else {
+        array<index_type> l_row_ptrs{exec, size[0] + 1};
+        exec->run(make_initialize_row_ptrs_l(csr_matrix.get(),
+                                             l_row_ptrs.get_data()));
+        const auto l_nnz =
+            static_cast<size_type>(get_element(l_row_ptrs, size[0]));
+
+        // create matrices
+        auto l_mtx =
+            Csr::create(exec, size, array<value_type>{exec, l_nnz},
+                        array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+
+        // fill l_mtx with 1/w * (D + wL)
+        exec->run(make_initialize_weighted_l(
+            csr_matrix.get(), parameters_.relaxation_factor, l_mtx.get()));
+
+        // invert the triangular matrices with triangular solvers
+        auto l_trs = l_trs_factory->generate(std::move(l_mtx));
+
+        return composition_type::create(std::move(l_trs));
+    }
+}
+
+
+#define GKO_DECLARE_SOR(ValueType, IndexType) class Sor<ValueType, IndexType>
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR);
+
+
+}  // namespace preconditioner
+}  // namespace gko
diff --git a/core/preconditioner/sor_kernels.hpp b/core/preconditioner/sor_kernels.hpp
new file mode 100644
index 00000000000..fbca3de612c
--- /dev/null
+++ b/core/preconditioner/sor_kernels.hpp
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_PRECONDITIONER_SOR_KERNELS_HPP_
+#define GKO_CORE_PRECONDITIONER_SOR_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L(_vtype, _itype) \
+    void initialize_weighted_l(                               \
+        std::shared_ptr<const DefaultExecutor> exec,          \
+        const matrix::Csr<_vtype, _itype>* system_matrix,     \
+        remove_complex<_vtype> weight, matrix::Csr<_vtype, _itype>* l_factor)
+
+
+#define GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U(_vtype, _itype)               \
+    void initialize_weighted_l_u(                                             \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const matrix::Csr<_vtype, _itype>* system_matrix,                     \
+        remove_complex<_vtype> weight, matrix::Csr<_vtype, _itype>* l_factor, \
+        matrix::Csr<_vtype, _itype>* u_factor)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                             \
+    template <typename ValueType, typename IndexType>            \
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>            \
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U(ValueType, IndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(sor, GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CORE_PRECONDITIONER_SOR_KERNELS_HPP_
diff --git a/include/ginkgo/core/preconditioner/sor.hpp b/include/ginkgo/core/preconditioner/sor.hpp
new file mode 100644
index 00000000000..941f012039d
--- /dev/null
+++ b/include/ginkgo/core/preconditioner/sor.hpp
@@ -0,0 +1,125 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_PRECONDITIONER_SOR_HPP_
+#define GKO_PUBLIC_CORE_PRECONDITIONER_SOR_HPP_
+
+
+#include <vector>
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+
+
+namespace gko {
+namespace preconditioner {
+
+
+/**
+ * This class generates the (S)SOR preconditioner.
+ *
+ * The SOR preconditioner starts from a splitting of the matrix $A$ into
+ * $A = D + L + U$, where $L$ contains all entries below the diagonal, and $U$
+ * contains all entries above the diagonal. The application of the
+ * preconditioner is then defined as solving $M x = y$ with
+ * $$
+ * M = \frac{1}{\omega} (D + \omega L), \quad 0 < \omega < 2.
+ * $$
+ * $\omega$ is known as the relaxation factor.
+ * The preconditioner can be made symmetric, leading to the SSOR preconditioner.
+ * Here, $M$ is defined as
+ * $$
+ * M = \frac{1}{\omega (2 - \omega)} (D + \omega L) D^{-1} (D + \omega U) ,
+ * \quad 0 < \omega < 2.
+ * $$
+ *
+ * This class is a factory, which will only generate the preconditioner. The
+ * resulting LinOp will represent the application of $M^{-1}$.
+ *
+ * @tparam ValueType  The value type of the internally used CSR matrix
+ * @tparam IndexType  The index type of the internally used CSR matrix
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class Sor
+    : public EnablePolymorphicObject<Sor<ValueType, IndexType>, LinOpFactory>,
+      public EnablePolymorphicAssignment<Sor<ValueType, IndexType>> {
+    friend class EnablePolymorphicObject<Sor, LinOpFactory>;
+
+public:
+    struct parameters_type;
+    friend class enable_parameters_type<parameters_type, Sor>;
+
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using composition_type = Composition<ValueType>;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, Sor> {
+        // skip sorting of input matrix
+        bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
+
+        // determines if SOR or SSOR should be used
+        bool GKO_FACTORY_PARAMETER_SCALAR(symmetric, false);
+
+        // has to be between 0.0 and 2.0
+        remove_complex<value_type> GKO_FACTORY_PARAMETER_SCALAR(
+            relaxation_factor, remove_complex<value_type>(1.2));
+
+        // factory for the lower triangular factor solver
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            l_solver);
+
+        // factory for the upper triangular factor solver, unused if symmetric
+        // is false
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            u_solver);
+    };
+
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
+    /**
+     * @copydoc get_parameters
+     */
+    const parameters_type& get_parameters() const { return parameters_; }
+
+    /**
+     * @copydoc LinOpFactory::generate
+     * @note This function overrides the default LinOpFactory::generate to
+     *       return a Composition instead of a generic LinOp, which would need
+     *       to be cast to Composition again to access its operators.
+     *       It is only necessary because smart pointers aren't covariant.
+     */
+    std::unique_ptr<composition_type> generate(
+        std::shared_ptr<const LinOp> system_matrix) const;
+
+    /** Creates a new parameter_type to set up the factory. */
+    static parameters_type build() { return {}; }
+
+protected:
+    explicit Sor(std::shared_ptr<const Executor> exec,
+                 const parameters_type& params = {})
+        : EnablePolymorphicObject<Sor, LinOpFactory>(exec), parameters_(params)
+    {
+        GKO_ASSERT(parameters_.relaxation_factor > 0.0 &&
+                   parameters_.relaxation_factor < 2.0);
+    }
+
+    std::unique_ptr<LinOp> generate_impl(
+        std::shared_ptr<const LinOp> system_matrix) const override;
+
+private:
+    parameters_type parameters_;
+};
+}  // namespace preconditioner
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_PRECONDITIONER_SOR_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 0fab93dcefe..c44cdee2485 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -118,6 +118,7 @@
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
 #include <ginkgo/core/preconditioner/utils.hpp>
 
 #include <ginkgo/core/reorder/amd.hpp>
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 85b8f33e38b..ab6c210518b 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -43,6 +43,7 @@ target_sources(ginkgo_reference
     matrix/sparsity_csr_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/batch_jacobi_kernels.cpp
+        preconditioner/sor_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
     reorder/rcm_kernels.cpp
diff --git a/reference/preconditioner/sor_kernels.cpp b/reference/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..88ac422dd02
--- /dev/null
+++ b/reference/preconditioner/sor_kernels.cpp
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <memory>
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "reference/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    auto inv_weight = one(weight) / weight;
+    factorization::helpers::initialize_l(
+        system_matrix, l_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },
+            [](auto val) { return val; }));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+    factorization::helpers::initialize_l_u(
+        system_matrix, l_mtx, u_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },
+            factorization::helpers::identity{}),
+        factorization::helpers::triangular_mtx_closure(
+            [inv_two_minus_weight](auto val) {
+                return val * inv_two_minus_weight;
+            },
+            [weight, inv_two_minus_weight](auto val) {
+                return val * weight * inv_two_minus_weight;
+            }));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko

From 22dcf40e0dcbea9d446caa68fd632dd8633ce9fb Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 25 Jun 2024 17:46:06 +0200
Subject: [PATCH 219/448] [prec] add SOR core + ref tests

---
 core/test/preconditioner/CMakeLists.txt       |   1 +
 core/test/preconditioner/sor.cpp              |  58 +++++
 reference/test/preconditioner/CMakeLists.txt  |   1 +
 reference/test/preconditioner/sor_kernels.cpp | 204 ++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 core/test/preconditioner/sor.cpp
 create mode 100644 reference/test/preconditioner/sor_kernels.cpp

diff --git a/core/test/preconditioner/CMakeLists.txt b/core/test/preconditioner/CMakeLists.txt
index 41db021e030..87996a79e32 100644
--- a/core/test/preconditioner/CMakeLists.txt
+++ b/core/test/preconditioner/CMakeLists.txt
@@ -3,3 +3,4 @@ ginkgo_create_test(ic)
 ginkgo_create_test(ilu)
 ginkgo_create_test(isai)
 ginkgo_create_test(jacobi)
+ginkgo_create_test(sor)
diff --git a/core/test/preconditioner/sor.cpp b/core/test/preconditioner/sor.cpp
new file mode 100644
index 00000000000..21c6f1a03e5
--- /dev/null
+++ b/core/test/preconditioner/sor.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+
+#include "core/test/utils.hpp"
+
+
+class SorFactory : public ::testing::Test {
+public:
+    using sor_type = gko::preconditioner::Sor<double, int>;
+    using l_isai_type = gko::preconditioner::LowerIsai<double, int>;
+    using u_isai_type = gko::preconditioner::UpperIsai<double, int>;
+
+    std::shared_ptr<gko::ReferenceExecutor> exec =
+        gko::ReferenceExecutor::create();
+};
+
+
+TEST_F(SorFactory, CanDefaultBuild)
+{
+    auto factory = sor_type::build().on(exec);
+
+    auto params = factory->get_parameters();
+    ASSERT_EQ(params.skip_sorting, false);
+    ASSERT_EQ(params.relaxation_factor, 1.2);
+    ASSERT_EQ(params.symmetric, false);
+    ASSERT_EQ(params.l_solver, nullptr);
+    ASSERT_EQ(params.u_solver, nullptr);
+}
+
+
+TEST_F(SorFactory, CanBuildWithParameters)
+{
+    auto factory = sor_type::build()
+                       .with_skip_sorting(true)
+                       .with_relaxation_factor(0.5)
+                       .with_symmetric(true)
+                       .with_l_solver(l_isai_type::build())
+                       .with_u_solver(u_isai_type::build())
+                       .on(exec);
+
+    auto params = factory->get_parameters();
+    ASSERT_EQ(params.skip_sorting, true);
+    ASSERT_EQ(params.relaxation_factor, 0.5);
+    ASSERT_EQ(params.symmetric, true);
+    ASSERT_NE(params.l_solver, nullptr);
+    GKO_ASSERT_DYNAMIC_TYPE(params.l_solver, l_isai_type::Factory);
+    ASSERT_NE(params.u_solver, nullptr);
+    GKO_ASSERT_DYNAMIC_TYPE(params.u_solver, u_isai_type::Factory);
+}
diff --git a/reference/test/preconditioner/CMakeLists.txt b/reference/test/preconditioner/CMakeLists.txt
index c8b945e2913..09c88608d65 100644
--- a/reference/test/preconditioner/CMakeLists.txt
+++ b/reference/test/preconditioner/CMakeLists.txt
@@ -4,3 +4,4 @@ ginkgo_create_test(ic)
 ginkgo_create_test(isai_kernels)
 ginkgo_create_test(jacobi)
 ginkgo_create_test(jacobi_kernels)
+ginkgo_create_test(sor_kernels)
diff --git a/reference/test/preconditioner/sor_kernels.cpp b/reference/test/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..f2bf5f186f9
--- /dev/null
+++ b/reference/test/preconditioner/sor_kernels.cpp
@@ -0,0 +1,204 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+#include <ginkgo/core/solver/triangular.hpp>
+
+#include "core/test/utils.hpp"
+
+
+template <typename ValueIndexType>
+class Sor : public ::testing::Test {
+public:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using csr_type = gko::matrix::Csr<value_type, index_type>;
+    using sor_type = gko::preconditioner::Sor<value_type, index_type>;
+
+    std::shared_ptr<gko::ReferenceExecutor> exec =
+        gko::ReferenceExecutor::create();
+    gko::remove_complex<value_type> diag_value =
+        static_cast<gko::remove_complex<value_type>>(1.5);
+    std::shared_ptr<csr_type> mtx =
+        gko::initialize<csr_type>({{diag_value, 2, 0, 3, 4},
+                                   {-2, diag_value, 5, 0, 0},
+                                   {0, -5, diag_value, 0, 6},
+                                   {-3, 0, 0, diag_value, 7},
+                                   {-4, 0, -6, -7, diag_value}},
+                                  exec);
+    std::shared_ptr<csr_type> expected_l =
+        gko::initialize<csr_type>({{diag_value, 0, 0, 0, 0},
+                                   {-2, diag_value, 0, 0, 0},
+                                   {0, -5, diag_value, 0, 0},
+                                   {-3, 0, 0, diag_value, 0},
+                                   {-4, 0, -6, -7, diag_value}},
+                                  exec);
+    std::shared_ptr<csr_type> expected_u =
+        gko::initialize<csr_type>({{diag_value, 2, 0, 3, 4},
+                                   {0, diag_value, 5, 0, 0},
+                                   {0, 0, diag_value, 0, 6},
+                                   {0, 0, 0, diag_value, 7},
+                                   {0, 0, 0, 0, diag_value}},
+                                  exec);
+};
+
+TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+
+
+TYPED_TEST(Sor, CanInitializeLFactor)
+{
+    using value_type = typename TestFixture::value_type;
+    auto result = gko::clone(this->expected_l);
+    result->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+
+    gko::kernels::reference::sor::initialize_weighted_l(
+        this->exec, this->mtx.get(), 1.0, result.get());
+
+    GKO_ASSERT_MTX_NEAR(result, this->expected_l, 0.0);
+}
+
+
+TYPED_TEST(Sor, CanInitializeLFactorWithWeight)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr_type = typename TestFixture::csr_type;
+    auto result = gko::clone(this->expected_l);
+    result->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+    std::shared_ptr<csr_type> expected_l =
+        gko::initialize<csr_type>({{1, 0, 0, 0, 0},
+                                   {-2, 1, 0, 0, 0},
+                                   {0, -5, 1, 0, 0},
+                                   {-3, 0, 0, 1, 0},
+                                   {-4, 0, -6, -7, 1}},
+                                  this->exec);
+
+    gko::kernels::reference::sor::initialize_weighted_l(
+        this->exec, this->mtx.get(), this->diag_value, result.get());
+
+    GKO_ASSERT_MTX_NEAR(result, expected_l, r<value_type>::value);
+}
+
+
+TYPED_TEST(Sor, CanInitializeLAndUFactor)
+{
+    using value_type = typename TestFixture::value_type;
+    auto result_l = gko::clone(this->expected_l);
+    auto result_u = gko::clone(this->expected_u);
+    result_l->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+    result_u->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+
+    gko::kernels::reference::sor::initialize_weighted_l_u(
+        this->exec, this->mtx.get(), 1.0, result_l.get(), result_u.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, this->expected_l, 0.0);
+    GKO_ASSERT_MTX_NEAR(result_u, this->expected_u, 0.0);
+}
+
+
+TYPED_TEST(Sor, CanInitializeLAndUFactorWithWeight)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr_type = typename TestFixture::csr_type;
+    auto result_l = gko::clone(this->expected_l);
+    auto result_u = gko::clone(this->expected_u);
+    result_l->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+    result_u->scale(
+        gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
+    auto diag_weight = static_cast<gko::remove_complex<value_type>>(
+        1.0 / (2 - this->diag_value));
+    auto off_diag_weight = this->diag_value * diag_weight;
+    std::shared_ptr<csr_type> expected_l =
+        gko::initialize<csr_type>({{1, 0, 0, 0, 0},
+                                   {-2, 1, 0, 0, 0},
+                                   {0, -5, 1, 0, 0},
+                                   {-3, 0, 0, 1, 0},
+                                   {-4, 0, -6, -7, 1}},
+                                  this->exec);
+    std::shared_ptr<csr_type> expected_u = gko::initialize<csr_type>(
+        {{this->diag_value * diag_weight, 2 * off_diag_weight, 0,
+          3 * off_diag_weight, 4 * off_diag_weight},
+         {0, this->diag_value * diag_weight, 5 * off_diag_weight, 0, 0},
+         {0, 0, this->diag_value * diag_weight, 0, 6 * off_diag_weight},
+         {0, 0, 0, this->diag_value * diag_weight, 7 * off_diag_weight},
+         {0, 0, 0, 0, this->diag_value * diag_weight}},
+        this->exec);
+
+    gko::kernels::reference::sor::initialize_weighted_l_u(
+        this->exec, this->mtx.get(), this->diag_value, result_l.get(),
+        result_u.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, expected_l, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(result_u, expected_u, r<value_type>::value);
+}
+
+
+TYPED_TEST(Sor, CanGenerateNonSymmetric)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using sor_type = typename TestFixture::sor_type;
+    using composition_type = typename sor_type::composition_type;
+    using trs_type = gko::solver::LowerTrs<value_type, index_type>;
+
+    auto sor_pre = sor_type::build()
+                       .with_relaxation_factor(1.0f)
+                       .on(this->exec)
+                       ->generate(this->mtx);
+
+    testing::StaticAssertTypeEq<decltype(sor_pre),
+                                std::unique_ptr<composition_type>>();
+    const auto& ops = sor_pre->get_operators();
+    ASSERT_EQ(ops.size(), 1);
+    GKO_ASSERT_DYNAMIC_TYPE(ops[0], trs_type);
+    auto result_l = gko::as<trs_type>(ops[0])->get_system_matrix();
+    GKO_ASSERT_MTX_NEAR(result_l, this->expected_l, 0.0);
+}
+
+
+TYPED_TEST(Sor, CanGenerateSymmetric)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using sor_type = typename TestFixture::sor_type;
+    using composition_type = typename sor_type::composition_type;
+    using l_trs_type = gko::solver::LowerTrs<value_type, index_type>;
+    using u_trs_type = gko::solver::UpperTrs<value_type, index_type>;
+
+    auto sor_pre = sor_type::build()
+                       .with_symmetric(true)
+                       .with_relaxation_factor(1.0f)
+                       .on(this->exec)
+                       ->generate(this->mtx);
+
+    testing::StaticAssertTypeEq<decltype(sor_pre),
+                                std::unique_ptr<composition_type>>();
+    const auto& ops = sor_pre->get_operators();
+    ASSERT_EQ(ops.size(), 2);
+    GKO_ASSERT_DYNAMIC_TYPE(ops[0], u_trs_type);
+    GKO_ASSERT_DYNAMIC_TYPE(ops[1], l_trs_type);
+    auto result_u = gko::as<u_trs_type>(ops[0])->get_system_matrix();
+    auto result_l = gko::as<l_trs_type>(ops[1])->get_system_matrix();
+    GKO_ASSERT_MTX_NEAR(result_l, this->expected_l, 0.0);
+    auto expected_u = gko::clone(this->expected_u);
+    expected_u->inv_scale(gko::initialize<gko::matrix::Dense<value_type>>(
+        {this->diag_value}, this->exec));
+    GKO_ASSERT_MTX_NEAR(result_u, expected_u, r<value_type>::value);
+}

From e3848bbba3268a643f983234bf1d7b6f4010b012 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 16:51:24 +0200
Subject: [PATCH 220/448] [prec] add sor parsing

---
 core/config/config_helper.hpp              |  1 +
 core/config/preconditioner_config.cpp      |  2 +
 core/config/registry.cpp                   |  1 +
 core/preconditioner/sor.cpp                | 34 +++++++++++++
 core/test/config/preconditioner.cpp        | 55 +++++++++++++++++++++-
 include/ginkgo/core/preconditioner/sor.hpp |  6 +++
 6 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
index 555bb75c2a8..7ddfe35b99a 100644
--- a/core/config/config_helper.hpp
+++ b/core/config/config_helper.hpp
@@ -64,6 +64,7 @@ enum class LinOpFactoryType : int {
     Ilu,
     Isai,
     Jacobi,
+    Sor,
     Multigrid,
     Pgm,
     Schwarz
diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index cba54cb3356..baf780360e4 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
@@ -294,6 +295,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
 
 
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Jacobi, gko::preconditioner::Jacobi);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor);
 
 
 }  // namespace config
diff --git a/core/config/registry.cpp b/core/config/registry.cpp
index 188c34b35dd..afba48297ba 100644
--- a/core/config/registry.cpp
+++ b/core/config/registry.cpp
@@ -44,6 +44,7 @@ configuration_map generate_config_map()
             {"preconditioner::Ilu", parse<LinOpFactoryType::Ilu>},
             {"preconditioner::Isai", parse<LinOpFactoryType::Isai>},
             {"preconditioner::Jacobi", parse<LinOpFactoryType::Jacobi>},
+            {"preconditioner::Sor", parse<LinOpFactoryType::Sor>},
             {"solver::Multigrid", parse<LinOpFactoryType::Multigrid>},
             {"multigrid::Pgm", parse<LinOpFactoryType::Pgm>},
 #if GINKGO_BUILD_MPI
diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp
index 30a2539a0cc..0ff534a268c 100644
--- a/core/preconditioner/sor.cpp
+++ b/core/preconditioner/sor.cpp
@@ -12,6 +12,7 @@
 
 #include "core/base/array_access.hpp"
 #include "core/base/utils.hpp"
+#include "core/config/config_helper.hpp"
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/preconditioner/sor_kernels.hpp"
@@ -32,6 +33,39 @@ GKO_REGISTER_OPERATION(initialize_weighted_l_u, sor::initialize_weighted_l_u);
 }  // namespace
 
 
+template <typename ValueType, typename IndexType>
+typename Sor<ValueType, IndexType>::parameters_type
+Sor<ValueType, IndexType>::parse(const config::pnode& config,
+                                 const config::registry& context,
+                                 const config::type_descriptor& td_for_child)
+{
+    auto params = Sor::build();
+
+    if (auto& obj = config.get("skip_sorting")) {
+        params.with_skip_sorting(config::get_value<bool>(obj));
+    }
+    if (auto& obj = config.get("symmetric")) {
+        params.with_symmetric(config::get_value<bool>(obj));
+    }
+    if (auto& obj = config.get("relaxation_factor")) {
+        params.with_relaxation_factor(
+            config::get_value<remove_complex<ValueType>>(obj));
+    }
+    if (auto& obj = config.get("l_solver")) {
+        params.with_l_solver(
+            gko::config::parse_or_get_factory<const LinOpFactory>(
+                obj, context, td_for_child));
+    }
+    if (auto& obj = config.get("u_solver")) {
+        params.with_u_solver(
+            gko::config::parse_or_get_factory<const LinOpFactory>(
+                obj, context, td_for_child));
+    }
+
+    return params;
+}
+
+
 template <typename ValueType, typename IndexType>
 std::unique_ptr<typename Sor<ValueType, IndexType>::composition_type>
 Sor<ValueType, IndexType>::generate(
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp
index 9e81e690967..410e8d74297 100644
--- a/core/test/config/preconditioner.cpp
+++ b/core/test/config/preconditioner.cpp
@@ -15,6 +15,7 @@
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
 #include <ginkgo/core/solver/gmres.hpp>
 #include <ginkgo/core/solver/ir.hpp>
 #include <ginkgo/core/solver/triangular.hpp>
@@ -300,6 +301,55 @@ struct Jacobi
 };
 
 
+struct Sor
+    : PreconditionerConfigTest<::gko::preconditioner::Sor<float, gko::int32>,
+                               ::gko::preconditioner::Sor<double, gko::int32>> {
+    using Ir = gko::solver::Ir<float>;
+
+    static pnode::map_type setup_base()
+    {
+        return {{"type", pnode{"preconditioner::Sor"}}};
+    }
+
+    static void change_template(pnode::map_type& config_map)
+    {
+        config_map["value_type"] = pnode{"float32"};
+    }
+
+    template <bool from_reg, typename ParamType>
+    static void set(pnode::map_type& config_map, ParamType& param, registry reg,
+                    std::shared_ptr<const gko::Executor> exec)
+    {
+        config_map["skip_sorting"] = pnode{true};
+        param.with_skip_sorting(true);
+        config_map["symmetric"] = pnode{true};
+        param.with_symmetric(true);
+        config_map["relaxation_factor"] = pnode{0.8};
+        // float can be cast to double without issues
+        param.with_relaxation_factor(0.8f);
+        config_map["l_solver"] = pnode{
+            {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
+        param.with_l_solver(DummyIr::build());
+        config_map["u_solver"] = pnode{
+            {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
+        param.with_u_solver(DummyIr::build());
+    }
+
+    template <bool from_reg, typename AnswerType>
+    static void validate(gko::LinOpFactory* result, AnswerType* answer)
+    {
+        auto res_param = gko::as<AnswerType>(result)->get_parameters();
+        auto ans_param = answer->get_parameters();
+
+        ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
+        ASSERT_EQ(res_param.symmetric, ans_param.symmetric);
+        ASSERT_EQ(res_param.relaxation_factor, ans_param.relaxation_factor);
+        ASSERT_EQ(typeid(res_param.l_solver), typeid(ans_param.l_solver));
+        ASSERT_EQ(typeid(res_param.u_solver), typeid(ans_param.u_solver));
+    }
+};
+
+
 #if GINKGO_BUILD_MPI
 
 
@@ -395,11 +445,12 @@ class Preconditioner : public ::testing::Test {
 };
 
 
-using PreconditionerTypes = ::testing::Types<
+using PreconditionerTypes =
+    ::testing::Types<
 #if GINKGO_BUILD_MPI
     ::Schwarz,
 #endif  // GINKGO_BUILD_MPI
-    ::Ic, ::Ilu, ::Isai, ::Jacobi>;
+    ::Ic, ::Ilu, ::Isai, ::Jacobi, ::Sor>;
 
 
 TYPED_TEST_SUITE(Preconditioner, PreconditionerTypes, TypenameNameGenerator);
diff --git a/include/ginkgo/core/preconditioner/sor.hpp b/include/ginkgo/core/preconditioner/sor.hpp
index 941f012039d..276d718dacb 100644
--- a/include/ginkgo/core/preconditioner/sor.hpp
+++ b/include/ginkgo/core/preconditioner/sor.hpp
@@ -12,6 +12,7 @@
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/config/config.hpp>
 
 
 namespace gko {
@@ -103,6 +104,11 @@ class Sor
     /** Creates a new parameter_type to set up the factory. */
     static parameters_type build() { return {}; }
 
+    static parameters_type parse(
+        const config::pnode& config, const config::registry& context,
+        const config::type_descriptor& td_for_child =
+            config::make_type_descriptor<ValueType, IndexType>());
+
 protected:
     explicit Sor(std::shared_ptr<const Executor> exec,
                  const parameters_type& params = {})

From fbdf417fffd181b13eae1725373e5cf7c559de53 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 28 Jun 2024 14:14:33 +0200
Subject: [PATCH 221/448] =?UTF-8?q?[prec]=20implement=20Gau=C3=9F-Seidel?=
 =?UTF-8?q?=20preconditioner?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 core/CMakeLists.txt                           |   3 +-
 core/preconditioner/gauss_seidel.cpp          |  47 +++++++
 core/test/preconditioner/CMakeLists.txt       |   1 +
 core/test/preconditioner/gauss_seidel.cpp     |  55 ++++++++
 .../core/preconditioner/gauss_seidel.hpp      | 103 +++++++++++++++
 include/ginkgo/ginkgo.hpp                     |   1 +
 reference/test/preconditioner/CMakeLists.txt  |   1 +
 .../test/preconditioner/gauss_seidel.cpp      | 120 ++++++++++++++++++
 8 files changed, 330 insertions(+), 1 deletion(-)
 create mode 100644 core/preconditioner/gauss_seidel.cpp
 create mode 100644 core/test/preconditioner/gauss_seidel.cpp
 create mode 100644 include/ginkgo/core/preconditioner/gauss_seidel.hpp
 create mode 100644 reference/test/preconditioner/gauss_seidel.cpp

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index ef07359e8b4..6e0960459e0 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -88,7 +88,8 @@ target_sources(${ginkgo_core}
     multigrid/pgm.cpp
     multigrid/fixed_coarsening.cpp
     preconditioner/batch_jacobi.cpp
-        preconditioner/sor.cpp
+    preconditioner/gauss_seidel.cpp
+    preconditioner/sor.cpp
     preconditioner/ic.cpp
     preconditioner/ilu.cpp
     preconditioner/isai.cpp
diff --git a/core/preconditioner/gauss_seidel.cpp b/core/preconditioner/gauss_seidel.cpp
new file mode 100644
index 00000000000..20aef490f05
--- /dev/null
+++ b/core/preconditioner/gauss_seidel.cpp
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+
+
+namespace gko {
+namespace preconditioner {
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<typename GaussSeidel<ValueType, IndexType>::composition_type>
+GaussSeidel<ValueType, IndexType>::generate(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    auto product =
+        std::unique_ptr<composition_type>(static_cast<composition_type*>(
+            this->LinOpFactory::generate(std::move(system_matrix)).release()));
+    return product;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> GaussSeidel<ValueType, IndexType>::generate_impl(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    return Sor<ValueType, IndexType>::build()
+        .with_skip_sorting(parameters_.skip_sorting)
+        .with_symmetric(parameters_.symmetric)
+        .with_relaxation_factor(static_cast<remove_complex<ValueType>>(1.0))
+        .with_l_solver(parameters_.l_solver)
+        .with_u_solver(parameters_.u_solver)
+        .on(this->get_executor())
+        ->generate(std::move(system_matrix));
+}
+
+
+#define GKO_DECLARE_GAUSS_SEIDEL(ValueType, IndexType) \
+    class GaussSeidel<ValueType, IndexType>
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GAUSS_SEIDEL);
+
+
+}  // namespace preconditioner
+}  // namespace gko
diff --git a/core/test/preconditioner/CMakeLists.txt b/core/test/preconditioner/CMakeLists.txt
index 87996a79e32..bb13a60dbe7 100644
--- a/core/test/preconditioner/CMakeLists.txt
+++ b/core/test/preconditioner/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(batch_jacobi)
+ginkgo_create_test(gauss_seidel)
 ginkgo_create_test(ic)
 ginkgo_create_test(ilu)
 ginkgo_create_test(isai)
diff --git a/core/test/preconditioner/gauss_seidel.cpp b/core/test/preconditioner/gauss_seidel.cpp
new file mode 100644
index 00000000000..9a2f965db79
--- /dev/null
+++ b/core/test/preconditioner/gauss_seidel.cpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+#include "core/test/utils.hpp"
+
+
+class GaussSeidelFactory : public ::testing::Test {
+public:
+    using GaussSeidel_type = gko::preconditioner::GaussSeidel<double, int>;
+    using l_isai_type = gko::preconditioner::LowerIsai<double, int>;
+    using u_isai_type = gko::preconditioner::UpperIsai<double, int>;
+
+    std::shared_ptr<gko::ReferenceExecutor> exec =
+        gko::ReferenceExecutor::create();
+};
+
+
+TEST_F(GaussSeidelFactory, CanDefaultBuild)
+{
+    auto factory = GaussSeidel_type::build().on(exec);
+
+    auto params = factory->get_parameters();
+    ASSERT_EQ(params.skip_sorting, false);
+    ASSERT_EQ(params.symmetric, false);
+    ASSERT_EQ(params.l_solver, nullptr);
+    ASSERT_EQ(params.u_solver, nullptr);
+}
+
+
+TEST_F(GaussSeidelFactory, CanBuildWithParameters)
+{
+    auto factory = GaussSeidel_type::build()
+                       .with_skip_sorting(true)
+                       .with_symmetric(true)
+                       .with_l_solver(l_isai_type::build())
+                       .with_u_solver(u_isai_type::build())
+                       .on(exec);
+
+    auto params = factory->get_parameters();
+    ASSERT_EQ(params.skip_sorting, true);
+    ASSERT_EQ(params.symmetric, true);
+    ASSERT_NE(params.l_solver, nullptr);
+    GKO_ASSERT_DYNAMIC_TYPE(params.l_solver, l_isai_type::Factory);
+    ASSERT_NE(params.u_solver, nullptr);
+    GKO_ASSERT_DYNAMIC_TYPE(params.u_solver, u_isai_type::Factory);
+}
diff --git a/include/ginkgo/core/preconditioner/gauss_seidel.hpp b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
new file mode 100644
index 00000000000..1e482a93819
--- /dev/null
+++ b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
@@ -0,0 +1,103 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_PRECONDITIONER_GAUSS_SEIDEL_HPP_
+#define GKO_PUBLIC_CORE_PRECONDITIONER_GAUSS_SEIDEL_HPP_
+
+
+#include <vector>
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+
+
+namespace gko {
+namespace preconditioner {
+
+
+/**
+ * This class generates the Gauss-Seidel preconditioner.
+ *
+ * This is the special case of $\omega = 1$ of the (S)SOR preconditioner.
+ *
+ * @see Sor
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class GaussSeidel
+    : public EnablePolymorphicObject<GaussSeidel<ValueType, IndexType>,
+                                     LinOpFactory>,
+      public EnablePolymorphicAssignment<GaussSeidel<ValueType, IndexType>> {
+    friend class EnablePolymorphicObject<GaussSeidel, LinOpFactory>;
+
+public:
+    struct parameters_type;
+    friend class enable_parameters_type<parameters_type, GaussSeidel>;
+
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using composition_type = Composition<ValueType>;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, GaussSeidel> {
+        // skip sorting of input matrix
+        bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
+
+        // determines if Gauss-Seidel or symmetric Gauss-Seidel should be used
+        bool GKO_FACTORY_PARAMETER_SCALAR(symmetric, false);
+
+        // factory for the lower triangular factor solver
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            l_solver);
+
+        // factory for the upper triangular factor solver, unused if symmetric
+        // is false
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            u_solver);
+    };
+
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
+    /**
+     * @copydoc get_parameters
+     */
+    const parameters_type& get_parameters() const { return parameters_; }
+
+    /**
+     * @copydoc LinOpFactory::generate
+     * @note This function overrides the default LinOpFactory::generate to
+     *       return a Composition instead of a generic LinOp, which would need
+     *       to be cast to Composition again to access its operators.
+     *       It is only necessary because smart pointers aren't covariant.
+     */
+    std::unique_ptr<composition_type> generate(
+        std::shared_ptr<const LinOp> system_matrix) const;
+
+    /** Creates a new parameter_type to set up the factory. */
+    static parameters_type build() { return {}; }
+
+protected:
+    explicit GaussSeidel(std::shared_ptr<const Executor> exec,
+                         const parameters_type& params = {})
+        : EnablePolymorphicObject<GaussSeidel, LinOpFactory>(exec),
+          parameters_(params)
+    {}
+
+    std::unique_ptr<LinOp> generate_impl(
+        std::shared_ptr<const LinOp> system_matrix) const override;
+
+private:
+    parameters_type parameters_;
+};
+}  // namespace preconditioner
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_PRECONDITIONER_GAUSS_SEIDEL_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index c44cdee2485..61e5b719508 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -114,6 +114,7 @@
 #include <ginkgo/core/multigrid/pgm.hpp>
 
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
 #include <ginkgo/core/preconditioner/ic.hpp>
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
diff --git a/reference/test/preconditioner/CMakeLists.txt b/reference/test/preconditioner/CMakeLists.txt
index 09c88608d65..f558aa87495 100644
--- a/reference/test/preconditioner/CMakeLists.txt
+++ b/reference/test/preconditioner/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(batch_jacobi_kernels)
+ginkgo_create_test(gauss_seidel)
 ginkgo_create_test(ilu)
 ginkgo_create_test(ic)
 ginkgo_create_test(isai_kernels)
diff --git a/reference/test/preconditioner/gauss_seidel.cpp b/reference/test/preconditioner/gauss_seidel.cpp
new file mode 100644
index 00000000000..2b67b665d77
--- /dev/null
+++ b/reference/test/preconditioner/gauss_seidel.cpp
@@ -0,0 +1,120 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
+#include <ginkgo/core/preconditioner/sor.hpp>
+#include <ginkgo/core/solver/triangular.hpp>
+
+#include "core/test/utils.hpp"
+#include "core/utils/matrix_utils.hpp"
+
+
+template <typename ValueIndexType>
+class GaussSeidel : public ::testing::Test {
+public:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using csr_type = gko::matrix::Csr<value_type, index_type>;
+    using gs_type = gko::preconditioner::GaussSeidel<value_type, index_type>;
+    using sor_type = gko::preconditioner::Sor<value_type, index_type>;
+    using ltrs_type = gko::solver::LowerTrs<value_type, index_type>;
+    using utrs_type = gko::solver::UpperTrs<value_type, index_type>;
+
+    GaussSeidel()
+    {
+        auto data =
+            gko::test::generate_random_matrix_data<value_type, index_type>(
+                10, 10, std::uniform_int_distribution<>(2, 6),
+                std::uniform_real_distribution<>(1, 2), engine);
+        gko::utils::make_symmetric(data);
+        gko::utils::make_unit_diagonal(data);
+        mtx->read(data);
+    }
+
+    std::default_random_engine engine;
+    std::shared_ptr<gko::ReferenceExecutor> exec =
+        gko::ReferenceExecutor::create();
+    std::shared_ptr<csr_type> mtx = csr_type::create(exec);
+};
+
+TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypes,
+                 PairTypenameNameGenerator);
+
+
+TYPED_TEST(GaussSeidel, GenerateSameAsSor)
+{
+    using real_type = gko::remove_complex<typename TestFixture::value_type>;
+    using gs_type = typename TestFixture::gs_type;
+    using sor_type = typename TestFixture::sor_type;
+    using composition_type = typename sor_type::composition_type;
+    using csr_type = typename TestFixture::csr_type;
+    using ltrs_type = typename TestFixture::ltrs_type;
+
+    auto gs = gs_type ::build().on(this->exec)->generate(this->mtx);
+    auto sor = sor_type::build()
+                   .with_relaxation_factor(real_type{1.0})
+                   .on(this->exec)
+                   ->generate(this->mtx);
+
+    auto gs_comp = dynamic_cast<composition_type*>(gs.get());
+    auto sor_comp = dynamic_cast<composition_type*>(sor.get());
+    ASSERT_TRUE(gs_comp);
+    ASSERT_TRUE(sor_comp);
+    ASSERT_EQ(gs_comp->get_operators().size(),
+              sor_comp->get_operators().size());
+    GKO_ASSERT_MTX_NEAR(
+        dynamic_cast<const ltrs_type*>(gs_comp->get_operators()[0].get())
+            ->get_system_matrix(),
+        dynamic_cast<const ltrs_type*>(sor_comp->get_operators()[0].get())
+            ->get_system_matrix(),
+        0.0);
+}
+
+TYPED_TEST(GaussSeidel, GenerateSymmetricSameAsSor)
+{
+    using real_type = gko::remove_complex<typename TestFixture::value_type>;
+    using gs_type = typename TestFixture::gs_type;
+    using sor_type = typename TestFixture::sor_type;
+    using composition_type = typename sor_type::composition_type;
+    using ltrs_type = typename TestFixture::ltrs_type;
+    using utrs_type = typename TestFixture::utrs_type;
+
+    auto gs = gs_type ::build()
+                  .with_symmetric(true)
+                  .on(this->exec)
+                  ->generate(this->mtx);
+    auto sor = sor_type::build()
+                   .with_symmetric(true)
+                   .with_relaxation_factor(real_type{1.0})
+                   .on(this->exec)
+                   ->generate(this->mtx);
+
+    auto gs_comp = dynamic_cast<composition_type*>(gs.get());
+    auto sor_comp = dynamic_cast<composition_type*>(sor.get());
+    ASSERT_TRUE(gs_comp);
+    ASSERT_TRUE(sor_comp);
+    ASSERT_EQ(gs_comp->get_operators().size(),
+              sor_comp->get_operators().size());
+    GKO_ASSERT_MTX_NEAR(
+        dynamic_cast<const utrs_type*>(gs_comp->get_operators()[0].get())
+            ->get_system_matrix(),
+        dynamic_cast<const utrs_type*>(sor_comp->get_operators()[0].get())
+            ->get_system_matrix(),
+        0.0);
+    GKO_ASSERT_MTX_NEAR(
+        dynamic_cast<const ltrs_type*>(gs_comp->get_operators()[1].get())
+            ->get_system_matrix(),
+        dynamic_cast<const ltrs_type*>(sor_comp->get_operators()[1].get())
+            ->get_system_matrix(),
+        0.0);
+}

From 741ca81c4e211516c21f18fbcffb732407ecc786 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 29 Oct 2024 17:51:34 +0000
Subject: [PATCH 222/448] [prec] add gauss seidel parsing

---
 core/config/config_helper.hpp                 |  1 +
 core/config/preconditioner_config.cpp         |  2 +
 core/config/registry.cpp                      |  2 +
 core/preconditioner/gauss_seidel.cpp          | 31 +++++++++++
 core/test/config/preconditioner.cpp           | 52 +++++++++++++++++--
 .../core/preconditioner/gauss_seidel.hpp      |  8 +++
 6 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
index 7ddfe35b99a..483366765aa 100644
--- a/core/config/config_helper.hpp
+++ b/core/config/config_helper.hpp
@@ -60,6 +60,7 @@ enum class LinOpFactoryType : int {
     ParIct,
     ParIlu,
     ParIlut,
+    GaussSeidel,
     Ic,
     Ilu,
     Isai,
diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index baf780360e4..68cbf8595ba 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -5,6 +5,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
 #include <ginkgo/core/preconditioner/ic.hpp>
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
@@ -294,6 +295,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
 }
 
 
+GKO_PARSE_VALUE_AND_INDEX_TYPE(GaussSeidel, gko::preconditioner::GaussSeidel);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Jacobi, gko::preconditioner::Jacobi);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor);
 
diff --git a/core/config/registry.cpp b/core/config/registry.cpp
index afba48297ba..19da3ed2559 100644
--- a/core/config/registry.cpp
+++ b/core/config/registry.cpp
@@ -40,6 +40,8 @@ configuration_map generate_config_map()
             {"factorization::ParIct", parse<LinOpFactoryType::ParIct>},
             {"factorization::ParIlu", parse<LinOpFactoryType::ParIlu>},
             {"factorization::ParIlut", parse<LinOpFactoryType::ParIlut>},
+            {"preconditioner::GaussSeidel",
+             parse<LinOpFactoryType::GaussSeidel>},
             {"preconditioner::Ic", parse<LinOpFactoryType::Ic>},
             {"preconditioner::Ilu", parse<LinOpFactoryType::Ilu>},
             {"preconditioner::Isai", parse<LinOpFactoryType::Isai>},
diff --git a/core/preconditioner/gauss_seidel.cpp b/core/preconditioner/gauss_seidel.cpp
index 20aef490f05..aec7a4ff827 100644
--- a/core/preconditioner/gauss_seidel.cpp
+++ b/core/preconditioner/gauss_seidel.cpp
@@ -5,11 +5,42 @@
 #include <ginkgo/core/preconditioner/gauss_seidel.hpp>
 #include <ginkgo/core/preconditioner/sor.hpp>
 
+#include "core/config/config_helper.hpp"
+
 
 namespace gko {
 namespace preconditioner {
 
 
+template <typename ValueType, typename IndexType>
+typename GaussSeidel<ValueType, IndexType>::parameters_type
+GaussSeidel<ValueType, IndexType>::parse(
+    const config::pnode& config, const config::registry& context,
+    const config::type_descriptor& td_for_child)
+{
+    auto params = GaussSeidel::build();
+
+    if (auto& obj = config.get("skip_sorting")) {
+        params.with_skip_sorting(config::get_value<bool>(obj));
+    }
+    if (auto& obj = config.get("symmetric")) {
+        params.with_symmetric(config::get_value<bool>(obj));
+    }
+    if (auto& obj = config.get("l_solver")) {
+        params.with_l_solver(
+            gko::config::parse_or_get_factory<const LinOpFactory>(
+                obj, context, td_for_child));
+    }
+    if (auto& obj = config.get("u_solver")) {
+        params.with_u_solver(
+            gko::config::parse_or_get_factory<const LinOpFactory>(
+                obj, context, td_for_child));
+    }
+
+    return params;
+}
+
+
 template <typename ValueType, typename IndexType>
 std::unique_ptr<typename GaussSeidel<ValueType, IndexType>::composition_type>
 GaussSeidel<ValueType, IndexType>::generate(
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp
index 410e8d74297..c3941f504e2 100644
--- a/core/test/config/preconditioner.cpp
+++ b/core/test/config/preconditioner.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/preconditioner/gauss_seidel.hpp>
 #include <ginkgo/core/preconditioner/ic.hpp>
 #include <ginkgo/core/preconditioner/ilu.hpp>
 #include <ginkgo/core/preconditioner/isai.hpp>
@@ -350,6 +351,52 @@ struct Sor
 };
 
 
+struct GaussSeidel
+    : PreconditionerConfigTest<
+          ::gko::preconditioner::GaussSeidel<float, gko::int32>,
+          ::gko::preconditioner::GaussSeidel<double, gko::int32>> {
+    using Ir = gko::solver::Ir<float>;
+
+    static pnode::map_type setup_base()
+    {
+        return {{"type", pnode{"preconditioner::GaussSeidel"}}};
+    }
+
+    static void change_template(pnode::map_type& config_map)
+    {
+        config_map["value_type"] = pnode{"float32"};
+    }
+
+    template <bool from_reg, typename ParamType>
+    static void set(pnode::map_type& config_map, ParamType& param, registry reg,
+                    std::shared_ptr<const gko::Executor> exec)
+    {
+        config_map["skip_sorting"] = pnode{true};
+        param.with_skip_sorting(true);
+        config_map["symmetric"] = pnode{true};
+        param.with_symmetric(true);
+        config_map["l_solver"] = pnode{
+            {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
+        param.with_l_solver(DummyIr::build());
+        config_map["u_solver"] = pnode{
+            {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
+        param.with_u_solver(DummyIr::build());
+    }
+
+    template <bool from_reg, typename AnswerType>
+    static void validate(gko::LinOpFactory* result, AnswerType* answer)
+    {
+        auto res_param = gko::as<AnswerType>(result)->get_parameters();
+        auto ans_param = answer->get_parameters();
+
+        ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
+        ASSERT_EQ(res_param.symmetric, ans_param.symmetric);
+        ASSERT_EQ(typeid(res_param.l_solver), typeid(ans_param.l_solver));
+        ASSERT_EQ(typeid(res_param.u_solver), typeid(ans_param.u_solver));
+    }
+};
+
+
 #if GINKGO_BUILD_MPI
 
 
@@ -445,12 +492,11 @@ class Preconditioner : public ::testing::Test {
 };
 
 
-using PreconditionerTypes =
-    ::testing::Types<
+using PreconditionerTypes = ::testing::Types<
 #if GINKGO_BUILD_MPI
     ::Schwarz,
 #endif  // GINKGO_BUILD_MPI
-    ::Ic, ::Ilu, ::Isai, ::Jacobi, ::Sor>;
+    ::GaussSeidel, ::Ic, ::Ilu, ::Isai, ::Jacobi, ::Sor>;
 
 
 TYPED_TEST_SUITE(Preconditioner, PreconditionerTypes, TypenameNameGenerator);
diff --git a/include/ginkgo/core/preconditioner/gauss_seidel.hpp b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
index 1e482a93819..7003dd54740 100644
--- a/include/ginkgo/core/preconditioner/gauss_seidel.hpp
+++ b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
@@ -12,6 +12,7 @@
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/config/config.hpp>
 
 
 namespace gko {
@@ -83,6 +84,11 @@ class GaussSeidel
     /** Creates a new parameter_type to set up the factory. */
     static parameters_type build() { return {}; }
 
+    static parameters_type parse(
+        const config::pnode& config, const config::registry& context,
+        const config::type_descriptor& td_for_child =
+            config::make_type_descriptor<ValueType, IndexType>());
+
 protected:
     explicit GaussSeidel(std::shared_ptr<const Executor> exec,
                          const parameters_type& params = {})
@@ -96,6 +102,8 @@ class GaussSeidel
 private:
     parameters_type parameters_;
 };
+
+
 }  // namespace preconditioner
 }  // namespace gko
 

From f8ffb2cc43b05fe7f3aa94c215235692730bf741 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 11 Jul 2024 07:24:09 +0000
Subject: [PATCH 223/448] [sor] review updates:

- documentation
- tests
- don't build upper solver if not symmetric

Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
---
 core/preconditioner/sor.cpp                   |  5 +--
 core/test/config/preconditioner.cpp           |  8 ++---
 .../core/preconditioner/gauss_seidel.hpp      |  7 ++--
 include/ginkgo/core/preconditioner/sor.hpp    |  6 ++--
 reference/CMakeLists.txt                      |  2 +-
 reference/test/preconditioner/sor_kernels.cpp | 32 +++++++++----------
 6 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp
index 0ff534a268c..c9905c5f73c 100644
--- a/core/preconditioner/sor.cpp
+++ b/core/preconditioner/sor.cpp
@@ -96,10 +96,11 @@ std::unique_ptr<LinOp> Sor<ValueType, IndexType>::generate_impl(
 
     auto l_trs_factory =
         parameters_.l_solver ? parameters_.l_solver : LTrs::build().on(exec);
-    auto u_trs_factory =
-        parameters_.u_solver ? parameters_.u_solver : UTrs::build().on(exec);
 
     if (parameters_.symmetric) {
+        auto u_trs_factory = parameters_.u_solver ? parameters_.u_solver
+                                                  : UTrs::build().on(exec);
+
         array<index_type> l_row_ptrs{exec, size[0] + 1};
         array<index_type> u_row_ptrs{exec, size[0] + 1};
         exec->run(make_initialize_row_ptrs_l_u(
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp
index c3941f504e2..c603aaea750 100644
--- a/core/test/config/preconditioner.cpp
+++ b/core/test/config/preconditioner.cpp
@@ -330,10 +330,10 @@ struct Sor
         param.with_relaxation_factor(0.8f);
         config_map["l_solver"] = pnode{
             {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
-        param.with_l_solver(DummyIr::build());
+        param.with_l_solver(Ir::build());
         config_map["u_solver"] = pnode{
             {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
-        param.with_u_solver(DummyIr::build());
+        param.with_u_solver(Ir::build());
     }
 
     template <bool from_reg, typename AnswerType>
@@ -377,10 +377,10 @@ struct GaussSeidel
         param.with_symmetric(true);
         config_map["l_solver"] = pnode{
             {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
-        param.with_l_solver(DummyIr::build());
+        param.with_l_solver(Ir::build());
         config_map["u_solver"] = pnode{
             {{"type", pnode{"solver::Ir"}}, {"value_type", pnode{"float32"}}}};
-        param.with_u_solver(DummyIr::build());
+        param.with_u_solver(Ir::build());
     }
 
     template <bool from_reg, typename AnswerType>
diff --git a/include/ginkgo/core/preconditioner/gauss_seidel.hpp b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
index 7003dd54740..75668e652a7 100644
--- a/include/ginkgo/core/preconditioner/gauss_seidel.hpp
+++ b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
@@ -22,7 +22,8 @@ namespace preconditioner {
 /**
  * This class generates the Gauss-Seidel preconditioner.
  *
- * This is the special case of $\omega = 1$ of the (S)SOR preconditioner.
+ * This is the special case of the relaxation factor $\omega = 1$ of the (S)SOR
+ * preconditioner.
  *
  * @see Sor
  */
@@ -49,12 +50,12 @@ class GaussSeidel
         // determines if Gauss-Seidel or symmetric Gauss-Seidel should be used
         bool GKO_FACTORY_PARAMETER_SCALAR(symmetric, false);
 
-        // factory for the lower triangular factor solver
+        // factory for the lower triangular factor solver, defaults to LowerTrs
         std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
             l_solver);
 
         // factory for the upper triangular factor solver, unused if symmetric
-        // is false
+        // is false, defaults to UpperTrs
         std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
             u_solver);
     };
diff --git a/include/ginkgo/core/preconditioner/sor.hpp b/include/ginkgo/core/preconditioner/sor.hpp
index 276d718dacb..531dded79f2 100644
--- a/include/ginkgo/core/preconditioner/sor.hpp
+++ b/include/ginkgo/core/preconditioner/sor.hpp
@@ -36,6 +36,8 @@ namespace preconditioner {
  * M = \frac{1}{\omega (2 - \omega)} (D + \omega L) D^{-1} (D + \omega U) ,
  * \quad 0 < \omega < 2.
  * $$
+ * A detailed description can be found in Iterative Methods for Sparse Linear
+ * Systems (Y. Saad) ch. 4.1.
  *
  * This class is a factory, which will only generate the preconditioner. The
  * resulting LinOp will represent the application of $M^{-1}$.
@@ -69,12 +71,12 @@ class Sor
         remove_complex<value_type> GKO_FACTORY_PARAMETER_SCALAR(
             relaxation_factor, remove_complex<value_type>(1.2));
 
-        // factory for the lower triangular factor solver
+        // factory for the lower triangular factor solver, defaults to LowerTrs
         std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
             l_solver);
 
         // factory for the upper triangular factor solver, unused if symmetric
-        // is false
+        // is false, defaults to UpperTrs
         std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
             u_solver);
     };
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index ab6c210518b..e2f27dab57e 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -43,7 +43,7 @@ target_sources(ginkgo_reference
     matrix/sparsity_csr_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/batch_jacobi_kernels.cpp
-        preconditioner/sor_kernels.cpp
+    preconditioner/sor_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
     reorder/rcm_kernels.cpp
diff --git a/reference/test/preconditioner/sor_kernels.cpp b/reference/test/preconditioner/sor_kernels.cpp
index f2bf5f186f9..18c055aa6d9 100644
--- a/reference/test/preconditioner/sor_kernels.cpp
+++ b/reference/test/preconditioner/sor_kernels.cpp
@@ -80,15 +80,15 @@ TYPED_TEST(Sor, CanInitializeLFactorWithWeight)
     result->scale(
         gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
     std::shared_ptr<csr_type> expected_l =
-        gko::initialize<csr_type>({{1, 0, 0, 0, 0},
-                                   {-2, 1, 0, 0, 0},
-                                   {0, -5, 1, 0, 0},
-                                   {-3, 0, 0, 1, 0},
-                                   {-4, 0, -6, -7, 1}},
+        gko::initialize<csr_type>({{2 * this->diag_value, 0, 0, 0, 0},
+                                   {-2, 2 * this->diag_value, 0, 0, 0},
+                                   {0, -5, 2 * this->diag_value, 0, 0},
+                                   {-3, 0, 0, 2 * this->diag_value, 0},
+                                   {-4, 0, -6, -7, 2 * this->diag_value}},
                                   this->exec);
 
     gko::kernels::reference::sor::initialize_weighted_l(
-        this->exec, this->mtx.get(), this->diag_value, result.get());
+        this->exec, this->mtx.get(), 0.5f, result.get());
 
     GKO_ASSERT_MTX_NEAR(result, expected_l, r<value_type>::value);
 }
@@ -122,15 +122,16 @@ TYPED_TEST(Sor, CanInitializeLAndUFactorWithWeight)
         gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
     result_u->scale(
         gko::initialize<gko::matrix::Dense<value_type>>({0.0}, this->exec));
-    auto diag_weight = static_cast<gko::remove_complex<value_type>>(
-        1.0 / (2 - this->diag_value));
-    auto off_diag_weight = this->diag_value * diag_weight;
+    auto factor = static_cast<gko::remove_complex<value_type>>(0.5);
+    auto diag_weight =
+        static_cast<gko::remove_complex<value_type>>(1.0 / (2 - factor));
+    auto off_diag_weight = factor * diag_weight;
     std::shared_ptr<csr_type> expected_l =
-        gko::initialize<csr_type>({{1, 0, 0, 0, 0},
-                                   {-2, 1, 0, 0, 0},
-                                   {0, -5, 1, 0, 0},
-                                   {-3, 0, 0, 1, 0},
-                                   {-4, 0, -6, -7, 1}},
+        gko::initialize<csr_type>({{2 * this->diag_value, 0, 0, 0, 0},
+                                   {-2, 2 * this->diag_value, 0, 0, 0},
+                                   {0, -5, 2 * this->diag_value, 0, 0},
+                                   {-3, 0, 0, 2 * this->diag_value, 0},
+                                   {-4, 0, -6, -7, 2 * this->diag_value}},
                                   this->exec);
     std::shared_ptr<csr_type> expected_u = gko::initialize<csr_type>(
         {{this->diag_value * diag_weight, 2 * off_diag_weight, 0,
@@ -142,8 +143,7 @@ TYPED_TEST(Sor, CanInitializeLAndUFactorWithWeight)
         this->exec);
 
     gko::kernels::reference::sor::initialize_weighted_l_u(
-        this->exec, this->mtx.get(), this->diag_value, result_l.get(),
-        result_u.get());
+        this->exec, this->mtx.get(), factor, result_l.get(), result_u.get());
 
     GKO_ASSERT_MTX_NEAR(result_l, expected_l, r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(result_u, expected_u, r<value_type>::value);

From c8a7dde7155c9c354455cdb78501362c27445a32 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 9 Jul 2024 10:35:01 +0200
Subject: [PATCH 224/448] [prec] split up common impl of sor

---
 common/cuda_hip/CMakeLists.txt                |  1 +
 .../preconditioner/sor_kernels.cpp            |  3 --
 common/unified/CMakeLists.txt                 |  1 -
 dpcpp/CMakeLists.txt                          |  1 +
 dpcpp/preconditioner/sor_kernels.dp.cpp       | 42 +++++++++++++++++++
 omp/CMakeLists.txt                            |  1 +
 omp/preconditioner/sor_kernels.cpp            | 41 ++++++++++++++++++
 7 files changed, 86 insertions(+), 4 deletions(-)
 rename common/{unified => cuda_hip}/preconditioner/sor_kernels.cpp (96%)
 create mode 100644 dpcpp/preconditioner/sor_kernels.dp.cpp
 create mode 100644 omp/preconditioner/sor_kernels.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index f5a28596d16..267444d2144 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -38,6 +38,7 @@ set(CUDA_HIP_SOURCES
     preconditioner/jacobi_advanced_apply_kernels.cpp
     preconditioner/jacobi_generate_kernels.cpp
     preconditioner/jacobi_simple_apply_kernels.cpp
+    preconditioner/sor_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
diff --git a/common/unified/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
similarity index 96%
rename from common/unified/preconditioner/sor_kernels.cpp
rename to common/cuda_hip/preconditioner/sor_kernels.cpp
index 8932c1df562..dcf6f68c5c7 100644
--- a/common/unified/preconditioner/sor_kernels.cpp
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -7,9 +7,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
-#include "common/unified/base/kernel_launch.hpp"
-
-
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
index 132e04c5d9a..00bc21df0c6 100644
--- a/common/unified/CMakeLists.txt
+++ b/common/unified/CMakeLists.txt
@@ -19,7 +19,6 @@ set(UNIFIED_SOURCES
     matrix/diagonal_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/jacobi_kernels.cpp
-    preconditioner/sor_kernels.cpp
     solver/bicg_kernels.cpp
     solver/bicgstab_kernels.cpp
     solver/cg_kernels.cpp
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 851ef9a3dc6..bf65888a6ab 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -56,6 +56,7 @@ target_sources(ginkgo_dpcpp
     preconditioner/jacobi_generate_kernel.dp.cpp
     preconditioner/jacobi_kernels.dp.cpp
     preconditioner/jacobi_simple_apply_kernel.dp.cpp
+    preconditioner/sor_kernels.dp.cpp
     reorder/rcm_kernels.dp.cpp
     solver/batch_bicgstab_kernels.dp.cpp
     solver/batch_cg_kernels.dp.cpp
diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
new file mode 100644
index 00000000000..ab91a67b999
--- /dev/null
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight,
+    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 41bec80673f..fef0702048f 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -40,6 +40,7 @@ target_sources(ginkgo_omp
     preconditioner/batch_jacobi_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
+    preconditioner/sor_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/batch_bicgstab_kernels.cpp
     solver/batch_cg_kernels.cpp
diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..275ba2117f9
--- /dev/null
+++ b/omp/preconditioner/sor_kernels.cpp
@@ -0,0 +1,41 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight,
+    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko

From 198d2941042a03d1e366d22250dcd6e93b5329a7 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 27 Jun 2024 09:49:37 +0200
Subject: [PATCH 225/448] [prec] implement omp sor kernels

---
 omp/factorization/factorization_helpers.hpp | 114 ++++++++++++++++++++
 omp/factorization/factorization_kernels.cpp |  99 ++++-------------
 omp/preconditioner/sor_kernels.cpp          |  32 +++++-
 3 files changed, 162 insertions(+), 83 deletions(-)
 create mode 100644 omp/factorization/factorization_helpers.hpp

diff --git a/omp/factorization/factorization_helpers.hpp b/omp/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..f1eed7d4d37
--- /dev/null
+++ b/omp/factorization/factorization_helpers.hpp
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+// Splits system_matrix into csr_l (entries left of the diagonal + diagonal)
+// and csr_u (diagonal + entries right of it); values go through the closures.
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+void initialize_l_u(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                    matrix::Csr<ValueType, IndexType>* csr_l,
+                    matrix::Csr<ValueType, IndexType>* csr_u,
+                    LClosure&& l_closure, UClosure&& u_closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
+    auto col_idxs_u = csr_u->get_col_idxs();
+    auto vals_u = csr_u->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        size_type current_index_u =
+            row_ptrs_u[row] + 1;  // first U slot is kept for the diagonal
+        // a row without stored diagonal passes one<ValueType>() to map_diag
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = l_closure.map_off_diag(val);
+                ++current_index_l;
+            } else if (col == row) {
+                // written after the loop, once the whole row was scanned
+                diag_val = val;
+            } else {  // col > row
+                col_idxs_u[current_index_u] = col;
+                vals_u[current_index_u] = u_closure.map_off_diag(val);
+                ++current_index_u;
+            }
+        }
+        // diagonal: last entry of the L row, first entry of the U row
+        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
+        size_type u_diag_idx = row_ptrs_u[row];
+        col_idxs_l[l_diag_idx] = row;
+        col_idxs_u[u_diag_idx] = row;
+        vals_l[l_diag_idx] = l_closure.map_diag(diag_val);
+        vals_u[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+// Fills csr_l with the lower triangle of system_matrix: the diagonal goes
+// through closure.map_diag, all entries left of it through map_off_diag.
+template <typename ValueType, typename IndexType, typename Closure>
+void initialize_l(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                  matrix::Csr<ValueType, IndexType>* csr_l, Closure&& closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        // a row without stored diagonal passes one<ValueType>() to map_diag
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = closure.map_off_diag(val);
+                ++current_index_l;
+            } else if (col == row) {
+                diag_val = val;  // written after the loop
+            }
+        }
+        // the diagonal is the last entry of the L row
+        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
+        col_idxs_l[l_diag_idx] = row;
+        vals_l[l_diag_idx] = closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
index f4b41cbdac5..e7b66f6f887 100644
--- a/omp/factorization/factorization_kernels.cpp
+++ b/omp/factorization/factorization_kernels.cpp
@@ -12,6 +12,7 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "omp/factorization/factorization_helpers.hpp"
 
 
 namespace gko {
@@ -224,49 +225,12 @@ void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_l,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
-    auto col_idxs_u = csr_u->get_col_idxs();
-    auto vals_u = csr_u->get_values();
-
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        size_type current_index_u =
-            row_ptrs_u[row] + 1;  // we treat the diagonal separately
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save value for later
-                diag_val = val;
-            } else {  // col > row
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            }
-        }
-        // store diagonal entries
-        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
-        size_type u_diag_idx = row_ptrs_u[row];
-        col_idxs_l[l_diag_idx] = row;
-        col_idxs_u[u_diag_idx] = row;
-        vals_l[l_diag_idx] = one<ValueType>();
-        vals_u[u_diag_idx] = diag_val;
-    }
+    helpers::initialize_l_u(
+        system_matrix, csr_l, csr_u,
+        helpers::triangular_mtx_closure([](auto) { return one<ValueType>(); },
+                                        helpers::identity{}),
+        helpers::triangular_mtx_closure(helpers::identity{},
+                                        helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -309,43 +273,18 @@ void initialize_l(std::shared_ptr<const OmpExecutor> exec,
                   const matrix::Csr<ValueType, IndexType>* system_matrix,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save value for later
-                diag_val = val;
-            }
-        }
-        // store diagonal entries
-        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
-        col_idxs_l[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (diag_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        vals_l[l_diag_idx] = diag_val;
-    }
+    helpers::initialize_l(system_matrix, csr_l,
+                          helpers::triangular_mtx_closure(
+                              [diag_sqrt](auto val) {
+                                  if (diag_sqrt) {
+                                      val = sqrt(val);
+                                      if (!is_finite(val)) {
+                                          val = one<ValueType>();
+                                      }
+                                  }
+                                  return val;
+                              },
+                              helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp
index 275ba2117f9..509946ac15a 100644
--- a/omp/preconditioner/sor_kernels.cpp
+++ b/omp/preconditioner/sor_kernels.cpp
@@ -7,6 +7,8 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "omp/factorization/factorization_helpers.hpp"
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -17,8 +19,15 @@ template <typename ValueType, typename IndexType>
 void initialize_weighted_l(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
-    remove_complex<ValueType> weight,
-    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    auto inv_weight = one(weight) / weight;  // diag is scaled by 1 / weight
+    factorization::helpers::initialize_l(
+        system_matrix, l_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },  // diag
+            [](auto val) { return val; }));                       // off-diag
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
@@ -29,7 +38,24 @@ void initialize_weighted_l_u(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
     remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
-    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+    factorization::helpers::initialize_l_u(
+        system_matrix, l_mtx, u_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },
+            [](auto val) { return val; }),
+        factorization::helpers::triangular_mtx_closure(
+            [inv_two_minus_weight](auto val) {
+                return val * inv_two_minus_weight;
+            },
+            [weight, inv_two_minus_weight](auto val) {
+                return val * weight * inv_two_minus_weight;
+            }));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);

From 86a430fad147146684d7bd3ad53b6235db8a04f6 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 27 Jun 2024 14:10:04 +0200
Subject: [PATCH 226/448] [prec] implement sycl sor kernels

---
 .../factorization_helpers.dp.hpp              | 110 +++++++++++++++++
 .../factorization_kernels.dp.cpp              | 114 ++++--------------
 dpcpp/preconditioner/sor_kernels.dp.cpp       |  64 +++++++++-
 3 files changed, 194 insertions(+), 94 deletions(-)
 create mode 100644 dpcpp/factorization/factorization_helpers.dp.hpp

diff --git a/dpcpp/factorization/factorization_helpers.dp.hpp b/dpcpp/factorization/factorization_helpers.dp.hpp
new file mode 100644
index 00000000000..9779e134e77
--- /dev/null
+++ b/dpcpp/factorization/factorization_helpers.dp.hpp
@@ -0,0 +1,110 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <CL/sycl.hpp>
+
+#include "core/factorization/factorization_helpers.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace factorization {
+namespace helpers {
+
+using namespace ::gko::factorization;
+
+
+// Per-row kernel body: the flat thread id is used as the row index. Entries
+// left/right of the diagonal go to L/U, the diagonal is stored in both.
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
+                    const IndexType* __restrict__ col_idxs,
+                    const ValueType* __restrict__ values,
+                    const IndexType* __restrict__ l_row_ptrs,
+                    IndexType* __restrict__ l_col_idxs,
+                    ValueType* __restrict__ l_values,
+                    const IndexType* __restrict__ u_row_ptrs,
+                    IndexType* __restrict__ u_col_idxs,
+                    ValueType* __restrict__ u_values, LClosure l_closure,
+                    UClosure u_closure, sycl::nd_item<3> item_ct1)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
+        auto diag_val = one<ValueType>();  // default if no diagonal stored
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            if (col == row) {
+                diag_val = val;  // save diagonal entry for later
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+            if (row < col) {
+                u_col_idxs[u_idx] = col;
+                u_values[u_idx] = u_closure.map_off_diag(val);
+                ++u_idx;
+            }
+        }
+        // diagonal: last entry of the L row, first entry of the U row
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        auto u_diag_idx = u_row_ptrs[row];
+        l_col_idxs[l_diag_idx] = row;
+        u_col_idxs[u_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+        u_values[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+// Per-row kernel body: the flat thread id is used as the row index. Entries
+// left of the diagonal and the (mapped) diagonal are written to L.
+template <typename ValueType, typename IndexType, typename LClosure>
+void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
+                  const IndexType* __restrict__ col_idxs,
+                  const ValueType* __restrict__ values,
+                  const IndexType* __restrict__ l_row_ptrs,
+                  IndexType* __restrict__ l_col_idxs,
+                  ValueType* __restrict__ l_values, LClosure l_closure,
+                  sycl::nd_item<3> item_ct1)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto diag_val = one<ValueType>();  // default if no diagonal stored
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            if (col == row) {
+                diag_val = val;  // save diagonal entry for later
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+        }
+        // the diagonal is the last entry of the L row
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        l_col_idxs[l_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 1d9912b4f12..885fe481609 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -18,6 +18,7 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/searching.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/factorization/factorization_helpers.dp.hpp"
 
 
 namespace gko {
@@ -320,51 +321,6 @@ void count_nnz_per_l_u_row(dim3 grid, dim3 block,
 }
 
 
-template <typename ValueType, typename IndexType>
-void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
-                    const IndexType* __restrict__ col_idxs,
-                    const ValueType* __restrict__ values,
-                    const IndexType* __restrict__ l_row_ptrs,
-                    IndexType* __restrict__ l_col_idxs,
-                    ValueType* __restrict__ l_values,
-                    const IndexType* __restrict__ u_row_ptrs,
-                    IndexType* __restrict__ u_col_idxs,
-                    ValueType* __restrict__ u_values, sycl::nd_item<3> item_ct1)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        // default diagonal to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-            if (row < col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
-    }
-}
-
 template <typename ValueType, typename IndexType>
 void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                     sycl::queue* queue, size_type num_rows,
@@ -376,9 +332,14 @@ void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->parallel_for(
         sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-            initialize_l_u(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
-                           l_col_idxs, l_values, u_row_ptrs, u_col_idxs,
-                           u_values, item_ct1);
+            helpers::initialize_l_u(
+                num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs,
+                l_values, u_row_ptrs, u_col_idxs, u_values,
+                helpers::triangular_mtx_closure(
+                    [](auto) { return one<ValueType>(); }, helpers::identity{}),
+                helpers::triangular_mtx_closure(helpers::identity{},
+                                                helpers::identity{}),
+                item_ct1);
         });
 }
 
@@ -418,47 +379,6 @@ void count_nnz_per_l_row(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 }
 
 
-template <typename ValueType, typename IndexType>
-void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
-                  const IndexType* __restrict__ col_idxs,
-                  const ValueType* __restrict__ values,
-                  const IndexType* __restrict__ l_row_ptrs,
-                  IndexType* __restrict__ l_col_idxs,
-                  ValueType* __restrict__ l_values, bool use_sqrt,
-                  sycl::nd_item<3> item_ct1)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        // if there was no diagonal entry, default to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        l_col_idxs[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (use_sqrt) {
-            diag_val = std::sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        l_values[l_diag_idx] = diag_val;
-    }
-}
-
 template <typename ValueType, typename IndexType>
 void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                   sycl::queue* queue, size_type num_rows,
@@ -468,8 +388,20 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->parallel_for(
         sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-            initialize_l(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
-                         l_col_idxs, l_values, use_sqrt, item_ct1);
+            helpers::initialize_l(num_rows, row_ptrs, col_idxs, values,
+                                  l_row_ptrs, l_col_idxs, l_values,
+                                  helpers::triangular_mtx_closure(
+                                      [use_sqrt](auto val) {
+                                          if (use_sqrt) {
+                                              val = sqrt(val);
+                                              if (!is_finite(val)) {
+                                                  val = one<ValueType>();
+                                              }
+                                          }
+                                          return val;
+                                      },
+                                      helpers::identity{}),
+                                  item_ct1);
         });
 }
 
diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
index ab91a67b999..fe796586591 100644
--- a/dpcpp/preconditioner/sor_kernels.dp.cpp
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -7,6 +7,7 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "dpcpp/factorization/factorization_helpers.dp.hpp"
 
 namespace gko {
 namespace kernels {
@@ -14,12 +15,44 @@ namespace dpcpp {
 namespace sor {
 
 
+// block size used for the kernel launches in this file
+constexpr int default_block_size{256};
+
+
 template <typename ValueType, typename IndexType>
 void initialize_weighted_l(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
-    remove_complex<ValueType> weight,
-    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+
+    auto inv_weight = one(weight) / weight;
+
+    // Read the raw array pointers on the host. Capturing system_matrix or
+    // l_mtx in the kernel lambda would dereference them from device code.
+    const auto in_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto in_col_idxs = system_matrix->get_const_col_idxs();
+    const auto in_vals = system_matrix->get_const_values();
+    const auto l_row_ptrs = l_mtx->get_const_row_ptrs();
+    const auto l_col_idxs = l_mtx->get_col_idxs();
+    const auto l_vals = l_mtx->get_values();
+
+    exec->get_queue()->parallel_for(
+        sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
+            factorization::helpers::initialize_l(
+                num_rows, in_row_ptrs, in_col_idxs, in_vals, l_row_ptrs,
+                l_col_idxs, l_vals,
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight](auto val) { return val * inv_weight; },
+                    factorization::helpers::identity{}),
+                item_ct1);
+        });
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
@@ -30,7 +63,48 @@ void initialize_weighted_l_u(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
     remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
-    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+
+    // Read the raw array pointers on the host. Capturing the matrix pointers
+    // in the kernel lambda would dereference them from device code.
+    const auto in_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto in_col_idxs = system_matrix->get_const_col_idxs();
+    const auto in_vals = system_matrix->get_const_values();
+    const auto l_row_ptrs = l_mtx->get_const_row_ptrs();
+    const auto l_col_idxs = l_mtx->get_col_idxs();
+    const auto l_vals = l_mtx->get_values();
+    const auto u_row_ptrs = u_mtx->get_const_row_ptrs();
+    const auto u_col_idxs = u_mtx->get_col_idxs();
+    const auto u_vals = u_mtx->get_values();
+
+    exec->get_queue()->parallel_for(
+        sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
+            factorization::helpers::initialize_l_u(
+                num_rows, in_row_ptrs, in_col_idxs, in_vals, l_row_ptrs,
+                l_col_idxs, l_vals, u_row_ptrs, u_col_idxs, u_vals,
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight](auto val) { return val * inv_weight; },
+                    factorization::helpers::identity{}),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_two_minus_weight](auto val) {
+                        return val * inv_two_minus_weight;
+                    },
+                    [weight, inv_two_minus_weight](auto val) {
+                        return val * weight * inv_two_minus_weight;
+                    }),
+                item_ct1);
+        });
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);

From fc0b9e76afb5a45f009f00aaef3dbac691b7c3a1 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 9 Jul 2024 12:54:09 +0200
Subject: [PATCH 227/448] [prec] implement cuda/hip sor kernels

---
 .../factorization/factorization_helpers.hpp   | 147 +++++++++++++++
 .../factorization/factorization_kernels.cpp   | 128 ++++--------------
 .../cuda_hip/preconditioner/sor_kernels.cpp   |  64 ++++++++-
 3 files changed, 238 insertions(+), 101 deletions(-)
 create mode 100644 common/cuda_hip/factorization/factorization_helpers.hpp

diff --git a/common/cuda_hip/factorization/factorization_helpers.hpp b/common/cuda_hip/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..87248740867
--- /dev/null
+++ b/common/cuda_hip/factorization/factorization_helpers.hpp
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_FACTORIZATION_HELPERS_HPP_
+#define GKO_COMMON_CUDA_HIP_FACTORIZATION_FACTORIZATION_HELPERS_HPP_
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+// thread block size used as launch bound for the kernels in this header
+constexpr int default_block_size{512};
+
+
+/**
+ * Copies the strictly lower and strictly upper triangular entries of a CSR
+ * matrix into the factors L and U and adds one diagonal entry to each row of
+ * both factors. Every thread processes a single row.
+ *
+ * The row pointers of L and U are only read, so they have to be set up
+ * before the launch. The diagonal of L is written to the last position of
+ * its row, the diagonal of U to the first position of its row. Rows without
+ * a stored diagonal entry use one as input for the diagonal mapping.
+ *
+ * @param l_closure  its map_off_diag / map_diag members transform the
+ *                   off-diagonal / diagonal values before storing them in L
+ * @param u_closure  its map_off_diag / map_diag members transform the
+ *                   off-diagonal / diagonal values before storing them in U
+ */
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l_u(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    const IndexType* __restrict__ u_row_ptrs,
+    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values,
+    LClosure l_closure, UClosure u_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
+        // default diagonal to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+            if (row < col) {
+                u_col_idxs[u_idx] = col;
+                u_values[u_idx] = u_closure.map_off_diag(val);
+                ++u_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        auto u_diag_idx = u_row_ptrs[row];
+        l_col_idxs[l_diag_idx] = row;
+        u_col_idxs[u_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+        u_values[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+/**
+ * Copies the strictly lower triangular entries of a CSR matrix into the
+ * factor L and writes one diagonal entry to the last position of each row.
+ * Every thread processes a single row.
+ *
+ * The row pointers of L are only read, so they have to be set up before the
+ * launch. Rows without a stored diagonal entry use one as input for the
+ * diagonal mapping.
+ *
+ * @param l_closure  its map_off_diag / map_diag members transform the
+ *                   off-diagonal / diagonal values before storing them in L
+ */
+template <typename ValueType, typename IndexType, typename LClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    LClosure l_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        // if there was no diagonal entry, default to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        l_col_idxs[l_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_FACTORIZATION_HELPERS_HPP_
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index 3a38175ab70..e790cf19540 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -13,6 +13,7 @@
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/searching.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/factorization_helpers.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -255,51 +256,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l_u(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    const IndexType* __restrict__ u_row_ptrs,
-    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        // default diagonal to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-            if (row < col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
-    }
-}
-
-
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
     size_type num_rows, const IndexType* __restrict__ row_ptrs,
@@ -320,48 +276,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    bool use_sqrt)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        // if there was no diagonal entry, default to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        l_col_idxs[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (use_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        l_values[l_diag_idx] = diag_val;
-    }
-}
-
-
 }  // namespace kernel
 
 
@@ -481,18 +395,25 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
-            num_rows, system_matrix->get_const_row_ptrs(),
-            system_matrix->get_const_col_idxs(),
-            as_device_type(system_matrix->get_const_values()),
-            csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), csr_u->get_const_row_ptrs(),
-            csr_u->get_col_idxs(), as_device_type(csr_u->get_values()));
+        helpers::
+            initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
+                as_device_type(csr_l->get_values()),
+                csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(),
+                as_device_type(csr_u->get_values()),
+                helpers::triangular_mtx_closure(
+                    [] __device__(auto val) { return one(val); },
+                    helpers::identity{}),
+                helpers::triangular_mtx_closure(helpers::identity{},
+                                                helpers::identity{}));
     }
 }
 
@@ -534,17 +455,28 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+        helpers::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
             num_rows, system_matrix->get_const_row_ptrs(),
             system_matrix->get_const_col_idxs(),
             as_device_type(system_matrix->get_const_values()),
             csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), diag_sqrt);
+            as_device_type(csr_l->get_values()),
+            helpers::triangular_mtx_closure(
+                [diag_sqrt] __device__(auto val) {
+                    if (diag_sqrt) {
+                        val = sqrt(val);
+                        if (!is_finite(val)) {
+                            val = one(val);
+                        }
+                    }
+                    return val;
+                },
+                helpers::identity{}));
     }
 }
 
diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
index dcf6f68c5c7..a415953915f 100644
--- a/common/cuda_hip/preconditioner/sor_kernels.cpp
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -7,6 +7,8 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "common/cuda_hip/factorization/factorization_helpers.hpp"
+
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
@@ -17,8 +19,30 @@ template <typename ValueType, typename IndexType>
 void initialize_weighted_l(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
-    remove_complex<ValueType> weight,
-    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto block_size = factorization::helpers::default_block_size;
+    const auto grid_dim = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(block_size)));
+
+    auto inv_weight = one(weight) / weight;
+
+    if (grid_dim > 0) {
+        factorization::helpers::
+            initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(),
+                as_device_type(l_mtx->get_values()),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight] __device__(auto val) {
+                        return val * inv_weight;
+                    },
+                    factorization::helpers::identity{}));
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
@@ -29,7 +53,41 @@ void initialize_weighted_l_u(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* system_matrix,
     remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
-    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto block_size = factorization::helpers::default_block_size;
+    const auto grid_dim = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(block_size)));
+
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+
+    if (grid_dim > 0) {
+        factorization::helpers::
+            initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(),
+                as_device_type(l_mtx->get_values()),
+                u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
+                as_device_type(u_mtx->get_values()),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight] __device__(auto val) {
+                        return val * inv_weight;
+                    },
+                    factorization::helpers::identity{}),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_two_minus_weight] __device__(auto val) {
+                        return val * inv_two_minus_weight;
+                    },
+                    [weight, inv_two_minus_weight] __device__(auto val) {
+                        return val * weight * inv_two_minus_weight;
+                    }));
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);

From 8d72712e37095ecd615dbee4554fb2e137d77c55 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 27 Jun 2024 14:05:31 +0200
Subject: [PATCH 228/448] [prec] add sor device test

---
 test/preconditioner/CMakeLists.txt  |  1 +
 test/preconditioner/sor_kernels.cpp | 89 +++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 test/preconditioner/sor_kernels.cpp

diff --git a/test/preconditioner/CMakeLists.txt b/test/preconditioner/CMakeLists.txt
index b41897efaac..46696e29549 100644
--- a/test/preconditioner/CMakeLists.txt
+++ b/test/preconditioner/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_common_test(batch_jacobi_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(jacobi_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(isai_kernels)
+ginkgo_create_common_test(sor_kernels)
diff --git a/test/preconditioner/sor_kernels.cpp b/test/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..cd12514bb28
--- /dev/null
+++ b/test/preconditioner/sor_kernels.cpp
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "core/test/utils.hpp"
+#include "core/utils/matrix_utils.hpp"
+#include "test/utils/common_fixture.hpp"
+#include "test/utils/executor.hpp"
+
+
+class Sor : public CommonTestFixture {
+protected:
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+
+    Sor()
+    {
+        gko::size_type n = 133;
+        index_type row_limit = 15;
+        auto nz_dist = std::uniform_int_distribution<index_type>(1, row_limit);
+        auto val_dist = std::uniform_real_distribution<value_type>(-1., 1.);
+        auto md =
+            gko::test::generate_random_matrix_data<value_type, index_type>(
+                n, n, nz_dist, val_dist, rand_engine);
+        auto md_l = md;
+        auto md_u = md;
+        // make_upper/lower_triangular also removes the diagonal, so it is
+        // added back with make_unit_diagonal
+        gko::utils::make_lower_triangular(md_l);
+        gko::utils::make_unit_diagonal(md_l);
+        gko::utils::make_upper_triangular(md_u);
+        gko::utils::make_unit_diagonal(md_u);
+
+        mtx->read(md);
+        d_mtx->read(md);
+
+        result_l->read(md_l);
+        result_l->scale(gko::initialize<Dense>({0.0}, ref));
+        d_result_l = gko::clone(exec, result_l);
+
+        result_u->read(md_u);
+        result_u->scale(gko::initialize<Dense>({0.0}, ref));
+        d_result_u = gko::clone(exec, result_u);
+    }
+
+    std::default_random_engine rand_engine{42};
+
+    std::unique_ptr<Csr> mtx = Csr::create(ref);
+    std::unique_ptr<Csr> d_mtx = Csr::create(exec);
+
+    std::unique_ptr<Csr> result_l = Csr::create(ref);
+    std::unique_ptr<Csr> d_result_l = Csr::create(exec);
+    std::unique_ptr<Csr> result_u = Csr::create(ref);
+    std::unique_ptr<Csr> d_result_u = Csr::create(exec);
+};
+
+
+TEST_F(Sor, InitializeWeightedLFactorIsSameAsReference)
+{
+    gko::kernels::reference::sor::initialize_weighted_l(ref, mtx.get(), 1.24,
+                                                        result_l.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l(
+        exec, d_mtx.get(), 1.24, d_result_l.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
+}
+
+
+TEST_F(Sor, InitializeWeightedLAndUFactorIsSameAsReference)
+{
+    gko::kernels::reference::sor::initialize_weighted_l_u(
+        ref, mtx.get(), 1.24, result_l.get(), result_u.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l_u(
+        exec, d_mtx.get(), 1.24, d_result_l.get(), d_result_u.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(result_u, d_result_u, r<value_type>::value);
+}

From 1e121fc93f1eddc270832b84cd120747dfded14d Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 27 Jun 2024 14:06:47 +0200
Subject: [PATCH 229/448] [prec] add sor to benchmarks

---
 benchmark/utils/preconditioners.hpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp
index 63fd22708e6..ea8594e7446 100644
--- a/benchmark/utils/preconditioners.hpp
+++ b/benchmark/utils/preconditioners.hpp
@@ -22,7 +22,7 @@ DEFINE_string(preconditioners, "none",
               "A comma-separated list of preconditioners to use. "
               "Supported values are: none, jacobi, paric, parict, parilu, "
               "parilut, ic, ilu, paric-isai, parict-isai, parilu-isai, "
-              "parilut-isai, ic-isai, ilu-isai, overhead");
+              "parilut-isai, ic-isai, ilu-isai, sor, overhead");
 
 DEFINE_uint32(parilu_iterations, 5,
               "The number of iterations for ParIC(T)/ParILU(T)");
@@ -49,6 +49,12 @@ DEFINE_double(jacobi_accuracy, 1e-1,
 DEFINE_uint32(jacobi_max_block_size, 32,
               "Maximal block size of the block-Jacobi preconditioner");
 
+DEFINE_double(sor_relaxation_factor, 1.0,
+              "The relaxation factor for the SOR preconditioner");
+
+DEFINE_bool(sor_symmetric, false,
+            "Apply the SOR preconditioner symmetrically, i.e. use SSOR");
+
 
 // parses the Jacobi storage optimization command line argument
 gko::precision_reduction parse_storage_optimization(const std::string& flag)
@@ -292,6 +298,15 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                  .with_sparsity_power(FLAGS_isai_power)
                  .on(exec);
          }},
+        {"sor",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             return gko::preconditioner::Sor<etype, itype>::build()
+                 .with_relaxation_factor(
+                     static_cast<gko::remove_complex<etype>>(
+                         FLAGS_sor_relaxation_factor))
+                 .with_symmetric(FLAGS_sor_symmetric)
+                 .on(exec);
+         }},
         {"overhead", [](std::shared_ptr<const gko::Executor> exec) {
              return gko::Overhead<etype>::build()
                  .with_criteria(gko::stop::ResidualNorm<etype>::build()

From 092d481b7698e754d0811a198dfcf2295d7d846e Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 30 Oct 2024 13:29:23 +0100
Subject: [PATCH 230/448] [test] add missing factorization tests

---
 test/factorization/CMakeLists.txt            |  1 +
 test/factorization/factorization_kernels.cpp | 78 ++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 test/factorization/factorization_kernels.cpp

diff --git a/test/factorization/CMakeLists.txt b/test/factorization/CMakeLists.txt
index e768a48ef05..8b5aa51287b 100644
--- a/test/factorization/CMakeLists.txt
+++ b/test/factorization/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_common_test(cholesky_kernels DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_test(factorization_kernels)
 ginkgo_create_common_test(lu_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp omp)
 ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp omp)
diff --git a/test/factorization/factorization_kernels.cpp b/test/factorization/factorization_kernels.cpp
new file mode 100644
index 00000000000..7887d83e0f7
--- /dev/null
+++ b/test/factorization/factorization_kernels.cpp
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/factorization_kernels.hpp"
+
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <core/utils/matrix_utils.hpp>
+
+#include <ginkgo/core/base/executor.hpp>
+
+#include "core/test/utils.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
+#include "test/utils/common_fixture.hpp"
+
+// Fixture: random 52x52 Csr `mtx` made with ref; `dmtx` = clone(exec, mtx).
+class Factorization : public CommonTestFixture {
+protected:
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    Factorization()
+    {
+        mtx = gko::test::generate_random_matrix<Csr>(
+            52, 52, std::uniform_int_distribution<>(4, 40),
+            std::uniform_real_distribution<>(1, 2), rand_engine, ref);
+        gko::utils::ensure_all_diagonal_entries(mtx.get());
+        dmtx = gko::clone(exec, mtx);
+    }
+
+    std::default_random_engine rand_engine{6794};  // constant seed
+    std::shared_ptr<Csr> mtx;                      // built with ref
+    std::shared_ptr<Csr> dmtx;                     // gko::clone(exec, mtx)
+};
+
+// initialize_row_ptrs_l: reference and device-namespace kernels must agree.
+TEST_F(Factorization, InitializeRowPtrsLSameAsRef)
+{
+    gko::array<index_type> l_ptrs{ref, mtx->get_size()[0] + 1};
+    gko::array<index_type> dl_ptrs{exec, mtx->get_size()[0] + 1};
+
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        ref, mtx.get(), l_ptrs.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_row_ptrs_l(
+        exec, dmtx.get(), dl_ptrs.get_data());
+
+    GKO_ASSERT_ARRAY_EQ(l_ptrs, dl_ptrs);
+}
+
+// initialize_l vs. reference; despite the name, diag_sqrt is false AND true.
+TEST_F(Factorization, InitializeLWithoutSqrtSameAsRef)
+{
+    gko::array<index_type> l_ptrs{ref, mtx->get_size()[0] + 1};
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        ref, mtx.get(), l_ptrs.get_data());
+    auto nnz =  // last row pointer, used below as the array length
+        static_cast<gko::size_type>(l_ptrs.get_data()[mtx->get_size()[0]]);
+    auto l_mtx =
+        Csr::create(ref, mtx->get_size(), gko::array<value_type>(ref, nnz),
+                    gko::array<index_type>(ref, nnz), l_ptrs);
+    auto dl_mtx = gko::clone(exec, l_mtx);
+
+    for (auto diag_sqrt : {false, true}) {
+        SCOPED_TRACE("diag_sqrt: " + std::to_string(diag_sqrt));  // "0" or "1"
+
+        gko::kernels::reference::factorization::initialize_l(
+            ref, mtx.get(), l_mtx.get(), diag_sqrt);
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l(
+            exec, dmtx.get(), dl_mtx.get(), diag_sqrt);
+
+        GKO_ASSERT_MTX_NEAR(l_mtx, dl_mtx, 0.0);
+    }
+}

From 25073488259440079f6072605a24061990d9bd1f Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 28 Oct 2024 10:43:12 +0100
Subject: [PATCH 231/448] [cuda] remove pre cuda 11 code

Co-authored-by: Tobias Ribizel <mail@ribizel.de>
---
 benchmark/utils/cuda_linops.cpp               | 370 --------------
 common/cuda_hip/components/atomic.hpp         |  31 +-
 cuda/base/cusparse_bindings.hpp               | 467 +-----------------
 cuda/base/types.hpp                           |  18 -
 cuda/components/cooperative_groups.cuh        | 126 -----
 .../identify_stream_usage.cpp                 |   5 -
 6 files changed, 14 insertions(+), 1003 deletions(-)

diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index 4683d6086e1..961b055135b 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -87,237 +87,6 @@ class CusparseBase : public gko::LinOp {
 };
 
 
-#if CUDA_VERSION < 11000
-
-
-template <typename ValueType = gko::default_precision,
-          typename IndexType = gko::int32>
-class CusparseCsrmp
-    : public gko::EnableLinOp<CusparseCsrmp<ValueType, IndexType>,
-                              CusparseBase>,
-      public gko::ReadableFromMatrixData<ValueType, IndexType>,
-      public gko::EnableCreateMethod<CusparseCsrmp<ValueType, IndexType>> {
-    friend class gko::EnableCreateMethod<CusparseCsrmp>;
-    friend class gko::EnablePolymorphicObject<CusparseCsrmp, CusparseBase>;
-
-public:
-    using csr = gko::matrix::Csr<ValueType, IndexType>;
-    using mat_data = gko::matrix_data<ValueType, IndexType>;
-    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
-
-    void read(const device_mat_data& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(device_mat_data&& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(const mat_data& data) override
-    {
-        csr_->read(data);
-        this->set_size(csr_->get_size());
-    }
-
-    gko::size_type get_num_stored_elements() const noexcept
-    {
-        return csr_->get_num_stored_elements();
-    }
-
-protected:
-    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
-    {
-        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
-        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
-        auto db = dense_b->get_const_values();
-        auto dx = dense_x->get_values();
-
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        gko::kernels::cuda::cusparse::spmv_mp(
-            this->get_gpu_exec()->get_sparselib_handle(), trans_,
-            this->get_size()[0], this->get_size()[1],
-            csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
-            this->get_descr(), csr_->get_const_values(),
-            csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db,
-            &scalars.get_const_data()[1], dx);
-    }
-
-    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
-                    const gko::LinOp* beta,
-                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
-
-    CusparseCsrmp(std::shared_ptr<const gko::Executor> exec,
-                  const gko::dim<2>& size = gko::dim<2>{})
-        : gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
-          csr_(std::move(
-              csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
-    {}
-
-private:
-    // Contains {alpha, beta}
-    gko::array<ValueType> scalars{
-        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
-    std::shared_ptr<csr> csr_;
-    cusparseOperation_t trans_;
-};
-
-
-template <typename ValueType = gko::default_precision,
-          typename IndexType = gko::int32>
-class CusparseCsr
-    : public gko::EnableLinOp<CusparseCsr<ValueType, IndexType>, CusparseBase>,
-      public gko::EnableCreateMethod<CusparseCsr<ValueType, IndexType>>,
-      public gko::ReadableFromMatrixData<ValueType, IndexType> {
-    friend class gko::EnableCreateMethod<CusparseCsr>;
-    friend class gko::EnablePolymorphicObject<CusparseCsr, CusparseBase>;
-
-public:
-    using csr = gko::matrix::Csr<ValueType, IndexType>;
-    using mat_data = gko::matrix_data<ValueType, IndexType>;
-    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
-
-    void read(const device_mat_data& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(device_mat_data&& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(const mat_data& data) override
-    {
-        csr_->read(data);
-        this->set_size(csr_->get_size());
-    }
-
-    gko::size_type get_num_stored_elements() const noexcept
-    {
-        return csr_->get_num_stored_elements();
-    }
-
-protected:
-    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
-    {
-        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
-        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
-        auto db = dense_b->get_const_values();
-        auto dx = dense_x->get_values();
-
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_sparselib_handle(), trans_,
-            this->get_size()[0], this->get_size()[1],
-            csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
-            this->get_descr(), csr_->get_const_values(),
-            csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db,
-            &scalars.get_const_data()[1], dx);
-    }
-
-    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
-                    const gko::LinOp* beta,
-                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
-
-    CusparseCsr(std::shared_ptr<const gko::Executor> exec,
-                const gko::dim<2>& size = gko::dim<2>{})
-        : gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
-          csr_(std::move(
-              csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
-    {}
-
-private:
-    // Contains {alpha, beta}
-    gko::array<ValueType> scalars{
-        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
-    std::shared_ptr<csr> csr_;
-    cusparseOperation_t trans_;
-};
-
-
-template <typename ValueType = gko::default_precision,
-          typename IndexType = gko::int32>
-class CusparseCsrmm
-    : public gko::EnableLinOp<CusparseCsrmm<ValueType, IndexType>,
-                              CusparseBase>,
-      public gko::EnableCreateMethod<CusparseCsrmm<ValueType, IndexType>>,
-      public gko::ReadableFromMatrixData<ValueType, IndexType> {
-    friend class gko::EnableCreateMethod<CusparseCsrmm>;
-    friend class gko::EnablePolymorphicObject<CusparseCsrmm, CusparseBase>;
-
-public:
-    using csr = gko::matrix::Csr<ValueType, IndexType>;
-    using mat_data = gko::matrix_data<ValueType, IndexType>;
-    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
-
-    void read(const device_mat_data& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(device_mat_data&& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(const mat_data& data) override
-    {
-        csr_->read(data);
-        this->set_size(csr_->get_size());
-    }
-
-    gko::size_type get_num_stored_elements() const noexcept
-    {
-        return csr_->get_num_stored_elements();
-    }
-
-protected:
-    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
-    {
-        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
-        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
-        auto db = dense_b->get_const_values();
-        auto dx = dense_x->get_values();
-
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        gko::kernels::cuda::cusparse::spmm(
-            this->get_gpu_exec()->get_sparselib_handle(), trans_,
-            this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
-            csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
-            this->get_descr(), csr_->get_const_values(),
-            csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db,
-            dense_b->get_size()[0], &scalars.get_const_data()[1], dx,
-            dense_x->get_size()[0]);
-    }
-
-    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
-                    const gko::LinOp* beta,
-                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
-
-    CusparseCsrmm(std::shared_ptr<const gko::Executor> exec,
-                  const gko::dim<2>& size = gko::dim<2>{})
-        : gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
-          csr_(std::move(
-              csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
-    {}
-
-private:
-    // Contains {alpha, beta}
-    gko::array<ValueType> scalars{
-        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
-    std::shared_ptr<csr> csr_;
-    cusparseOperation_t trans_;
-};
-
-
-#endif  // CUDA_VERSION < 11000
-
-
 #if CUDA_VERSION < 11021
 
 
@@ -421,112 +190,6 @@ class CusparseCsrEx
 #endif  // CUDA_VERSION < 11021
 
 
-#if CUDA_VERSION < 11000
-
-
-template <typename ValueType = gko::default_precision,
-          typename IndexType = gko::int32,
-          cusparseHybPartition_t Partition = CUSPARSE_HYB_PARTITION_AUTO,
-          int Threshold = 0>
-class CusparseHybrid
-    : public gko::EnableLinOp<
-          CusparseHybrid<ValueType, IndexType, Partition, Threshold>,
-          CusparseBase>,
-      public gko::EnableCreateMethod<
-          CusparseHybrid<ValueType, IndexType, Partition, Threshold>>,
-      public gko::ReadableFromMatrixData<ValueType, IndexType> {
-    friend class gko::EnableCreateMethod<CusparseHybrid>;
-    friend class gko::EnablePolymorphicObject<CusparseHybrid, CusparseBase>;
-
-public:
-    using csr = gko::matrix::Csr<ValueType, IndexType>;
-    using mat_data = gko::matrix_data<ValueType, IndexType>;
-    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
-
-    void read(const device_mat_data& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(device_mat_data&& data) override
-    {
-        this->read(data.copy_to_host());
-    }
-
-    void read(const mat_data& data) override
-    {
-        auto t_csr = csr::create(this->get_executor(),
-                                 std::make_shared<typename csr::classical>());
-        t_csr->read(data);
-        this->set_size(t_csr->get_size());
-
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        gko::kernels::cuda::cusparse::csr2hyb(
-            this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0],
-            this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
-            t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
-            Threshold, Partition);
-    }
-
-    ~CusparseHybrid() override
-    {
-        try {
-            auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-            GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_));
-        } catch (const std::exception& e) {
-            std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: "
-                      << e.what() << std::endl;
-        }
-    }
-
-    CusparseHybrid(const CusparseHybrid& other) = delete;
-
-    CusparseHybrid& operator=(const CusparseHybrid& other) = default;
-
-protected:
-    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
-    {
-        auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
-        auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
-        auto db = dense_b->get_const_values();
-        auto dx = dense_x->get_values();
-
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_sparselib_handle(), trans_,
-            &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
-            &scalars.get_const_data()[1], dx);
-    }
-
-    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
-                    const gko::LinOp* beta,
-                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
-
-    CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
-                   const gko::dim<2>& size = gko::dim<2>{})
-        : gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
-          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
-    {
-        auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
-    }
-
-private:
-    // Contains {alpha, beta}
-    gko::array<ValueType> scalars{
-        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
-    cusparseOperation_t trans_;
-    cusparseHybMat_t hyb_;
-};
-
-
-#endif  // CUDA_VERSION < 11000
-
-
-#if CUDA_VERSION >= 11000 || \
-    ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))
-
-
 template <typename ValueType>
 void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
                            const cusparseSpMatDescr_t mat,
@@ -755,10 +418,6 @@ class CusparseGenericCoo
 };
 
 
-#endif  // CUDA_VERSION >= 11000 || ((CUDA_VERSION >= 10020) &&
-        // !(defined(_WIN32) || defined(__CYGWIN__)))
-
-
 }  // namespace detail
 
 
@@ -769,22 +428,12 @@ IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrex,
 STUB_CREATE_SPARSELIB_LINOP(cusparse_csrex);
 #endif
 
-#if CUDA_VERSION < 11000
-IMPL_CREATE_SPARSELIB_LINOP(cusparse_csr, detail::CusparseCsr<etype, itype>);
-IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrmp,
-                            detail::CusparseCsrmp<etype, itype>);
-IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrmm,
-                            detail::CusparseCsrmm<etype, itype>);
-#else   // CUDA_VERSION >= 11000
 IMPL_CREATE_SPARSELIB_LINOP(cusparse_csr,
                             detail::CusparseGenericCsr<etype, itype>);
 STUB_CREATE_SPARSELIB_LINOP(cusparse_csrmp);
 STUB_CREATE_SPARSELIB_LINOP(cusparse_csrmm);
-#endif  // CUDA_VERSION >= 11000
 
 
-#if CUDA_VERSION >= 11000 || \
-    ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))
 IMPL_CREATE_SPARSELIB_LINOP(cusparse_gcsr,
                             detail::CusparseGenericCsr<etype, itype>);
 #if CUDA_VERSION >= 11021
@@ -796,26 +445,7 @@ IMPL_CREATE_SPARSELIB_LINOP(cusparse_gcsr2,
                             detail::CusparseGenericCsr<etype, itype, csr_algo>);
 IMPL_CREATE_SPARSELIB_LINOP(cusparse_gcoo,
                             detail::CusparseGenericCoo<etype, itype>);
-#else
-STUB_CREATE_SPARSELIB_LINOP(cusparse_gcsr);
-STUB_CREATE_SPARSELIB_LINOP(cusparse_gcsr2);
-STUB_CREATE_SPARSELIB_LINOP(cusparse_gcoo);
-#endif  // CUDA_VERSION < 11000 && ((CUDA_VERSION < 10020) || (defined(_WIN32)
-        // && defined(__CYGWIN__))))
-
-
-#if CUDA_VERSION < 11000
-IMPL_CREATE_SPARSELIB_LINOP(
-    cusparse_coo,
-    detail::CusparseHybrid<etype, itype, CUSPARSE_HYB_PARTITION_USER, 0>);
-IMPL_CREATE_SPARSELIB_LINOP(
-    cusparse_ell,
-    detail::CusparseHybrid<etype, itype, CUSPARSE_HYB_PARTITION_MAX, 0>);
-IMPL_CREATE_SPARSELIB_LINOP(cusparse_hybrid,
-                            detail::CusparseHybrid<etype, itype>);
-#else   // CUDA_VERSION >= 11000
 IMPL_CREATE_SPARSELIB_LINOP(cusparse_coo,
                             detail::CusparseGenericCoo<etype, itype>);
 STUB_CREATE_SPARSELIB_LINOP(cusparse_ell);
 STUB_CREATE_SPARSELIB_LINOP(cusparse_hybrid);
-#endif  // CUDA_VERSION >= 11000
diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index 2fbb1664165..aeb77d48c75 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -101,10 +101,11 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
 
 
-#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010))
-// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
+#if defined(CUDA_VERSION)
+// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
-#endif  // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010))
+#endif  // defined(CUDA_VERSION)
+
 
 #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
 
@@ -142,32 +143,26 @@ GKO_BIND_ATOMIC_ADD(double);
 #else  // NVIDIA
 
 
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
-// CUDA 8.0 starts suppoting 64-bit double atomicAdd on devices of compute
+#if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))
+// CUDA supports 64-bit double atomicAdd on devices of compute
 // capability 6.x and higher
 GKO_BIND_ATOMIC_ADD(double);
-#endif  // !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) ||
-        // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
+#endif  // !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))
 
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
-// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices
+#if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))
+// CUDA supports 16-bit __half floating-point atomicAdd on devices
 // of compute capability 7.x and higher.
 GKO_BIND_ATOMIC_ADD(__half);
-#endif  // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) ||
-        // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+#endif  // !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))
 
-#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
-      (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
-// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on
+#if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))
+// CUDA supports 32-bit __half2 floating-point atomicAdd on
 // devices of compute capability 6.x and higher. note: The atomicity of the
 // __half2 add operation is guaranteed separately for each of the two __half
 // elements; the entire __half2 is not guaranteed to be atomic as a single
 // 32-bit access.
 GKO_BIND_ATOMIC_ADD(__half2);
-#endif  // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) ||
-        // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
+#endif  // !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))
 
 
 #endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index bca0a80a37b..4be00b88aaf 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -57,58 +57,6 @@ template <>
 struct is_supported<std::complex<double>, int32> : std::true_type {};
 
 
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-#define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName)                    \
-    inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA,    \
-                     int32 m, int32 n, int32 nnz, const ValueType* alpha,    \
-                     const cusparseMatDescr_t descrA,                        \
-                     const ValueType* csrValA, const int32* csrRowPtrA,      \
-                     const int32* csrColIndA, const ValueType* x,            \
-                     const ValueType* beta, ValueType* y)                    \
-    {                                                                        \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                          \
-            handle, transA, m, n, nnz, as_culibs_type(alpha), descrA,        \
-            as_culibs_type(csrValA), csrRowPtrA, csrColIndA,                 \
-            as_culibs_type(x), as_culibs_type(beta), as_culibs_type(y)));    \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-#define GKO_BIND_CUSPARSE64_SPMV(ValueType, CusparseName)                      \
-    inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA,      \
-                     int64 m, int64 n, int64 nnz, const ValueType* alpha,      \
-                     const cusparseMatDescr_t descrA,                          \
-                     const ValueType* csrValA, const int64* csrRowPtrA,        \
-                     const int64* csrColIndA, const ValueType* x,              \
-                     const ValueType* beta, ValueType* y) GKO_NOT_IMPLEMENTED; \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE32_SPMV(float, cusparseScsrmv);
-GKO_BIND_CUSPARSE32_SPMV(double, cusparseDcsrmv);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<float>, cusparseCcsrmv);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<double>, cusparseZcsrmv);
-GKO_BIND_CUSPARSE64_SPMV(float, cusparseScsrmv);
-GKO_BIND_CUSPARSE64_SPMV(double, cusparseDcsrmv);
-GKO_BIND_CUSPARSE64_SPMV(std::complex<float>, cusparseCcsrmv);
-GKO_BIND_CUSPARSE64_SPMV(std::complex<double>, cusparseZcsrmv);
-template <typename ValueType>
-GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented);
-template <typename ValueType>
-GKO_BIND_CUSPARSE64_SPMV(ValueType, detail::not_implemented);
-
-
-#undef GKO_BIND_CUSPARSE32_SPMV
-#undef GKO_BIND_CUSPARSE64_SPMV
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 template <typename ValueType>
 inline void spmv_buffersize(cusparseHandle_t handle, cusparseOperation_t opA,
                             const ValueType* alpha,
@@ -164,109 +112,6 @@ inline void spmm(cusparseHandle_t handle, cusparseOperation_t opA,
 }
 
 
-#endif
-
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-#define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName)                    \
-    inline void spmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, \
-                        int32 m, int32 n, int32 nnz, const ValueType* alpha, \
-                        const cusparseMatDescr_t descrA,                     \
-                        const ValueType* csrValA, const int32* csrRowPtrA,   \
-                        const int32* csrColIndA, const ValueType* x,         \
-                        const ValueType* beta, ValueType* y)                 \
-    {                                                                        \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                          \
-            handle, transA, m, n, nnz, as_culibs_type(alpha), descrA,        \
-            as_culibs_type(csrValA), csrRowPtrA, csrColIndA,                 \
-            as_culibs_type(x), as_culibs_type(beta), as_culibs_type(y)));    \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-#define GKO_BIND_CUSPARSE64_SPMV(ValueType, CusparseName)                      \
-    inline void spmv_mp(                                                       \
-        cusparseHandle_t handle, cusparseOperation_t transA, int64 m, int64 n, \
-        int64 nnz, const ValueType* alpha, const cusparseMatDescr_t descrA,    \
-        const ValueType* csrValA, const int64* csrRowPtrA,                     \
-        const int64* csrColIndA, const ValueType* x, const ValueType* beta,    \
-        ValueType* y) GKO_NOT_IMPLEMENTED;                                     \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE32_SPMV(float, cusparseScsrmv_mp);
-GKO_BIND_CUSPARSE32_SPMV(double, cusparseDcsrmv_mp);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<float>, cusparseCcsrmv_mp);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<double>, cusparseZcsrmv_mp);
-GKO_BIND_CUSPARSE64_SPMV(float, cusparseScsrmv_mp);
-GKO_BIND_CUSPARSE64_SPMV(double, cusparseDcsrmv_mp);
-GKO_BIND_CUSPARSE64_SPMV(std::complex<float>, cusparseCcsrmv_mp);
-GKO_BIND_CUSPARSE64_SPMV(std::complex<double>, cusparseZcsrmv_mp);
-template <typename ValueType>
-GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented);
-template <typename ValueType>
-GKO_BIND_CUSPARSE64_SPMV(ValueType, detail::not_implemented);
-
-
-#undef GKO_BIND_CUSPARSE32_SPMV
-#undef GKO_BIND_CUSPARSE64_SPMV
-
-
-#define GKO_BIND_CUSPARSE32_SPMM(ValueType, CusparseName)                     \
-    inline void spmm(cusparseHandle_t handle, cusparseOperation_t transA,     \
-                     int32 m, int32 n, int32 k, int32 nnz,                    \
-                     const ValueType* alpha, const cusparseMatDescr_t descrA, \
-                     const ValueType* csrValA, const int32* csrRowPtrA,       \
-                     const int32* csrColIndA, const ValueType* B, int32 ldb,  \
-                     const ValueType* beta, ValueType* C, int32 ldc)          \
-    {                                                                         \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                        \
-            CusparseName(handle, transA, m, n, k, nnz, as_culibs_type(alpha), \
-                         descrA, as_culibs_type(csrValA), csrRowPtrA,         \
-                         csrColIndA, as_culibs_type(B), ldb,                  \
-                         as_culibs_type(beta), as_culibs_type(C), ldc));      \
-    }                                                                         \
-    static_assert(true,                                                       \
-                  "This assert is used to counter the false positive extra "  \
-                  "semi-colon warnings")
-
-#define GKO_BIND_CUSPARSE64_SPMM(ValueType, CusparseName)                     \
-    inline void spmm(cusparseHandle_t handle, cusparseOperation_t transA,     \
-                     int64 m, int64 n, int64 k, int64 nnz,                    \
-                     const ValueType* alpha, const cusparseMatDescr_t descrA, \
-                     const ValueType* csrValA, const int64* csrRowPtrA,       \
-                     const int64* csrColIndA, const ValueType* B, int64 ldb,  \
-                     const ValueType* beta, ValueType* C, int64 ldc)          \
-        GKO_NOT_IMPLEMENTED;                                                  \
-    static_assert(true,                                                       \
-                  "This assert is used to counter the false positive extra "  \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE32_SPMM(float, cusparseScsrmm);
-GKO_BIND_CUSPARSE32_SPMM(double, cusparseDcsrmm);
-GKO_BIND_CUSPARSE32_SPMM(std::complex<float>, cusparseCcsrmm);
-GKO_BIND_CUSPARSE32_SPMM(std::complex<double>, cusparseZcsrmm);
-GKO_BIND_CUSPARSE64_SPMM(float, cusparseScsrmm);
-GKO_BIND_CUSPARSE64_SPMM(double, cusparseDcsrmm);
-GKO_BIND_CUSPARSE64_SPMM(std::complex<float>, cusparseCcsrmm);
-GKO_BIND_CUSPARSE64_SPMM(std::complex<double>, cusparseZcsrmm);
-template <typename ValueType>
-GKO_BIND_CUSPARSE32_SPMM(ValueType, detail::not_implemented);
-template <typename ValueType>
-GKO_BIND_CUSPARSE64_SPMM(ValueType, detail::not_implemented);
-
-
-#undef GKO_BIND_CUSPARSE32_SPMM
-#undef GKO_BIND_CUSPARSE64_SPMM
-
-
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11021)
 
 
@@ -357,160 +202,6 @@ GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE(std::complex<double>);
 #endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11021)
 
 
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-#define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName)                     \
-    inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA,     \
-                     const ValueType* alpha, const cusparseMatDescr_t descrA, \
-                     const cusparseHybMat_t hybA, const ValueType* x,         \
-                     const ValueType* beta, ValueType* y)                     \
-    {                                                                         \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                           \
-            handle, transA, as_culibs_type(alpha), descrA, hybA,              \
-            as_culibs_type(x), as_culibs_type(beta), as_culibs_type(y)));     \
-    }                                                                         \
-    static_assert(true,                                                       \
-                  "This assert is used to counter the false positive extra "  \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE32_SPMV(float, cusparseShybmv);
-GKO_BIND_CUSPARSE32_SPMV(double, cusparseDhybmv);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<float>, cusparseChybmv);
-GKO_BIND_CUSPARSE32_SPMV(std::complex<double>, cusparseZhybmv);
-template <typename ValueType>
-GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented);
-
-
-#undef GKO_BIND_CUSPARSE32_SPMV
-
-
-template <typename ValueType, typename IndexType>
-void spgemm_buffer_size(
-    cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
-    const ValueType* alpha, const cusparseMatDescr_t descrA, IndexType nnzA,
-    const IndexType* csrRowPtrA, const IndexType* csrColIndA,
-    const cusparseMatDescr_t descrB, IndexType nnzB,
-    const IndexType* csrRowPtrB, const IndexType* csrColIndB,
-    const ValueType* beta, const cusparseMatDescr_t descrD, IndexType nnzD,
-    const IndexType* csrRowPtrD, const IndexType* csrColIndD,
-    csrgemm2Info_t info, size_type& result) GKO_NOT_IMPLEMENTED;
-
-#define GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(ValueType, CusparseName)          \
-    template <>                                                                \
-    inline void spgemm_buffer_size<ValueType, int32>(                          \
-        cusparseHandle_t handle, int32 m, int32 n, int32 k,                    \
-        const ValueType* alpha, const cusparseMatDescr_t descrA, int32 nnzA,   \
-        const int32* csrRowPtrA, const int32* csrColIndA,                      \
-        const cusparseMatDescr_t descrB, int32 nnzB, const int32* csrRowPtrB,  \
-        const int32* csrColIndB, const ValueType* beta,                        \
-        const cusparseMatDescr_t descrD, int32 nnzD, const int32* csrRowPtrD,  \
-        const int32* csrColIndD, csrgemm2Info_t info, size_type& result)       \
-    {                                                                          \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                         \
-            CusparseName(handle, m, n, k, as_culibs_type(alpha), descrA, nnzA, \
-                         csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB,     \
-                         csrColIndB, as_culibs_type(beta), descrD, nnzD,       \
-                         csrRowPtrD, csrColIndD, info, &result));              \
-    }                                                                          \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(float, cusparseScsrgemm2_bufferSizeExt);
-GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(double, cusparseDcsrgemm2_bufferSizeExt);
-GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex<float>,
-                                     cusparseCcsrgemm2_bufferSizeExt);
-GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex<double>,
-                                     cusparseZcsrgemm2_bufferSizeExt);
-
-
-#undef GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE
-
-
-template <typename IndexType>
-void spgemm_nnz(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
-                const cusparseMatDescr_t descrA, IndexType nnzA,
-                const IndexType* csrRowPtrA, const IndexType* csrColIndA,
-                const cusparseMatDescr_t descrB, IndexType nnzB,
-                const IndexType* csrRowPtrB, const IndexType* csrColIndB,
-                const cusparseMatDescr_t descrD, IndexType nnzD,
-                const IndexType* csrRowPtrD, const IndexType* csrColIndD,
-                const cusparseMatDescr_t descrC, IndexType* csrRowPtrC,
-                IndexType* nnzC, csrgemm2Info_t info,
-                void* buffer) GKO_NOT_IMPLEMENTED;
-
-template <>
-inline void spgemm_nnz<int32>(
-    cusparseHandle_t handle, int32 m, int32 n, int32 k,
-    const cusparseMatDescr_t descrA, int32 nnzA, const int32* csrRowPtrA,
-    const int32* csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB,
-    const int32* csrRowPtrB, const int32* csrColIndB,
-    const cusparseMatDescr_t descrD, int32 nnzD, const int32* csrRowPtrD,
-    const int32* csrColIndD, const cusparseMatDescr_t descrC, int32* csrRowPtrC,
-    int32* nnzC, csrgemm2Info_t info, void* buffer)
-{
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrgemm2Nnz(
-        handle, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB,
-        csrRowPtrB, csrColIndB, descrD, nnzD, csrRowPtrD, csrColIndD, descrC,
-        csrRowPtrC, nnzC, info, buffer));
-}
-
-
-template <typename ValueType, typename IndexType>
-void spgemm(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k,
-            const ValueType* alpha, const cusparseMatDescr_t descrA,
-            IndexType nnzA, const ValueType* csrValA,
-            const IndexType* csrRowPtrA, const IndexType* csrColIndA,
-            const cusparseMatDescr_t descrB, IndexType nnzB,
-            const ValueType* csrValB, const IndexType* csrRowPtrB,
-            const IndexType* csrColIndB, const ValueType* beta,
-            const cusparseMatDescr_t descrD, IndexType nnzD,
-            const ValueType* csrValD, const IndexType* csrRowPtrD,
-            const IndexType* csrColIndD, const cusparseMatDescr_t descrC,
-            ValueType* csrValC, const IndexType* csrRowPtrC,
-            IndexType* csrColIndC, csrgemm2Info_t info,
-            void* buffer) GKO_NOT_IMPLEMENTED;
-
-#define GKO_BIND_CUSPARSE_SPGEMM(ValueType, CusparseName)                      \
-    template <>                                                                \
-    inline void spgemm<ValueType, int32>(                                      \
-        cusparseHandle_t handle, int32 m, int32 n, int32 k,                    \
-        const ValueType* alpha, const cusparseMatDescr_t descrA, int32 nnzA,   \
-        const ValueType* csrValA, const int32* csrRowPtrA,                     \
-        const int32* csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB,  \
-        const ValueType* csrValB, const int32* csrRowPtrB,                     \
-        const int32* csrColIndB, const ValueType* beta,                        \
-        const cusparseMatDescr_t descrD, int32 nnzD, const ValueType* csrValD, \
-        const int32* csrRowPtrD, const int32* csrColIndD,                      \
-        const cusparseMatDescr_t descrC, ValueType* csrValC,                   \
-        const int32* csrRowPtrC, int32* csrColIndC, csrgemm2Info_t info,       \
-        void* buffer)                                                          \
-    {                                                                          \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                            \
-            handle, m, n, k, as_culibs_type(alpha), descrA, nnzA,              \
-            as_culibs_type(csrValA), csrRowPtrA, csrColIndA, descrB, nnzB,     \
-            as_culibs_type(csrValB), csrRowPtrB, csrColIndB,                   \
-            as_culibs_type(beta), descrD, nnzD, as_culibs_type(csrValD),       \
-            csrRowPtrD, csrColIndD, descrC, as_culibs_type(csrValC),           \
-            csrRowPtrC, csrColIndC, info, buffer));                            \
-    }                                                                          \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE_SPGEMM(float, cusparseScsrgemm2);
-GKO_BIND_CUSPARSE_SPGEMM(double, cusparseDcsrgemm2);
-GKO_BIND_CUSPARSE_SPGEMM(std::complex<float>, cusparseCcsrgemm2);
-GKO_BIND_CUSPARSE_SPGEMM(std::complex<double>, cusparseZcsrgemm2);
-
-
-#undef GKO_BIND_CUSPARSE_SPGEMM
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 template <typename ValueType>
 void spgemm_work_estimation(cusparseHandle_t handle, const ValueType* alpha,
                             cusparseSpMatDescr_t a_descr,
@@ -574,101 +265,6 @@ void csr_set_pointers(cusparseSpMatDescr_t descr, IndexType* row_ptrs,
 }
 
 
-#endif  // CUDA_VERSION >= 11000
-
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-#define GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, CusparseName)                 \
-    inline void csr2hyb(cusparseHandle_t handle, int32 m, int32 n,           \
-                        const cusparseMatDescr_t descrA,                     \
-                        const ValueType* csrValA, const int32* csrRowPtrA,   \
-                        const int32* csrColIndA, cusparseHybMat_t hybA,      \
-                        int32 userEllWidth,                                  \
-                        cusparseHybPartition_t partitionType)                \
-    {                                                                        \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName(                          \
-            handle, m, n, descrA, as_culibs_type(csrValA), csrRowPtrA,       \
-            csrColIndA, hybA, userEllWidth, partitionType));                 \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-#define GKO_BIND_CUSPARSE64_CSR2HYB(ValueType, CusparseName)                 \
-    inline void csr2hyb(                                                     \
-        cusparseHandle_t handle, int64 m, int64 n,                           \
-        const cusparseMatDescr_t descrA, const ValueType* csrValA,           \
-        const int64* csrRowPtrA, const int64* csrColIndA,                    \
-        cusparseHybMat_t hybA, int64 userEllWidth,                           \
-        cusparseHybPartition_t partitionType) GKO_NOT_IMPLEMENTED;           \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE32_CSR2HYB(float, cusparseScsr2hyb);
-GKO_BIND_CUSPARSE32_CSR2HYB(double, cusparseDcsr2hyb);
-GKO_BIND_CUSPARSE32_CSR2HYB(std::complex<float>, cusparseCcsr2hyb);
-GKO_BIND_CUSPARSE32_CSR2HYB(std::complex<double>, cusparseZcsr2hyb);
-GKO_BIND_CUSPARSE64_CSR2HYB(float, cusparseScsr2hyb);
-GKO_BIND_CUSPARSE64_CSR2HYB(double, cusparseDcsr2hyb);
-GKO_BIND_CUSPARSE64_CSR2HYB(std::complex<float>, cusparseCcsr2hyb);
-GKO_BIND_CUSPARSE64_CSR2HYB(std::complex<double>, cusparseZcsr2hyb);
-template <typename ValueType>
-GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, detail::not_implemented);
-template <typename ValueType>
-GKO_BIND_CUSPARSE64_CSR2HYB(ValueType, detail::not_implemented);
-
-
-#undef GKO_BIND_CUSPARSE32_CSR2HYB
-#undef GKO_BIND_CUSPARSE64_CSR2HYB
-
-
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-template <typename ValueType, typename IndexType>
-inline void transpose(cusparseHandle_t handle, size_type m, size_type n,
-                      size_type nnz, const ValueType* OrigValA,
-                      const IndexType* OrigRowPtrA,
-                      const IndexType* OrigColIndA, ValueType* TransValA,
-                      IndexType* TransRowPtrA, IndexType* TransColIndA,
-                      cusparseAction_t copyValues,
-                      cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED;
-
-// Cusparse csr2csc use the order (row_inx, col_ptr) for csc, so we need to
-// switch row_ptr and col_idx of transposed csr here
-#define GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType, CusparseName)                \
-    template <>                                                               \
-    inline void transpose<ValueType, int32>(                                  \
-        cusparseHandle_t handle, size_type m, size_type n, size_type nnz,     \
-        const ValueType* OrigValA, const int32* OrigRowPtrA,                  \
-        const int32* OrigColIndA, ValueType* TransValA, int32* TransRowPtrA,  \
-        int32* TransColIndA, cusparseAction_t copyValues,                     \
-        cusparseIndexBase_t idxBase)                                          \
-    {                                                                         \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                        \
-            CusparseName(handle, m, n, nnz, as_culibs_type(OrigValA),         \
-                         OrigRowPtrA, OrigColIndA, as_culibs_type(TransValA), \
-                         TransColIndA, TransRowPtrA, copyValues, idxBase));   \
-    }                                                                         \
-    static_assert(true,                                                       \
-                  "This assert is used to counter the false positive extra "  \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE_TRANSPOSE32(float, cusparseScsr2csc);
-GKO_BIND_CUSPARSE_TRANSPOSE32(double, cusparseDcsr2csc);
-GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex<float>, cusparseCcsr2csc);
-GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex<double>, cusparseZcsr2csc);
-
-#undef GKO_BIND_CUSPARSE_TRANSPOSE32
-
-
-#else  // CUDA_VERSION >= 11000
-
 template <typename ValueType, typename IndexType>
 inline void transpose_buffersize(
     cusparseHandle_t handle, size_type m, size_type n, size_type nnz,
@@ -737,9 +333,6 @@ GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex<float>);
 GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex<double>);
 
 
-#endif
-
-
 inline cusparseMatDescr_t create_mat_descr()
 {
     cusparseMatDescr_t descr{};
@@ -774,26 +367,6 @@ inline void destroy(cusparseMatDescr_t descr)
 }
 
 
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-inline csrgemm2Info_t create_spgemm_info()
-{
-    csrgemm2Info_t info{};
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrgemm2Info(&info));
-    return info;
-}
-
-
-inline void destroy(csrgemm2Info_t info)
-{
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrgemm2Info(info));
-}
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 inline cusparseSpGEMMDescr_t create_spgemm_descr()
 {
     cusparseSpGEMMDescr_t descr{};
@@ -886,7 +459,7 @@ inline void destroy(cusparseSpMatDescr_t descr)
 }
 
 
-#if (CUDA_VERSION >= 11031)
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 11031)
 
 
 template <typename AttribType>
@@ -915,9 +488,6 @@ inline void destroy(cusparseSpSMDescr_t info)
 #endif  // CUDA_VERSION >= 11031
 
 
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000)
-
-
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11031)
 
 
@@ -1209,38 +779,6 @@ inline void csrsort<int32>(cusparseHandle_t handle, int32 m, int32 n, int32 nnz,
 }
 
 
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-template <typename ValueType, typename IndexType>
-void gather(cusparseHandle_t handle, IndexType nnz, const ValueType* in,
-            ValueType* out, const IndexType* permutation) GKO_NOT_IMPLEMENTED;
-
-#define GKO_BIND_CUSPARSE_GATHER(ValueType, CusparseName)                      \
-    template <>                                                                \
-    inline void gather<ValueType, int32>(cusparseHandle_t handle, int32 nnz,   \
-                                         const ValueType* in, ValueType* out,  \
-                                         const int32* permutation)             \
-    {                                                                          \
-        GKO_ASSERT_NO_CUSPARSE_ERRORS(                                         \
-            CusparseName(handle, nnz, as_culibs_type(in), as_culibs_type(out), \
-                         permutation, CUSPARSE_INDEX_BASE_ZERO));              \
-    }                                                                          \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
-                  "semi-colon warnings")
-
-GKO_BIND_CUSPARSE_GATHER(float, cusparseSgthr);
-GKO_BIND_CUSPARSE_GATHER(double, cusparseDgthr);
-GKO_BIND_CUSPARSE_GATHER(std::complex<float>, cusparseCgthr);
-GKO_BIND_CUSPARSE_GATHER(std::complex<double>, cusparseZgthr);
-
-#undef GKO_BIND_CUSPARSE_GATHER
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in,
                    cusparseSpVecDescr_t out)
 {
@@ -1248,9 +786,6 @@ inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in,
 }
 
 
-#endif
-
-
 GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 template <typename ValueType, typename IndexType>
 void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz,
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 7252f7d673d..a4a2b877c28 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -206,11 +206,6 @@ GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I);
 #undef GKO_CUDA_DATA_TYPE
 
 
-#if defined(CUDA_VERSION) &&  \
-    (CUDA_VERSION >= 11000 || \
-     ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))))
-
-
 template <typename T>
 struct cusparse_index_type_impl {};
 
@@ -227,10 +222,6 @@ GKO_CUDA_INDEX_TYPE(int64, CUSPARSE_INDEX_64I);
 #undef GKO_CUDA_INDEX_TYPE
 
 
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >=
-        // 10020) && !(defined(_WIN32) || defined(__CYGWIN__))))
-
-
 }  // namespace detail
 
 
@@ -249,11 +240,6 @@ constexpr cudaDataType_t cuda_data_type()
 }
 
 
-#if defined(CUDA_VERSION) &&  \
-    (CUDA_VERSION >= 11000 || \
-     ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))))
-
-
 /**
  * This is an alias for the `cudaIndexType_t` equivalent of `T`. By default,
  * CUSPARSE_INDEX_16U is returned.
@@ -269,10 +255,6 @@ constexpr cusparseIndexType_t cusparse_index_type()
 }
 
 
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >=
-        // 10020) && !(defined(_WIN32) || defined(__CYGWIN__))))
-
-
 /**
  * This is an alias for CUDA's equivalent of `T`.
  *
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index 983ec32f9ac..14c104c8e29 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -280,90 +280,10 @@ struct is_synchronizable_group_impl<coalesced_group> : std::true_type {};
 }  // namespace detail
 
 
-namespace detail {
-
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-// Adds generalized shuffles that support any type to the group.
-template <typename Group>
-class enable_extended_shuffle : public Group {
-public:
-    using Group::Group;
-    using Group::shfl;
-    using Group::shfl_down;
-    using Group::shfl_up;
-    using Group::shfl_xor;
-
-#define GKO_ENABLE_SHUFFLE_OPERATION(_name, SelectorType)                   \
-    template <typename ValueType>                                           \
-    __device__ __forceinline__ ValueType _name(const ValueType& var,        \
-                                               SelectorType selector) const \
-    {                                                                       \
-        return shuffle_impl(                                                \
-            [this](uint32 v, SelectorType s) {                              \
-                return static_cast<const Group*>(this)->_name(v, s);        \
-            },                                                              \
-            var, selector);                                                 \
-    }
-
-    GKO_ENABLE_SHUFFLE_OPERATION(shfl, int32)
-    GKO_ENABLE_SHUFFLE_OPERATION(shfl_up, uint32)
-    GKO_ENABLE_SHUFFLE_OPERATION(shfl_down, uint32)
-    GKO_ENABLE_SHUFFLE_OPERATION(shfl_xor, int32)
-
-#undef GKO_ENABLE_SHUFFLE_OPERATION
-
-private:
-    template <typename ShuffleOperator, typename ValueType,
-              typename SelectorType>
-    static __device__ __forceinline__ ValueType
-    shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var,
-                 SelectorType selector)
-    {
-        static_assert(sizeof(ValueType) % sizeof(uint32) == 0,
-                      "Unable to shuffle sizes which are not 4-byte multiples");
-        constexpr auto value_size = sizeof(ValueType) / sizeof(uint32);
-        ValueType result;
-        auto var_array = reinterpret_cast<const uint32*>(&var);
-        auto result_array = reinterpret_cast<uint32*>(&result);
-#pragma unroll
-        for (std::size_t i = 0; i < value_size; ++i) {
-            result_array[i] = intrinsic_shuffle(var_array[i], selector);
-        }
-        return result;
-    }
-};
-
-
-#endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-}  // namespace detail
-
-
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-// Implementing this as a using directive messes up with SFINAE for some reason,
-// probably a bug in NVCC. If it is a complete type, everything works fine.
-template <unsigned Size>
-struct thread_block_tile : detail::enable_extended_shuffle<
-                               cooperative_groups::thread_block_tile<Size>> {
-    using detail::enable_extended_shuffle<
-        cooperative_groups::thread_block_tile<Size>>::enable_extended_shuffle;
-};
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 // Cuda11 cooperative group's shuffle supports complex
 using cooperative_groups::thread_block_tile;
 
 
-#endif
 // inherits thread_group
 //
 // public API:
@@ -385,28 +305,6 @@ using cooperative_groups::thread_block_tile;
 namespace detail {
 
 
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-template <unsigned Size>
-struct is_group_impl<thread_block_tile<Size>> : std::true_type {};
-template <unsigned Size>
-struct is_synchronizable_group_impl<thread_block_tile<Size>> : std::true_type {
-};
-template <unsigned Size>
-struct is_communicator_group_impl<thread_block_tile<Size>> : std::true_type {};
-// make sure the original CUDA group is recognized whenever possible
-template <unsigned Size>
-struct is_group_impl<cooperative_groups::thread_block_tile<Size>>
-    : std::true_type {};
-template <unsigned Size>
-struct is_synchronizable_group_impl<cooperative_groups::thread_block_tile<Size>>
-    : std::true_type {};
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 // thread_block_tile is same as cuda11's
 template <unsigned Size, typename Group>
 struct is_group_impl<thread_block_tile<Size, Group>> : std::true_type {};
@@ -418,9 +316,6 @@ struct is_communicator_group_impl<thread_block_tile<Size, Group>>
     : std::true_type {};
 
 
-#endif
-
-
 }  // namespace detail
 
 
@@ -471,24 +366,6 @@ __device__ __forceinline__ auto tiled_partition(const Group& g)
 // Only support tile_partition with 1, 2, 4, 8, 16, 32.
 // Reference:
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-
-
-// cooperative group before cuda11 does not contain parent group in template
-template <size_type Size, typename Group>
-__device__ __forceinline__
-    std::enable_if_t<(Size <= kernels::cuda::config::warp_size) && (Size > 0) &&
-                         (kernels::cuda::config::warp_size % Size == 0),
-                     thread_block_tile<Size>>
-    tiled_partition(const Group&)
-{
-    return thread_block_tile<Size>();
-}
-
-
-#else  // CUDA_VERSION >= 11000
-
-
 // cooperative group after cuda11 contain parent group in template.
 // we remove the information because we do not restrict cooperative group by its
 // parent group type.
@@ -500,9 +377,6 @@ __device__ __forceinline__ thread_block_tile<Size, void> tiled_partition(
 }
 
 
-#endif
-
-
 }  // namespace group
 }  // namespace cuda
 }  // namespace kernels
diff --git a/third_party/identify_stream_usage/identify_stream_usage.cpp b/third_party/identify_stream_usage/identify_stream_usage.cpp
index 5cdd4d30b09..9dc16fc1bb3 100644
--- a/third_party/identify_stream_usage/identify_stream_usage.cpp
+++ b/third_party/identify_stream_usage/identify_stream_usage.cpp
@@ -104,15 +104,10 @@ __attribute__((init_priority(1001))) std::unordered_map<std::string, void*>
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT
 DEFINE_OVERLOAD(cudaEventRecord, ARG(cudaEvent_t event, cudaStream_t stream),
                 ARG(event, stream));
-
-#if CUDA_VERSION >= 11000
-
 DEFINE_OVERLOAD(cudaEventRecordWithFlags,
                 ARG(cudaEvent_t event, cudaStream_t stream, unsigned int flags),
                 ARG(event, stream, flags));
 
-#endif
-
 // Execution APIS:
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html#group__CUDART__EXECUTION
 DEFINE_OVERLOAD(cudaLaunchKernel,

From a94f508d6fe127b9187916df0f85b115b4b8c471 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 31 Oct 2024 15:20:34 +0000
Subject: [PATCH 232/448] [doc] fixes formulas for Sor

---
 include/ginkgo/core/preconditioner/gauss_seidel.hpp |  2 ++
 include/ginkgo/core/preconditioner/sor.hpp          | 10 ++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/ginkgo/core/preconditioner/gauss_seidel.hpp b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
index 75668e652a7..d4a475badca 100644
--- a/include/ginkgo/core/preconditioner/gauss_seidel.hpp
+++ b/include/ginkgo/core/preconditioner/gauss_seidel.hpp
@@ -26,6 +26,8 @@ namespace preconditioner {
  * preconditioner.
  *
  * @see Sor
+ *
+ * @ingroup precond
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
 class GaussSeidel
diff --git a/include/ginkgo/core/preconditioner/sor.hpp b/include/ginkgo/core/preconditioner/sor.hpp
index 531dded79f2..bf7af5b89a4 100644
--- a/include/ginkgo/core/preconditioner/sor.hpp
+++ b/include/ginkgo/core/preconditioner/sor.hpp
@@ -26,16 +26,16 @@ namespace preconditioner {
  * $A = D + L + U$, where $L$ contains all entries below the diagonal, and $U$
  * contains all entries above the diagonal. The application of the
  * preconditioner is then defined as solving $M x = y$ with
- * $$
+ * \f[
  * M = \frac{1}{\omega} (D + \omega L), \quad 0 < \omega < 2.
- * $$
+ * \f]
  * $\omega$ is known as the relaxation factor.
  * The preconditioner can be made symmetric, leading to the SSOR preconitioner.
  * Here, $M$ is defined as
- * $$
+ * \f[
  * M = \frac{1}{\omega (2 - \omega)} (D + \omega L) D^{-1} (D + \omega U) ,
  * \quad 0 < \omega < 2.
- * $$
+ * \f]
  * A detailed description can be found in Iterative Methods for Sparse Linear
  * Systems (Y. Saad) ch. 4.1.
  *
@@ -44,6 +44,8 @@ namespace preconditioner {
  *
  * @tparam ValueType  The value type of the internally used CSR matrix
  * @tparam IndexType  The index type of the internally used CSR matrix
+ *
+ * @ingroup precond
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Sor

From 3b46b0886f67766a2dd5f9072b21e5c5087a55fc Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 9 Oct 2024 13:39:49 +0200
Subject: [PATCH 233/448] add workspace for intermediate data

---
 core/distributed/preconditioner/schwarz.cpp   | 18 ++++++++++--
 .../distributed/preconditioner/schwarz.hpp    | 29 +++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index d5466cd003a..14791864bda 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -84,15 +84,27 @@ void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
 {
     precision_dispatch_real_complex_distributed<ValueType>(
         [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            auto x_clone = dense_x->clone();
-            this->apply_dense_impl(dense_b, x_clone.get());
+            set_cache_to(dense_x);
+            this->apply_impl(dense_b, cache_.intermediate.get());
             dense_x->scale(dense_beta);
-            dense_x->add_scaled(dense_alpha, x_clone.get());
+            dense_x->add_scaled(dense_alpha, cache_.intermediate.get());
         },
         alpha, b, beta, x);
 }
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+template <typename VectorType>
+void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::set_cache_to(
+    const VectorType* b) const
+{
+    if (dynamic_cast<VectorType*>(cache_.intermediate.get()) == nullptr) {
+        cache_.intermediate = VectorType::create_with_config_of(b);
+    }
+    cache_.intermediate->copy_from(b);
+}
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::set_solver(
     std::shared_ptr<const LinOp> new_solver)
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index adc67dfbd36..6098b7da2dc 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -141,6 +141,15 @@ class Schwarz
     void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta,
                     LinOp* x) const override;
 
+    /**
+     * Prepares the intermediate vector in workspace
+     *
+     * @param b  vector of the first apply. On return, the intermediate vector
+     *           is a copy of b.
+     */
+    template <typename VectorType>
+    void set_cache_to(const VectorType* b) const;
+
 private:
     /**
      * Sets the solver operator used as the local solver.
@@ -150,6 +159,26 @@ class Schwarz
     void set_solver(std::shared_ptr<const LinOp> new_solver);
 
     std::shared_ptr<const LinOp> local_solver_;
+
+    /**
+     * Manages a vector as a cache, so there is no need to allocate one every
+     * time an intermediate vector is required.
+     * Copying an instance will only yield an empty object since copying the
+     * cached vector would not make sense.
+     *
+     * @internal  The struct is present so the whole class can be copyable
+     *            (could also be done with writing `operator=` and copy
+     *            constructor of the enclosing class by hand)
+     */
+    mutable struct cache_struct {
+        cache_struct() = default;
+        ~cache_struct() = default;
+        cache_struct(const cache_struct&) {}
+        cache_struct(cache_struct&&) {}
+        cache_struct& operator=(const cache_struct&) { return *this; }
+        cache_struct& operator=(cache_struct&&) { return *this; }
+        std::unique_ptr<LinOp> intermediate{};
+    } cache_;
 };
 
 

From 8a8034c393ce2140ca32d3b52f5258043ffc2840 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 9 Oct 2024 14:02:43 +0200
Subject: [PATCH 234/448] apply_uses_initial_guess return the local solvers'
 value

---
 core/distributed/preconditioner/schwarz.cpp   |  7 ++++++
 .../distributed/preconditioner/schwarz.cpp    | 25 +++++++++++++++++++
 .../distributed/preconditioner/schwarz.hpp    | 10 ++++++++
 3 files changed, 42 insertions(+)

diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 14791864bda..f398aba34d4 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -51,6 +51,13 @@ Schwarz<ValueType, LocalIndexType, GlobalIndexType>::parse(
     return params;
 }
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+bool Schwarz<ValueType, LocalIndexType,
+             GlobalIndexType>::apply_uses_initial_guess() const
+{
+    return this->local_solver_->apply_uses_initial_guess();
+}
+
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp
index c6c0dc00650..fb6676cc011 100644
--- a/core/test/mpi/distributed/preconditioner/schwarz.cpp
+++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp
@@ -8,6 +8,8 @@
 #include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
+#include <ginkgo/core/solver/cg.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
 
 #include "core/test/utils.hpp"
 
@@ -143,4 +145,27 @@ TYPED_TEST(SchwarzFactory, PassExplicitFactory)
 }
 
 
+TYPED_TEST(SchwarzFactory, ApplyUsesInitialGuessAsLocalSolver)
+{
+    using value_type = typename TestFixture::value_type;
+    using Cg = typename gko::solver::Cg<value_type>;
+    using Jacobi = typename TestFixture::Jacobi;
+    using Schwarz = typename TestFixture::Schwarz;
+
+    auto schwarz_with_jacobi = Schwarz::build()
+                                   .with_local_solver(Jacobi::build())
+                                   .on(this->exec)
+                                   ->generate(this->mtx);
+    auto schwarz_with_cg =
+        Schwarz::build()
+            .with_local_solver(Cg::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u)))
+            .on(this->exec)
+            ->generate(this->mtx);
+
+    ASSERT_EQ(schwarz_with_jacobi->apply_uses_initial_guess(), false);
+    ASSERT_EQ(schwarz_with_cg->apply_uses_initial_guess(), true);
+}
+
+
 }  // namespace
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index 6098b7da2dc..9e5ef7261c7 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -65,6 +65,16 @@ class Schwarz
     using local_index_type = LocalIndexType;
     using global_index_type = GlobalIndexType;
 
+    /**
+     * Return whether the local solvers use the data in x as an initial guess.
+     *
+     * @return true when the local solvers use the data in x as an initial
+     * guess; otherwise, false.
+     *
+     * @note TODO: after adding refining step, need to revisit this.
+     */
+    bool apply_uses_initial_guess() const override;
+
     GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
     {
         /**

From da5220a86693d42e8bb0bf35d463f75e81201b5b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 4 Nov 2024 11:13:28 +0100
Subject: [PATCH 235/448] use vector cache

---
 core/CMakeLists.txt                           |   1 +
 core/base/vector_cache.cpp                    |  55 +++
 core/distributed/preconditioner/schwarz.cpp   |  22 +-
 core/test/base/CMakeLists.txt                 |   4 +
 core/test/base/vector_cache.cpp               | 321 ++++++++++++++++++
 include/ginkgo/core/base/vector_cache.hpp     | 112 ++++++
 .../distributed/preconditioner/schwarz.hpp    |  30 +-
 include/ginkgo/core/matrix/dense.hpp          |  13 +-
 include/ginkgo/ginkgo.hpp                     |   1 +
 9 files changed, 514 insertions(+), 45 deletions(-)
 create mode 100644 core/base/vector_cache.cpp
 create mode 100644 core/test/base/vector_cache.cpp
 create mode 100644 include/ginkgo/core/base/vector_cache.hpp

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 6e0960459e0..b2666845fed 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -134,6 +134,7 @@ endif()
 if(GINKGO_BUILD_MPI)
     target_sources(${ginkgo_core}
         PRIVATE
+        base/vector_cache.cpp
         mpi/exception.cpp
         distributed/matrix.cpp
         distributed/partition_helpers.cpp
diff --git a/core/base/vector_cache.cpp b/core/base/vector_cache.cpp
new file mode 100644
index 00000000000..ffe96ec9f84
--- /dev/null
+++ b/core/base/vector_cache.cpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "ginkgo/core/base/vector_cache.hpp"
+
+#include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/distributed/vector.hpp>
+
+namespace gko {
+namespace detail {
+
+
+template <typename ValueType>
+void VectorCache<ValueType>::init(std::shared_ptr<const Executor> exec,
+                                  gko::experimental::mpi::communicator comm,
+                                  dim<2> global_size, dim<2> local_size) const
+{
+    if (!vec || vec->get_size() != global_size || vec->get_executor() != exec) {
+        vec = experimental::distributed::Vector<ValueType>::create(
+            exec, comm, global_size, local_size);
+    } else if (vec->get_local_vector()->get_size() != local_size) {
+        // handle locally to eliminate the mpi call
+        vec->local_ =
+            std::move(gko::matrix::Dense<ValueType>(exec, local_size));
+    }
+}
+
+
+template <typename ValueType>
+void VectorCache<ValueType>::init_from(
+    const experimental::distributed::Vector<ValueType>* template_vec) const
+{
+    if (!vec || vec->get_size() != template_vec->get_size() ||
+        vec->get_executor() != template_vec->get_executor()) {
+        vec =
+            experimental::distributed::Vector<ValueType>::create_with_config_of(
+                template_vec);
+    } else if (vec->get_local_vector()->get_size() !=
+               template_vec->get_local_vector()->get_size()) {
+        // handle locally to eliminate the mpi call
+        vec->local_ = std::move(gko::matrix::Dense<ValueType>(
+            template_vec->get_executor(),
+            template_vec->get_local_vector()->get_size(),
+            template_vec->get_local_vector()->get_stride()));
+    }
+}
+
+
+#define GKO_DECLARE_VECTOR_CACHE(_type) class VectorCache<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_VECTOR_CACHE);
+
+
+}  // namespace detail
+}  // namespace gko
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index f398aba34d4..965414349d6 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -89,29 +89,19 @@ template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
     const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const
 {
-    precision_dispatch_real_complex_distributed<ValueType>(
+    // only dispatch distributed case
+    experimental::distributed::precision_dispatch_real_complex<ValueType>(
         [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            set_cache_to(dense_x);
-            this->apply_impl(dense_b, cache_.intermediate.get());
+            cache_.init_from(dense_x);
+            cache_->copy_from(dense_x);
+            this->apply_impl(dense_b, cache_.get());
             dense_x->scale(dense_beta);
-            dense_x->add_scaled(dense_alpha, cache_.intermediate.get());
+            dense_x->add_scaled(dense_alpha, cache_.get());
         },
         alpha, b, beta, x);
 }
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-template <typename VectorType>
-void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::set_cache_to(
-    const VectorType* b) const
-{
-    if (dynamic_cast<VectorType*>(cache_.intermediate.get()) == nullptr) {
-        cache_.intermediate = VectorType::create_with_config_of(b);
-    }
-    cache_.intermediate->copy_from(b);
-}
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::set_solver(
     std::shared_ptr<const LinOp> new_solver)
diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index d7deeec6fb7..3e59ad12686 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -31,3 +31,7 @@ ginkgo_create_test(segmented_range)
 ginkgo_create_test(types)
 ginkgo_create_test(utils)
 ginkgo_create_test(version EXECUTABLE_NAME version_test) # version collides with C++ stdlib header
+
+if(GINKGO_BUILD_MPI)
+    ginkgo_create_test(vector_cache MPI_SIZE 3)
+endif()
\ No newline at end of file
diff --git a/core/test/base/vector_cache.cpp b/core/test/base/vector_cache.cpp
new file mode 100644
index 00000000000..8ed3bd4f038
--- /dev/null
+++ b/core/test/base/vector_cache.cpp
@@ -0,0 +1,321 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <mpi.h>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/base/vector_cache.hpp>
+#include <ginkgo/core/distributed/vector.hpp>
+
+#include "core/test/utils.hpp"
+
+
+template <typename ValueType>
+class VectorCache : public ::testing::Test {
+protected:
+    using value_type = ValueType;
+    using vector_type = gko::experimental::distributed::Vector<value_type>;
+
+    VectorCache()
+        : comm(gko::experimental::mpi::communicator(MPI_COMM_WORLD)),
+          ref(gko::ReferenceExecutor::create()),
+          rank(comm.rank()),
+          num(comm.size()),
+          default_local_size(rank + 1, 3),
+          default_global_size((num + 1) * num / 2, 3),
+          default_vector(vector_type::create(this->ref, this->comm,
+                                             this->default_global_size,
+                                             this->default_local_size))
+    {}
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    gko::detail::VectorCache<value_type> cache;
+    gko::experimental::mpi::communicator comm;
+    int rank;
+    int num;
+    gko::dim<2> default_local_size;
+    gko::dim<2> default_global_size;
+    std::unique_ptr<vector_type> default_vector;
+};
+
+
+TYPED_TEST_SUITE(VectorCache, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(VectorCache, CanDefaultConstruct)
+{
+    using value_type = typename TestFixture::value_type;
+    gko::detail::VectorCache<value_type> cache;
+
+    ASSERT_EQ(cache.get(), nullptr);
+}
+
+
+TYPED_TEST(VectorCache, CanInitWithSize)
+{
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache, SecondInitWithSameSizeIsNoOp)
+{
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_EQ(this->cache.get(), first_ptr);
+    ASSERT_EQ(this->cache->get_local_vector()->get_const_values(),
+              local_val_ptr);
+}
+
+
+TYPED_TEST(VectorCache, SecondInitWithDifferentGlobalSizeInitializes)
+{
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+    // generate different global size
+    gko::dim<2> second_local_size(2 * (this->rank + 1), 3);
+    gko::dim<2> second_global_size((this->num + 1) * this->num, 3);
+
+    this->cache.init(this->ref, this->comm, second_global_size,
+                     second_local_size);
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), first_ptr);
+    ASSERT_NE(this->cache->get_local_vector()->get_const_values(),
+              local_val_ptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), second_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                second_local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache, SecondInitWithDifferentLocalSizeInitializes)
+{
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+    auto local_size = this->default_local_size;
+    // switch the size of rank 0 and rank 1 to generate different local size but
+    // the same global size
+    if (this->rank == 0) {
+        local_size = gko::dim<2>(2, 3);
+    } else if (this->rank == 1) {
+        local_size = gko::dim<2>(1, 3);
+    }
+
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     local_size);
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_EQ(this->cache.get(), first_ptr);
+    // we use move to replace the value pointer not the dense pointer
+    if (this->rank >= 2) {
+        ASSERT_EQ(this->cache->get_local_vector()->get_const_values(),
+                  local_val_ptr);
+    } else {
+        ASSERT_NE(this->cache->get_local_vector()->get_const_values(),
+                  local_val_ptr);
+    }
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache, CanInitFromVector)
+{
+    this->cache.init_from(this->default_vector.get());
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), this->default_vector.get());
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache, SecondInitFromSameDenseIsNoOp)
+{
+    this->cache.init_from(this->default_vector.get());
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+
+    this->cache.init_from(this->default_vector.get());
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), this->default_vector.get());
+    ASSERT_EQ(this->cache.get(), first_ptr);
+    ASSERT_EQ(this->cache->get_local_vector()->get_const_values(),
+              local_val_ptr);
+}
+
+
+TYPED_TEST(VectorCache, SecondInitFromDifferentDenseWithSameSizeIsNoOp)
+{
+    using vector_type = typename TestFixture::vector_type;
+    this->cache.init_from(this->default_vector.get());
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+    auto vector =
+        vector_type::create(this->ref, this->comm, this->default_global_size,
+                            this->default_local_size);
+
+    this->cache.init_from(vector.get());
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), this->default_vector.get());
+    ASSERT_EQ(this->cache.get(), first_ptr);
+    ASSERT_EQ(this->cache->get_local_vector()->get_const_values(),
+              local_val_ptr);
+}
+
+
+TYPED_TEST(VectorCache,
+           SecondInitFromDifferentVectorWithDifferentGlobalSizeInitializes)
+{
+    using vector_type = typename TestFixture::vector_type;
+    this->cache.init_from(this->default_vector.get());
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+    gko::dim<2> second_local_size(2 * (this->rank + 1), 3);
+    gko::dim<2> second_global_size((this->num + 1) * this->num, 3);
+    auto vector = vector_type::create(this->ref, this->comm, second_global_size,
+                                      second_local_size);
+
+    this->cache.init_from(vector.get());
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), this->default_vector.get());
+    ASSERT_NE(this->cache.get(), first_ptr);
+    ASSERT_NE(this->cache->get_local_vector()->get_const_values(),
+              local_val_ptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), second_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                second_local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache,
+           SecondInitFromDifferentVectorWithDifferentLocalSizeInitializes)
+{
+    using vector_type = typename TestFixture::vector_type;
+    this->cache.init_from(this->default_vector.get());
+    auto first_ptr = this->cache.get();
+    auto local_val_ptr = this->cache->get_local_vector()->get_const_values();
+    auto local_size = this->default_local_size;
+    // switch the size of rank 0 and rank 1 to generate different local size but
+    // the same global size
+    if (this->rank == 0) {
+        local_size = gko::dim<2>(2, 3);
+    } else if (this->rank == 1) {
+        local_size = gko::dim<2>(1, 3);
+    }
+    auto vector = vector_type::create(this->ref, this->comm,
+                                      this->default_global_size, local_size);
+
+    this->cache.init_from(vector.get());
+
+    ASSERT_NE(this->cache.get(), nullptr);
+    ASSERT_NE(this->cache.get(), this->default_vector.get());
+    ASSERT_EQ(this->cache.get(), first_ptr);
+    if (this->rank >= 2) {
+        ASSERT_EQ(this->cache->get_local_vector()->get_const_values(),
+                  local_val_ptr);
+    } else {
+        ASSERT_NE(this->cache->get_local_vector()->get_const_values(),
+                  local_val_ptr);
+    }
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                local_size);
+    ASSERT_EQ(this->cache->get_executor(), this->ref);
+}
+
+
+TYPED_TEST(VectorCache, VectorIsNotCopied)
+{
+    using value_type = typename TestFixture::value_type;
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    gko::detail::VectorCache<value_type> cache(this->cache);
+
+    ASSERT_EQ(cache.get(), nullptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+}
+
+
+TYPED_TEST(VectorCache, VectorIsNotMoved)
+{
+    using value_type = typename TestFixture::value_type;
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    gko::detail::VectorCache<value_type> cache(std::move(this->cache));
+
+    ASSERT_EQ(cache.get(), nullptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+}
+
+
+TYPED_TEST(VectorCache, VectorIsNotCopyAssigned)
+{
+    using value_type = typename TestFixture::value_type;
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    gko::detail::VectorCache<value_type> cache;
+    cache = this->cache;
+
+    ASSERT_EQ(cache.get(), nullptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+}
+
+
+TYPED_TEST(VectorCache, VectorIsNotMoveAssigned)
+{
+    using value_type = typename TestFixture::value_type;
+    this->cache.init(this->ref, this->comm, this->default_global_size,
+                     this->default_local_size);
+    gko::detail::VectorCache<value_type> cache;
+    cache = std::move(this->cache);
+
+    ASSERT_EQ(cache.get(), nullptr);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
+                                this->default_global_size);
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
+                                this->default_local_size);
+}
\ No newline at end of file
diff --git a/include/ginkgo/core/base/vector_cache.hpp b/include/ginkgo/core/base/vector_cache.hpp
new file mode 100644
index 00000000000..88fcb22d3ec
--- /dev/null
+++ b/include/ginkgo/core/base/vector_cache.hpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
+#define GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
+
+
+#include <memory>
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/distributed/vector.hpp>
+
+
+#if GINKGO_BUILD_MPI
+
+
+namespace gko {
+namespace detail {
+
+
+/**
+ * Manages a distributed vector that is buffered and reused internally to avoid
+ * repeated allocations. Copying an instance will only yield an empty object
+ * since copying the cached vector would not make sense. The stored object is
+ * always mutable, so the cache can be used in a const-context.
+ *
+ * @internal  The struct is present to wrap cache-like buffer storage that will
+ *            not be copied when the outer object gets copied.
+ */
+template <typename ValueType>
+struct VectorCache {
+    VectorCache() = default;
+    ~VectorCache() = default;
+    VectorCache(const VectorCache&) {}
+    VectorCache(VectorCache&&) noexcept {}
+    VectorCache& operator=(const VectorCache&) { return *this; }
+    VectorCache& operator=(VectorCache&&) noexcept { return *this; }
+    mutable std::unique_ptr<experimental::distributed::Vector<ValueType>> vec{};
+
+
+    /**
+     * Initializes the buffered vector with the same configuration as the
+     * template vector, if
+     * - the current vector is null,
+     * - the sizes of the buffered and template vector differ,
+     * - the executors of the buffered and template vector differ.
+     *
+     * @note This does not copy any data from the template vector. If only the
+     *       local size differs, only the local vector is reallocated, not the
+     *       global vector.
+     *
+     * @param template_vec  Defines the configuration (executor, size, stride)
+     *                      of the buffered vector.
+     */
+    void init_from(
+        const experimental::distributed::Vector<ValueType>* template_vec) const;
+
+    /**
+     * Initializes the buffered vector, if
+     * - the current vector is null,
+     * - the sizes differ,
+     * - the executor differs.
+     *
+     * @param exec  Executor associated with the buffered vector
+     * @param comm  Communicator associated with the buffered vector
+     * @param global_size  Global size of the buffered vector
+     * @param local_size  Processor-local size of the buffered vector, uses
+     *                    local_size[1] as the stride
+     */
+    void init(std::shared_ptr<const Executor> exec,
+              gko::experimental::mpi::communicator comm, dim<2> global_size,
+              dim<2> local_size) const;
+
+    /**
+     * Reference access to the underlying vector.
+     *
+     * @return  Reference to the stored vector.
+     */
+    experimental::distributed::Vector<ValueType>& operator*() const
+    {
+        return *vec;
+    }
+
+    /**
+     * Pointer access to the underlying vector.
+     * @return  Pointer to the stored vector.
+     */
+    experimental::distributed::Vector<ValueType>* operator->() const
+    {
+        return vec.get();
+    }
+
+    /**
+     * Pointer access to the underlying vector.
+     * @return  Pointer to the stored vector.
+     */
+    experimental::distributed::Vector<ValueType>* get() const
+    {
+        return vec.get();
+    }
+};
+
+
+}  // namespace detail
+}  // namespace gko
+
+
+#endif  // GINKGO_BUILD_MPI
+#endif  // GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index 9e5ef7261c7..4c7b516abd6 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -14,6 +14,7 @@
 
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/vector_cache.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
@@ -151,15 +152,6 @@ class Schwarz
     void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta,
                     LinOp* x) const override;
 
-    /**
-     * Prepares the intermediate vector in workspace
-     *
-     * @param vec  vector of the first apply. the intermediate is a copy of vec
-     *             in the return.
-     */
-    template <typename VectorType>
-    void set_cache_to(const VectorType* b) const;
-
 private:
     /**
      * Sets the solver operator used as the local solver.
@@ -170,25 +162,7 @@ class Schwarz
 
     std::shared_ptr<const LinOp> local_solver_;
 
-    /**
-     * Manages a vector as a cache, so there is no need to allocate one every
-     * time an intermediate vector is required.
-     * Copying an instance will only yield an empty object since copying the
-     * cached vector would not make sense.
-     *
-     * @internal  The struct is present so the whole class can be copyable
-     *            (could also be done with writing `operator=` and copy
-     *            constructor of the enclosing class by hand)
-     */
-    mutable struct cache_struct {
-        cache_struct() = default;
-        ~cache_struct() = default;
-        cache_struct(const cache_struct&) {}
-        cache_struct(cache_struct&&) {}
-        cache_struct& operator=(const cache_struct&) { return *this; }
-        cache_struct& operator=(cache_struct&&) { return *this; }
-        std::unique_ptr<LinOp> intermediate{};
-    } cache_;
+    detail::VectorCache<ValueType> cache_;
 };
 
 
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index d7e9b1a10e0..6c633b0e13f 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -21,6 +21,16 @@
 
 
 namespace gko {
+namespace detail {
+
+
+template <typename ValueType>
+class VectorCache;
+
+
+}  // namespace detail
+
+
 namespace experimental {
 namespace distributed {
 
@@ -29,7 +39,7 @@ template <typename ValueType>
 class Vector;
 
 
-}
+}  // namespace distributed
 }  // namespace experimental
 
 
@@ -122,6 +132,7 @@ class Dense
     friend class SparsityCsr<ValueType, int64>;
     friend class Dense<to_complex<ValueType>>;
     friend class experimental::distributed::Vector<ValueType>;
+    friend class gko::detail::VectorCache<ValueType>;
 
 public:
     using EnableLinOp<Dense>::convert_to;
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 61e5b719508..defe3193157 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -51,6 +51,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
+#include <ginkgo/core/base/vector_cache.hpp>
 #include <ginkgo/core/base/version.hpp>
 
 #include <ginkgo/core/config/config.hpp>

From 321cf2efa4ab995052169c3ec138d16c32e07602 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 5 Nov 2024 10:57:39 +0100
Subject: [PATCH 236/448] move VectorCache into experimental::distributed

Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
---
 core/CMakeLists.txt                           |  2 +-
 core/{base => distributed}/vector_cache.cpp   | 16 +++++-----
 core/test/base/CMakeLists.txt                 |  4 ---
 core/test/mpi/distributed/CMakeLists.txt      |  1 +
 .../distributed}/vector_cache.cpp             | 18 ++++++-----
 .../ginkgo/core/base/precision_dispatch.hpp   | 25 +++++++---------
 include/ginkgo/core/distributed/matrix.hpp    | 10 +++----
 .../distributed/preconditioner/schwarz.hpp    |  2 +-
 include/ginkgo/core/distributed/vector.hpp    |  9 ++++++
 .../{base => distributed}/vector_cache.hpp    | 30 ++++++++-----------
 include/ginkgo/core/matrix/dense.hpp          | 16 +++++-----
 include/ginkgo/ginkgo.hpp                     |  2 +-
 12 files changed, 66 insertions(+), 69 deletions(-)
 rename core/{base => distributed}/vector_cache.cpp (79%)
 rename core/test/{base => mpi/distributed}/vector_cache.cpp (95%)
 rename include/ginkgo/core/{base => distributed}/vector_cache.hpp (81%)

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index b2666845fed..ddd8937c44f 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -134,7 +134,7 @@ endif()
 if(GINKGO_BUILD_MPI)
     target_sources(${ginkgo_core}
         PRIVATE
-        base/vector_cache.cpp
+        distributed/vector_cache.cpp
         mpi/exception.cpp
         distributed/matrix.cpp
         distributed/partition_helpers.cpp
diff --git a/core/base/vector_cache.cpp b/core/distributed/vector_cache.cpp
similarity index 79%
rename from core/base/vector_cache.cpp
rename to core/distributed/vector_cache.cpp
index ffe96ec9f84..e6b1af5fc42 100644
--- a/core/base/vector_cache.cpp
+++ b/core/distributed/vector_cache.cpp
@@ -2,12 +2,15 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "ginkgo/core/base/vector_cache.hpp"
+#include "ginkgo/core/distributed/vector_cache.hpp"
 
 #include <ginkgo/core/base/mpi.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
 
 namespace gko {
+namespace experimental {
+namespace distributed {
 namespace detail {
 
 
@@ -17,8 +20,7 @@ void VectorCache<ValueType>::init(std::shared_ptr<const Executor> exec,
                                   dim<2> global_size, dim<2> local_size) const
 {
     if (!vec || vec->get_size() != global_size || vec->get_executor() != exec) {
-        vec = experimental::distributed::Vector<ValueType>::create(
-            exec, comm, global_size, local_size);
+        vec = Vector<ValueType>::create(exec, comm, global_size, local_size);
     } else if (vec->get_local_vector()->get_size() != local_size) {
         // handle locally to eliminate the mpi call
         vec->local_ =
@@ -29,13 +31,11 @@ void VectorCache<ValueType>::init(std::shared_ptr<const Executor> exec,
 
 template <typename ValueType>
 void VectorCache<ValueType>::init_from(
-    const experimental::distributed::Vector<ValueType>* template_vec) const
+    const Vector<ValueType>* template_vec) const
 {
     if (!vec || vec->get_size() != template_vec->get_size() ||
         vec->get_executor() != template_vec->get_executor()) {
-        vec =
-            experimental::distributed::Vector<ValueType>::create_with_config_of(
-                template_vec);
+        vec = Vector<ValueType>::create_with_config_of(template_vec);
     } else if (vec->get_local_vector()->get_size() !=
                template_vec->get_local_vector()->get_size()) {
         // handle locally to eliminate the mpi call
@@ -52,4 +52,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_VECTOR_CACHE);
 
 
 }  // namespace detail
+}  // namespace distributed
+}  // namespace experimental
 }  // namespace gko
diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index 3e59ad12686..d7deeec6fb7 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -31,7 +31,3 @@ ginkgo_create_test(segmented_range)
 ginkgo_create_test(types)
 ginkgo_create_test(utils)
 ginkgo_create_test(version EXECUTABLE_NAME version_test) # version collides with C++ stdlib header
-
-if(GINKGO_BUILD_MPI)
-    ginkgo_create_test(vector_cache MPI_SIZE 3)
-endif()
\ No newline at end of file
diff --git a/core/test/mpi/distributed/CMakeLists.txt b/core/test/mpi/distributed/CMakeLists.txt
index 8cff893408a..85c64792198 100644
--- a/core/test/mpi/distributed/CMakeLists.txt
+++ b/core/test/mpi/distributed/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_test(helpers MPI_SIZE 1)
 ginkgo_create_test(matrix MPI_SIZE 1)
+ginkgo_create_test(vector_cache MPI_SIZE 3)
 
 add_subdirectory(preconditioner)
 add_subdirectory(solver)
diff --git a/core/test/base/vector_cache.cpp b/core/test/mpi/distributed/vector_cache.cpp
similarity index 95%
rename from core/test/base/vector_cache.cpp
rename to core/test/mpi/distributed/vector_cache.cpp
index 8ed3bd4f038..f64c5fe9038 100644
--- a/core/test/base/vector_cache.cpp
+++ b/core/test/mpi/distributed/vector_cache.cpp
@@ -7,8 +7,8 @@
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/mpi.hpp>
-#include <ginkgo/core/base/vector_cache.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
+#include <ginkgo/core/distributed/vector_cache.hpp>
 
 #include "core/test/utils.hpp"
 
@@ -32,7 +32,7 @@ class VectorCache : public ::testing::Test {
     {}
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
-    gko::detail::VectorCache<value_type> cache;
+    gko::experimental::distributed::detail::VectorCache<value_type> cache;
     gko::experimental::mpi::communicator comm;
     int rank;
     int num;
@@ -48,7 +48,7 @@ TYPED_TEST_SUITE(VectorCache, gko::test::ValueTypes, TypenameNameGenerator);
 TYPED_TEST(VectorCache, CanDefaultConstruct)
 {
     using value_type = typename TestFixture::value_type;
-    gko::detail::VectorCache<value_type> cache;
+    gko::experimental::distributed::detail::VectorCache<value_type> cache;
 
     ASSERT_EQ(cache.get(), nullptr);
 }
@@ -264,7 +264,8 @@ TYPED_TEST(VectorCache, VectorIsNotCopied)
     using value_type = typename TestFixture::value_type;
     this->cache.init(this->ref, this->comm, this->default_global_size,
                      this->default_local_size);
-    gko::detail::VectorCache<value_type> cache(this->cache);
+    gko::experimental::distributed::detail::VectorCache<value_type> cache(
+        this->cache);
 
     ASSERT_EQ(cache.get(), nullptr);
     GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
@@ -279,7 +280,8 @@ TYPED_TEST(VectorCache, VectorIsNotMoved)
     using value_type = typename TestFixture::value_type;
     this->cache.init(this->ref, this->comm, this->default_global_size,
                      this->default_local_size);
-    gko::detail::VectorCache<value_type> cache(std::move(this->cache));
+    gko::experimental::distributed::detail::VectorCache<value_type> cache(
+        std::move(this->cache));
 
     ASSERT_EQ(cache.get(), nullptr);
     GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(),
@@ -294,7 +296,7 @@ TYPED_TEST(VectorCache, VectorIsNotCopyAssigned)
     using value_type = typename TestFixture::value_type;
     this->cache.init(this->ref, this->comm, this->default_global_size,
                      this->default_local_size);
-    gko::detail::VectorCache<value_type> cache;
+    gko::experimental::distributed::detail::VectorCache<value_type> cache;
     cache = this->cache;
 
     ASSERT_EQ(cache.get(), nullptr);
@@ -310,7 +312,7 @@ TYPED_TEST(VectorCache, VectorIsNotMoveAssigned)
     using value_type = typename TestFixture::value_type;
     this->cache.init(this->ref, this->comm, this->default_global_size,
                      this->default_local_size);
-    gko::detail::VectorCache<value_type> cache;
+    gko::experimental::distributed::detail::VectorCache<value_type> cache;
     cache = std::move(this->cache);
 
     ASSERT_EQ(cache.get(), nullptr);
@@ -318,4 +320,4 @@ TYPED_TEST(VectorCache, VectorIsNotMoveAssigned)
                                 this->default_global_size);
     GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_local_vector()->get_size(),
                                 this->default_local_size);
-}
\ No newline at end of file
+}
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index d3b52949b76..8875b7d46f3 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -327,18 +327,16 @@ namespace distributed {
  * @throws NotSupported  if the input matrix cannot be converted to
  *                       experimental::distributed::Vector<ValueType>
  *
- * @tparam ValueType  the value type into whose associated
- * experimental::distributed::Vector type to convert the input LinOp.
+ * @tparam ValueType  the value type into whose associated Vector type to
+ *                    convert the input LinOp.
  */
 template <typename ValueType>
-detail::temporary_conversion<experimental::distributed::Vector<ValueType>>
-make_temporary_conversion(LinOp* matrix)
+gko::detail::temporary_conversion<Vector<ValueType>> make_temporary_conversion(
+    LinOp* matrix)
 {
-    auto result = detail::temporary_conversion<
-        experimental::distributed::Vector<ValueType>>::
-        template create<
-            experimental::distributed::Vector<next_precision<ValueType>>>(
-            matrix);
+    auto result =
+        gko::detail::temporary_conversion<Vector<ValueType>>::template create<
+            Vector<next_precision<ValueType>>>(matrix);
     if (!result) {
         GKO_NOT_SUPPORTED(matrix);
     }
@@ -350,14 +348,11 @@ make_temporary_conversion(LinOp* matrix)
  * @copydoc make_temporary_conversion
  */
 template <typename ValueType>
-detail::temporary_conversion<const experimental::distributed::Vector<ValueType>>
+gko::detail::temporary_conversion<const Vector<ValueType>>
 make_temporary_conversion(const LinOp* matrix)
 {
-    auto result = detail::temporary_conversion<
-        const experimental::distributed::Vector<ValueType>>::
-        template create<
-            experimental::distributed::Vector<next_precision<ValueType>>>(
-            matrix);
+    auto result = gko::detail::temporary_conversion<const Vector<ValueType>>::
+        template create<Vector<next_precision<ValueType>>>(matrix);
     if (!result) {
         GKO_NOT_SUPPORTED(matrix);
     }
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index de719bb9315..493b9176205 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -437,7 +437,7 @@ class Matrix
      * @return A smart pointer to the newly created matrix.
      */
     template <typename MatrixType,
-              typename = std::enable_if_t<detail::is_matrix_type_builder<
+              typename = std::enable_if_t<gko::detail::is_matrix_type_builder<
                   MatrixType, ValueType, LocalIndexType>::value>>
     static std::unique_ptr<Matrix> create(std::shared_ptr<const Executor> exec,
                                           mpi::communicator comm,
@@ -478,10 +478,10 @@ class Matrix
      */
     template <typename LocalMatrixType, typename NonLocalMatrixType,
               typename = std::enable_if_t<
-                  detail::is_matrix_type_builder<LocalMatrixType, ValueType,
-                                                 LocalIndexType>::value &&
-                  detail::is_matrix_type_builder<NonLocalMatrixType, ValueType,
-                                                 LocalIndexType>::value>>
+                  gko::detail::is_matrix_type_builder<
+                      LocalMatrixType, ValueType, LocalIndexType>::value &&
+                  gko::detail::is_matrix_type_builder<
+                      NonLocalMatrixType, ValueType, LocalIndexType>::value>>
     static std::unique_ptr<Matrix> create(
         std::shared_ptr<const Executor> exec, mpi::communicator comm,
         LocalMatrixType local_matrix_template,
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index 4c7b516abd6..da5ef3dc1ff 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -14,12 +14,12 @@
 
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
-#include <ginkgo/core/base/vector_cache.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
+#include <ginkgo/core/distributed/vector_cache.hpp>
 
 
 namespace gko {
diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp
index e068f29ea26..17b8d0f944b 100644
--- a/include/ginkgo/core/distributed/vector.hpp
+++ b/include/ginkgo/core/distributed/vector.hpp
@@ -22,6 +22,14 @@
 namespace gko {
 namespace experimental {
 namespace distributed {
+namespace detail {
+
+
+template <typename ValueType>
+class VectorCache;
+
+
+}  // namespace detail
 
 
 template <typename LocalIndexType, typename GlobalIndexType>
@@ -65,6 +73,7 @@ class Vector
     friend class Vector<to_complex<ValueType>>;
     friend class Vector<remove_complex<ValueType>>;
     friend class Vector<next_precision<ValueType>>;
+    friend class detail::VectorCache<ValueType>;
 
 public:
     using EnableDistributedLinOp<Vector>::convert_to;
diff --git a/include/ginkgo/core/base/vector_cache.hpp b/include/ginkgo/core/distributed/vector_cache.hpp
similarity index 81%
rename from include/ginkgo/core/base/vector_cache.hpp
rename to include/ginkgo/core/distributed/vector_cache.hpp
index 88fcb22d3ec..ea7ef6afa4e 100644
--- a/include/ginkgo/core/base/vector_cache.hpp
+++ b/include/ginkgo/core/distributed/vector_cache.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
-#define GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_VECTOR_CACHE_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_VECTOR_CACHE_HPP_
 
 
 #include <memory>
@@ -18,6 +18,8 @@
 
 
 namespace gko {
+namespace experimental {
+namespace distributed {
 namespace detail {
 
 
@@ -38,7 +40,7 @@ struct VectorCache {
     VectorCache(VectorCache&&) noexcept {}
     VectorCache& operator=(const VectorCache&) { return *this; }
     VectorCache& operator=(VectorCache&&) noexcept { return *this; }
-    mutable std::unique_ptr<experimental::distributed::Vector<ValueType>> vec{};
+    mutable std::unique_ptr<Vector<ValueType>> vec{};
 
 
     /**
@@ -55,8 +57,7 @@ struct VectorCache {
      * @param template_vec  Defines the configuration (executor, size, stride)
      *                      of the buffered vector.
      */
-    void init_from(
-        const experimental::distributed::Vector<ValueType>* template_vec) const;
+    void init_from(const Vector<ValueType>* template_vec) const;
 
     /**
      * Initializes the buffered vector, if
@@ -79,34 +80,27 @@ struct VectorCache {
      *
      * @return  Reference to the stored vector.
      */
-    experimental::distributed::Vector<ValueType>& operator*() const
-    {
-        return *vec;
-    }
+    Vector<ValueType>& operator*() const { return *vec; }
 
     /**
      * Pointer access to the underlying vector.
      * @return  Pointer to the stored vector.
      */
-    experimental::distributed::Vector<ValueType>* operator->() const
-    {
-        return vec.get();
-    }
+    Vector<ValueType>* operator->() const { return vec.get(); }
 
     /**
      * Pointer access to the underlying vector.
      * @return  Pointer to the stored vector.
      */
-    experimental::distributed::Vector<ValueType>* get() const
-    {
-        return vec.get();
-    }
+    Vector<ValueType>* get() const { return vec.get(); }
 };
 
 
 }  // namespace detail
+}  // namespace distributed
+}  // namespace experimental
 }  // namespace gko
 
 
 #endif  // GINKGO_BUILD_MPI
-#endif  // GKO_PUBLIC_CORE_BASE_VECTOR_CACHE_HPP_
+#endif  // GKO_PUBLIC_CORE_DISTRIBUTED_VECTOR_CACHE_HPP_
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index 6c633b0e13f..bccd3adcd54 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -21,24 +21,22 @@
 
 
 namespace gko {
-namespace detail {
+namespace experimental {
+namespace distributed {
 
 
 template <typename ValueType>
-class VectorCache;
-
-
-}  // namespace detail
+class Vector;
 
 
-namespace experimental {
-namespace distributed {
+namespace detail {
 
 
 template <typename ValueType>
-class Vector;
+class VectorCache;
 
 
+}  // namespace detail
 }  // namespace distributed
 }  // namespace experimental
 
@@ -132,7 +130,7 @@ class Dense
     friend class SparsityCsr<ValueType, int64>;
     friend class Dense<to_complex<ValueType>>;
     friend class experimental::distributed::Vector<ValueType>;
-    friend class gko::detail::VectorCache<ValueType>;
+    friend class experimental::distributed::detail::VectorCache<ValueType>;
 
 public:
     using EnableLinOp<Dense>::convert_to;
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index defe3193157..b36ece34b8d 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -51,7 +51,6 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
-#include <ginkgo/core/base/vector_cache.hpp>
 #include <ginkgo/core/base/version.hpp>
 
 #include <ginkgo/core/config/config.hpp>
@@ -70,6 +69,7 @@
 #include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
 
 #include <ginkgo/core/distributed/vector.hpp>
+#include <ginkgo/core/distributed/vector_cache.hpp>
 
 #include <ginkgo/core/factorization/cholesky.hpp>
 #include <ginkgo/core/factorization/factorization.hpp>

From 908b88ab898a6a40f3900379fb24e60e49968df3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 5 Nov 2024 10:57:48 +0100
Subject: [PATCH 237/448] fix typo in rcm kernel comment

---
 reference/reorder/rcm_kernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp
index ff4bcd70214..5cbce5dc5e3 100644
--- a/reference/reorder/rcm_kernels.cpp
+++ b/reference/reorder/rcm_kernels.cpp
@@ -191,7 +191,7 @@ void compute_permutation(std::shared_ptr<const ReferenceExecutor> exec,
     for (IndexType i = 0; i < num_vertices; ++i) {
         degrees[i] = row_ptrs[i + 1] - row_ptrs[i];
     }
-    // Storing vertices left to proceess.
+    // Storing vertices left to process.
     array<IndexType> linear_queue(exec, num_vertices);
     auto linear_queue_p = linear_queue.get_data();
     IndexType head_offset = 0;

From 4049bb0e19ac262ab28e7c2ab196ff2a69083c2d Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 11 Oct 2024 12:54:00 +0200
Subject: [PATCH 238/448] [ci] add benchmark tests with complex types

---
 benchmark/test/CMakeLists.txt              |  2 +-
 benchmark/test/blas.py                     |  8 ++++++++
 benchmark/test/conversion.py               |  8 ++++++++
 benchmark/test/multi_vector_distributed.py |  9 +++++++++
 benchmark/test/preconditioner.py           |  8 ++++++++
 benchmark/test/solver.py                   |  8 ++++++++
 benchmark/test/solver_distributed.py       | 11 +++++++++++
 benchmark/test/sparse_blas.py              |  9 +++++++++
 benchmark/test/spmv.py                     |  8 ++++++++
 benchmark/test/spmv_distributed.py         |  9 +++++++++
 benchmark/test/test_framework.py.in        | 19 ++++++++++++-------
 11 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt
index 2f43b6eaf71..c0f7bf26fd4 100644
--- a/benchmark/test/CMakeLists.txt
+++ b/benchmark/test/CMakeLists.txt
@@ -9,7 +9,7 @@ function(add_benchmark_test test_name)
                       COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $<TARGET_FILE:${test_name}> --generate
                       COMMENT "Regenerating reference output for ${test_name}"
                       WORKING_DIRECTORY "$<TARGET_FILE_DIR:ginkgo>")
-    add_dependencies(${regenerate_target} ${test_name})
+    add_dependencies(${regenerate_target} ${test_name} ${test_name}_dcomplex)
     add_dependencies(benchmark_test_regenerate ${regenerate_target})
 endfunction()
 add_custom_target(benchmark_test_regenerate)
diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py
index ff5bddc5d08..98a775c4992 100755
--- a/benchmark/test/blas.py
+++ b/benchmark/test/blas.py
@@ -30,3 +30,11 @@
     expected_stdout="blas.profile.stdout",
     expected_stderr="blas.profile.stderr",
 )
+
+# complex
+test_framework.compare_output(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="blas_dcomplex.simple.stdout",
+    expected_stderr="blas_dcomplex.simple.stderr",
+    use_complex=True
+)
\ No newline at end of file
diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py
index 2eada100731..ceca02f708e 100755
--- a/benchmark/test/conversion.py
+++ b/benchmark/test/conversion.py
@@ -67,3 +67,11 @@
     expected_stdout="conversion.profile.stdout",
     expected_stderr="conversion.profile.stderr",
 )
+
+# complex
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"],
+    expected_stdout="conversion_dcomplex.simple.stdout",
+    expected_stderr="conversion_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py
index c62cb8ebd17..aff2e2b21ee 100644
--- a/benchmark/test/multi_vector_distributed.py
+++ b/benchmark/test/multi_vector_distributed.py
@@ -34,3 +34,12 @@
     expected_stderr="multi_vector_distributed.profile.stderr",
     num_procs=3,
 )
+
+# complex
+test_framework.compare_output_distributed(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="multi_vector_distributed_dcomplex.simple.stdout",
+    expected_stderr="multi_vector_distributed_dcomplex.simple.stderr",
+    num_procs=3,
+    use_complex=True
+)
diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py
index 25c4adb8c5a..2d1b3bd2716 100755
--- a/benchmark/test/preconditioner.py
+++ b/benchmark/test/preconditioner.py
@@ -66,3 +66,11 @@
     expected_stderr="preconditioner.reordered.stderr",
     stdin='[{"size": 100, "stencil": "7pt"}]',
 )
+
+# complex
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="preconditioner_dcomplex.simple.stdout",
+    expected_stderr="preconditioner_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py
index 5dd1d840a4e..5ce0002df2e 100755
--- a/benchmark/test/solver.py
+++ b/benchmark/test/solver.py
@@ -51,3 +51,11 @@
     expected_stderr="solver.reordered.stderr",
     stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]',
 )
+
+# complex input
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'],
+    expected_stdout="solver_dcomplex.simple.stdout",
+    expected_stderr="solver_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py
index 54bbb030077..6c6efb653ef 100644
--- a/benchmark/test/solver_distributed.py
+++ b/benchmark/test/solver_distributed.py
@@ -46,3 +46,14 @@
     expected_stdout="distributed_solver.profile.stdout",
     expected_stderr="distributed_solver.profile.stderr",
 )
+
+# complex
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]',
+    ],
+    expected_stdout="distributed_solver_dcomplex.simple.stdout",
+    expected_stderr="distributed_solver_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py
index 8e6cda3c9bd..05ee84aade0 100755
--- a/benchmark/test/sparse_blas.py
+++ b/benchmark/test/sparse_blas.py
@@ -64,3 +64,12 @@
     expected_stderr="sparse_blas.reordered.stderr",
     stdin='[{"size": 100, "stencil": "7pt"}]',
 )
+
+# complex
+test_framework.compare_output(
+    ["-operations", "transpose", "-input",
+        '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="sparse_blas_dcomplex.simple.stdout",
+    expected_stderr="sparse_blas_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py
index f6f4a4b5c39..d447490802a 100755
--- a/benchmark/test/spmv.py
+++ b/benchmark/test/spmv.py
@@ -51,3 +51,11 @@
     expected_stderr="spmv.reordered.stderr",
     stdin='[{"size": 100, "stencil": "7pt"}]',
 )
+
+# complex
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="spmv_dcomplex.simple.stdout",
+    expected_stderr="spmv_dcomplex.simple.stderr",
+    use_complex=True
+)
diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py
index 356db48459e..11922f7ae8d 100644
--- a/benchmark/test/spmv_distributed.py
+++ b/benchmark/test/spmv_distributed.py
@@ -40,3 +40,12 @@
     expected_stderr="spmv_distributed.profile.stderr",
     num_procs=3,
 )
+
+# complex
+test_framework.compare_output_distributed(
+    ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'],
+    expected_stdout="spmv_distributed_dcomplex.simple.stdout",
+    expected_stderr="spmv_distributed_dcomplex.simple.stderr",
+    num_procs=3,
+    use_complex=True
+)
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 9294b2f02ec..9f2e5718c05 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -135,8 +135,11 @@ def compare_output_impl(
     expected_stderr: str,
     stdin: str,
     launcher_flags: List[str],
+    use_complex: bool
 ):
-    args = [sys.argv[1]] + args
+    base_file = sys.argv[1]
+    file = base_file if not use_complex else f"{base_file}_dcomplex"
+    args = [file] + args
     expected_stdout = str(sourcepath / "reference" / expected_stdout)
     expected_stderr = str(sourcepath / "reference" / expected_stderr)
     result = subprocess.run(
@@ -211,7 +214,7 @@ def compare_output_impl(
 
 
 def compare_output(
-    args: List[str], expected_stdout: str, expected_stderr: str, stdin: str = ""
+    args: List[str], *, expected_stdout: str, expected_stderr: str, stdin: str = "", use_complex: bool = False
 ):
     compare_output_impl(
         args,
@@ -219,16 +222,18 @@ def compare_output(
         expected_stderr=expected_stderr,
         stdin=stdin,
         launcher_flags=[],
+        use_complex=use_complex
     )
 
 
 def compare_output_distributed(
-    args, expected_stdout, expected_stderr, num_procs, stdin=""
+    args: List[str], *, expected_stdout: str, expected_stderr: str, num_procs: int, stdin: str = "", use_complex: bool = False
 ):
     compare_output_impl(
         args,
-        expected_stdout,
-        expected_stderr,
-        stdin,
-        ["@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)],
+        expected_stdout=expected_stdout,
+        expected_stderr=expected_stderr,
+        stdin=stdin,
+        launcher_flags=["@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)],
+        use_complex=use_complex
     )

From 022cdd18ee2f27b3432042ef4fa5461b2e7509ea Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 11 Oct 2024 12:55:52 +0200
Subject: [PATCH 239/448] [ci] add generated benchmark test reference output for complex types

---
 .../reference/blas_dcomplex.simple.stderr     |  8 +++
 .../reference/blas_dcomplex.simple.stdout     | 28 +++++++++
 .../conversion_dcomplex.simple.stderr         | 10 ++++
 .../conversion_dcomplex.simple.stdout         | 31 ++++++++++
 .../distributed_solver_dcomplex.simple.stderr |  8 +++
 .../distributed_solver_dcomplex.simple.stdout | 59 +++++++++++++++++++
 ..._vector_distributed_dcomplex.simple.stderr |  8 +++
 ..._vector_distributed_dcomplex.simple.stdout | 28 +++++++++
 .../preconditioner_dcomplex.simple.stderr     |  7 +++
 .../preconditioner_dcomplex.simple.stdout     | 32 ++++++++++
 .../reference/solver_dcomplex.simple.stderr   |  8 +++
 .../reference/solver_dcomplex.simple.stdout   | 56 ++++++++++++++++++
 .../sparse_blas_dcomplex.simple.stderr        |  7 +++
 .../sparse_blas_dcomplex.simple.stdout        | 25 ++++++++
 .../reference/spmv_dcomplex.simple.stderr     |  8 +++
 .../reference/spmv_dcomplex.simple.stdout     | 21 +++++++
 .../spmv_distributed_dcomplex.simple.stderr   |  8 +++
 .../spmv_distributed_dcomplex.simple.stdout   | 22 +++++++
 18 files changed, 374 insertions(+)
 create mode 100644 benchmark/test/reference/blas_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/blas_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/conversion_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/conversion_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/distributed_solver_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/distributed_solver_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/preconditioner_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/preconditioner_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/solver_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/solver_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/sparse_blas_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/sparse_blas_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/spmv_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/spmv_dcomplex.simple.stdout
 create mode 100644 benchmark/test/reference/spmv_distributed_dcomplex.simple.stderr
 create mode 100644 benchmark/test/reference/spmv_distributed_dcomplex.simple.stdout

diff --git a/benchmark/test/reference/blas_dcomplex.simple.stderr b/benchmark/test/reference/blas_dcomplex.simple.stderr
new file mode 100644
index 00000000000..ff505a3f1c9
--- /dev/null
+++ b/benchmark/test/reference/blas_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/blas_dcomplex.simple.stdout b/benchmark/test/reference/blas_dcomplex.simple.stdout
new file mode 100644
index 00000000000..54745d81104
--- /dev/null
+++ b/benchmark/test/reference/blas_dcomplex.simple.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/conversion_dcomplex.simple.stderr b/benchmark/test/reference/conversion_dcomplex.simple.stderr
new file mode 100644
index 00000000000..23a27c4372a
--- /dev/null
+++ b/benchmark/test/reference/conversion_dcomplex.simple.stderr
@@ -0,0 +1,10 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo,csr
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
diff --git a/benchmark/test/reference/conversion_dcomplex.simple.stdout b/benchmark/test/reference/conversion_dcomplex.simple.stdout
new file mode 100644
index 00000000000..91b69b8a248
--- /dev/null
+++ b/benchmark/test/reference/conversion_dcomplex.simple.stdout
@@ -0,0 +1,31 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-coo": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/distributed_solver_dcomplex.simple.stderr b/benchmark/test/reference/distributed_solver_dcomplex.simple.stderr
new file mode 100644
index 00000000000..9d4b1f7094e
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+Matrix is of size (125, 125)
+	Running solver: cg
diff --git a/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout b/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout
new file mode 100644
index 00000000000..458115e6ab2
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout
@@ -0,0 +1,59 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "optimal": {
+            "spmv": "csr-csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "dense::row_gather": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_squared_norm2": 1.0,
+                        "dense::compute_sqrt": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stderr b/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stderr
new file mode 100644
index 00000000000..ff505a3f1c9
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stdout b/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stdout
new file mode 100644
index 00000000000..54745d81104
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed_dcomplex.simple.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/preconditioner_dcomplex.simple.stderr b/benchmark/test/reference/preconditioner_dcomplex.simple.stderr
new file mode 100644
index 00000000000..d36bc663e57
--- /dev/null
+++ b/benchmark/test/reference/preconditioner_dcomplex.simple.stderr
@@ -0,0 +1,7 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+Running with preconditioners: none
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner_dcomplex.simple.stdout b/benchmark/test/reference/preconditioner_dcomplex.simple.stdout
new file mode 100644
index 00000000000..ed567dcbb13
--- /dev/null
+++ b/benchmark/test/reference/preconditioner_dcomplex.simple.stdout
@@ -0,0 +1,32 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "preconditioner": {
+            "none": {
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/solver_dcomplex.simple.stderr b/benchmark/test/reference/solver_dcomplex.simple.stderr
new file mode 100644
index 00000000000..6baa84ee792
--- /dev/null
+++ b/benchmark/test/reference/solver_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125)
+	Running solver: cg
diff --git a/benchmark/test/reference/solver_dcomplex.simple.stdout b/benchmark/test/reference/solver_dcomplex.simple.stdout
new file mode 100644
index 00000000000..0ee0e4b9a4b
--- /dev/null
+++ b/benchmark/test/reference/solver_dcomplex.simple.stdout
@@ -0,0 +1,56 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "optimal": {
+            "spmv": "csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_norm2_dispatch": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/sparse_blas_dcomplex.simple.stderr b/benchmark/test/reference/sparse_blas_dcomplex.simple.stderr
new file mode 100644
index 00000000000..d4e29cd9cd7
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas_dcomplex.simple.stderr
@@ -0,0 +1,7 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are transpose
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/sparse_blas_dcomplex.simple.stdout b/benchmark/test/reference/sparse_blas_dcomplex.simple.stdout
new file mode 100644
index 00000000000..a44f4f189b2
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas_dcomplex.simple.stdout
@@ -0,0 +1,25 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "sparse_blas": {
+            "transpose": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "components": {
+                    "allocate": 1.0,
+                    "components::fill_array": 1.0,
+                    "csr::transpose": 1.0,
+                    "free": 1.0,
+                    "overhead": 1.0
+                },
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/spmv_dcomplex.simple.stderr b/benchmark/test/reference/spmv_dcomplex.simple.stderr
new file mode 100644
index 00000000000..a1f6a62e866
--- /dev/null
+++ b/benchmark/test/reference/spmv_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv_dcomplex.simple.stdout b/benchmark/test/reference/spmv_dcomplex.simple.stdout
new file mode 100644
index 00000000000..ea0944bfd25
--- /dev/null
+++ b/benchmark/test/reference/spmv_dcomplex.simple.stdout
@@ -0,0 +1,21 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "spmv": {
+            "coo": {
+                "storage": 17400,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
+        "optimal": {
+            "spmv": "coo"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv_distributed_dcomplex.simple.stderr b/benchmark/test/reference/spmv_distributed_dcomplex.simple.stderr
new file mode 100644
index 00000000000..b3739ed8774
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed_dcomplex.simple.stderr
@@ -0,0 +1,8 @@
+Running on ReferenceExecutor
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are [csr]x[csr]
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+Matrix is of size (81, 81), 144
+	Running spmv: csr-csr
diff --git a/benchmark/test/reference/spmv_distributed_dcomplex.simple.stdout b/benchmark/test/reference/spmv_distributed_dcomplex.simple.stdout
new file mode 100644
index 00000000000..779739d7d6c
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed_dcomplex.simple.stdout
@@ -0,0 +1,22 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "spmv": {
+            "csr-csr": {
+                "storage": 9972,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 81,
+        "cols": 81,
+        "nonzeros": 144,
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]

From 7199ff0c174ea93b02e847091b29533a96c1f57a Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 11 Oct 2024 12:57:08 +0200
Subject: [PATCH 240/448] [bench] fix residual norm logger for complex

---
 benchmark/utils/loggers.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index 65d086beecb..ec6f65413c3 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -156,14 +156,14 @@ struct ResidualLogger : gko::log::Logger {
             rec_res_norms->push_back(
                 get_norm(gko::as<vec<rc_vtype>>(residual_norm)));
         } else {
-            gko::detail::vector_dispatch<rc_vtype>(
+            gko::detail::vector_dispatch<ValueType>(
                 residual, [&](const auto v_residual) {
                     rec_res_norms->push_back(compute_norm2(v_residual));
                 });
         }
         if (solution) {
             gko::detail::vector_dispatch<
-                rc_vtype>(solution, [&](auto v_solution) {
+                ValueType>(solution, [&](auto v_solution) {
                 using concrete_type =
                     std::remove_pointer_t<std::decay_t<decltype(v_solution)>>;
                 true_res_norms->push_back(compute_residual_norm(
@@ -174,7 +174,9 @@ struct ResidualLogger : gko::log::Logger {
         }
         if (implicit_sq_residual_norm) {
             implicit_res_norms->push_back(std::sqrt(
-                get_norm(gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))));
+                get_norm(gko::as<vec<ValueType>>(implicit_sq_residual_norm)
+                             ->compute_absolute()
+                             .get())));
             has_implicit_res_norm = true;
         } else {
             implicit_res_norms->push_back(-1.0);

From 233904243511d8f954cb0fe88054aec3a577d583 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 14 Oct 2024 09:18:39 +0200
Subject: [PATCH 241/448] fixup! [ci] add benchmark tests with complex types

---
 benchmark/test/test_framework.py.in | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 9f2e5718c05..725f7f036c9 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -138,7 +138,10 @@ def compare_output_impl(
     use_complex: bool
 ):
     base_file = sys.argv[1]
-    file = base_file if not use_complex else f"{base_file}_dcomplex"
+    if base_file.endswith(".exe"):
+        file = base_file if not use_complex else base_file.replace(".exe", "_dcomplex.exe")
+    else:
+        file = base_file if not use_complex else f"{base_file}_dcomplex"
     args = [file] + args
     expected_stdout = str(sourcepath / "reference" / expected_stdout)
     expected_stderr = str(sourcepath / "reference" / expected_stderr)

From f0608bf4c0b7e2408e9fd75102664ff2b9e2f13a Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 23 Oct 2024 16:57:50 +0200
Subject: [PATCH 242/448] review updates:

- fix formatting

Co-authored-by: Yu-Hsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com>
---
 benchmark/test/blas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py
index 98a775c4992..3ebce7d6444 100755
--- a/benchmark/test/blas.py
+++ b/benchmark/test/blas.py
@@ -37,4 +37,4 @@
     expected_stdout="blas_dcomplex.simple.stdout",
     expected_stderr="blas_dcomplex.simple.stderr",
     use_complex=True
-)
\ No newline at end of file
+)

From e1505a2045455b96416d866f7b1fa8bc49faaaa2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 7 Nov 2024 18:02:22 +0100
Subject: [PATCH 243/448] run Intel CI jobs on cluster

---
 .gitlab-ci.yml    | 2 --
 .gitlab/image.yml | 9 +++------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cc67883c4b3..18771d9bc2d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -344,7 +344,6 @@ build/icpx20231/igpu/release/shared:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .disable_job_condition
     - .use_gko-oneapi20231-igpu
   variables:
     CXX_COMPILER: "icpx"
@@ -378,7 +377,6 @@ build/icpx/igpu/release/static:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
-    - .disable_job_condition
     - .use_gko-oneapi-igpu
   variables:
     CXX_COMPILER: "dpcpp"
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 2295f6312ae..c894d439723 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -50,17 +50,14 @@
 .use_gko-oneapi-igpu:
   image: ginkgohub/oneapi:latest
   tags:
-    - private_ci
-    - intel-igpu
+    - intel-gpus
 
 .use_gko-oneapi20231-igpu:
   image: ginkgohub/spack-oneapi:20231-openmpi
   tags:
-    - private_ci
-    - intel-igpu
+    - intel-gpus
 
 .use_gko-oneapi-dgpu:
   image: ginkgohub/oneapi:latest
   tags:
-    - private_ci
-    - intel-dgpu
+    - intel-gpus

From 10e1f45f2c6641fbd4e917fbf4530148433c8372 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 8 Nov 2024 00:52:36 +0100
Subject: [PATCH 244/448] fix slow OpenMP example execution

---
 examples/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 762a3e33208..90c1f8e2632 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -101,6 +101,8 @@ if(GINKGO_BUILD_TESTS)
                      "${executor}"
                      WORKING_DIRECTORY
                      "${CMAKE_CURRENT_SOURCE_DIR}/${example}")
+            # Prevent performance issues with high core counts
+            set_property(TEST example_${example}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
         endforeach()
     endforeach()
 
@@ -115,6 +117,8 @@ if(GINKGO_BUILD_TESTS)
                      "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
                      WORKING_DIRECTORY
                      "$<TARGET_FILE_DIR:ginkgo>")
+            # Prevent performance issues with high core counts
+            set_property(TEST example_file-config-solver_${config_name}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
         endforeach()    
     endforeach()
         

From d1bca9d9d0c58917295ec6cfd720ca4a34ddd4b4 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 11 Nov 2024 23:49:07 +0100
Subject: [PATCH 245/448] fix factorization test precision

---
 test/factorization/factorization_kernels.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/factorization/factorization_kernels.cpp b/test/factorization/factorization_kernels.cpp
index 7887d83e0f7..81e7105e065 100644
--- a/test/factorization/factorization_kernels.cpp
+++ b/test/factorization/factorization_kernels.cpp
@@ -53,7 +53,7 @@ TEST_F(Factorization, InitializeRowPtrsLSameAsRef)
 }
 
 
-TEST_F(Factorization, InitializeLWithoutSqrtSameAsRef)
+TEST_F(Factorization, InitializeLSameAsRef)
 {
     gko::array<index_type> l_ptrs{ref, mtx->get_size()[0] + 1};
     gko::kernels::reference::factorization::initialize_row_ptrs_l(
@@ -73,6 +73,7 @@ TEST_F(Factorization, InitializeLWithoutSqrtSameAsRef)
         gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l(
             exec, dmtx.get(), dl_mtx.get(), diag_sqrt);
 
-        GKO_ASSERT_MTX_NEAR(l_mtx, dl_mtx, 0.0);
+        GKO_ASSERT_MTX_NEAR(l_mtx, dl_mtx,
+                            diag_sqrt ? r<value_type>::value : 0.0);
     }
 }

From 31f24538d1ea9d45ca8ff672ab8f1b91a22c78d5 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 12 Nov 2024 00:41:17 +0100
Subject: [PATCH 246/448] check sparsity pattern of SOR initialize kernels

---
 test/preconditioner/sor_kernels.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/test/preconditioner/sor_kernels.cpp b/test/preconditioner/sor_kernels.cpp
index cd12514bb28..f900eadf105 100644
--- a/test/preconditioner/sor_kernels.cpp
+++ b/test/preconditioner/sor_kernels.cpp
@@ -46,11 +46,17 @@ class Sor : public CommonTestFixture {
         d_mtx->read(md);
 
         result_l->read(md_l);
-        result_l->scale(gko::initialize<Dense>({0.0}, ref));
+        std::fill_n(result_l->get_col_idxs(),
+                    result_l->get_num_stored_elements(), -1);
+        std::fill_n(result_l->get_values(), result_l->get_num_stored_elements(),
+                    gko::nan<value_type>());
         d_result_l = gko::clone(exec, result_l);
 
         result_u->read(md_u);
-        result_u->scale(gko::initialize<Dense>({0.0}, ref));
+        std::fill_n(result_u->get_col_idxs(),
+                    result_u->get_num_stored_elements(), -1);
+        std::fill_n(result_u->get_values(), result_u->get_num_stored_elements(),
+                    gko::nan<value_type>());
         d_result_u = gko::clone(exec, result_u);
     }
 
@@ -73,6 +79,7 @@ TEST_F(Sor, InitializeWeightedLFactorIsSameAsReference)
     gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l(
         exec, d_mtx.get(), 1.24, d_result_l.get());
 
+    GKO_ASSERT_MTX_EQ_SPARSITY(result_l, d_result_l);
     GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
 }
 
@@ -84,6 +91,8 @@ TEST_F(Sor, InitializeWeightedLAndUFactorIsSameAsReference)
     gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l_u(
         exec, d_mtx.get(), 1.24, d_result_l.get(), d_result_u.get());
 
+    GKO_ASSERT_MTX_EQ_SPARSITY(result_l, d_result_l);
+    GKO_ASSERT_MTX_EQ_SPARSITY(result_u, d_result_u);
     GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(result_u, d_result_u, r<value_type>::value);
 }

From b9a5bc2887af4f0a35034170d91a73c7da956e26 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 12 Nov 2024 01:07:50 +0100
Subject: [PATCH 247/448] fix kernel parameter passing for SOR DPCPP kernels

---
 dpcpp/preconditioner/sor_kernels.dp.cpp | 30 ++++++++++++++++---------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
index fe796586591..4af676288bd 100644
--- a/dpcpp/preconditioner/sor_kernels.dp.cpp
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -31,14 +31,18 @@ void initialize_weighted_l(
                         1, 1};
 
     auto inv_weight = one(weight) / weight;
+    const auto in_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto in_col_idxs = system_matrix->get_const_col_idxs();
+    const auto in_values = system_matrix->get_const_values();
+    const auto l_row_ptrs = l_mtx->get_const_row_ptrs();
+    const auto l_col_idxs = l_mtx->get_col_idxs();
+    const auto l_values = l_mtx->get_values();
 
     exec->get_queue()->parallel_for(
         sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
             factorization::helpers::initialize_l(
-                num_rows, system_matrix->get_const_row_ptrs(),
-                system_matrix->get_const_col_idxs(),
-                system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
-                l_mtx->get_col_idxs(), l_mtx->get_values(),
+                num_rows, in_row_ptrs, in_col_idxs, in_values, l_row_ptrs,
+                l_col_idxs, l_values,
                 factorization::helpers::triangular_mtx_closure(
                     [inv_weight](auto val) { return val * inv_weight; },
                     factorization::helpers::identity{}),
@@ -67,15 +71,21 @@ void initialize_weighted_l_u(
     auto inv_two_minus_weight =
         one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
 
+    const auto in_row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto in_col_idxs = system_matrix->get_const_col_idxs();
+    const auto in_values = system_matrix->get_const_values();
+    const auto l_row_ptrs = l_mtx->get_const_row_ptrs();
+    const auto l_col_idxs = l_mtx->get_col_idxs();
+    const auto l_values = l_mtx->get_values();
+    const auto u_row_ptrs = u_mtx->get_const_row_ptrs();
+    const auto u_col_idxs = u_mtx->get_col_idxs();
+    const auto u_values = u_mtx->get_values();
+
     exec->get_queue()->parallel_for(
         sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
             factorization::helpers::initialize_l_u(
-                num_rows, system_matrix->get_const_row_ptrs(),
-                system_matrix->get_const_col_idxs(),
-                system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
-                l_mtx->get_col_idxs(), l_mtx->get_values(),
-                u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
-                u_mtx->get_values(),
+                num_rows, in_row_ptrs, in_col_idxs, in_values, l_row_ptrs,
+                l_col_idxs, l_values, u_row_ptrs, u_col_idxs, u_values,
                 factorization::helpers::triangular_mtx_closure(
                     [inv_weight](auto val) { return val * inv_weight; },
                     factorization::helpers::identity{}),

From ce0e94a196b063b92392360fd5289ba5709b411e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 12 Nov 2024 22:32:52 +0100
Subject: [PATCH 248/448] fix CI config

---
 .gitlab-ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 18771d9bc2d..6c3ad167880 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -339,7 +339,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static:
     MIXED_PRECISION: "OFF"
 
 # spack oneapi 2023.1
-build/icpx20231/igpu/release/shared:
+build/icpx20231/gpu/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
@@ -372,7 +372,7 @@ build/icpx20231/igpu/release/shared:
 #     ONEAPI_DEVICE_SELECTOR: "level_zero:gpu"
 
 # It gives two available backends of GPU on tests
-build/icpx/igpu/release/static:
+build/icpx/gpu/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
@@ -383,7 +383,7 @@ build/icpx/igpu/release/static:
     CXX_FLAGS: "-Wpedantic -ffp-model=precise"
     BUILD_SYCL: "ON"
     BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OF"
+    BUILD_SHARED_LIBS: "OFF"
     DPCPP_SINGLE_MODE: "ON"
     ONEAPI_DEVICE_SELECTOR: "*:gpu"
     BUILD_HWLOC: "OFF"

From 0b5158ac61baf413e157657d79e94a8f0bb4198f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 13 Nov 2024 09:47:42 +0100
Subject: [PATCH 249/448] disable static oneAPI builds

---
 .gitlab-ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6c3ad167880..445f15d3c86 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -372,7 +372,7 @@ build/icpx20231/gpu/release/shared:
 #     ONEAPI_DEVICE_SELECTOR: "level_zero:gpu"
 
 # It gives two available backends of GPU on tests
-build/icpx/gpu/release/static:
+build/dpcpp/gpu/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
@@ -383,7 +383,8 @@ build/icpx/gpu/release/static:
     CXX_FLAGS: "-Wpedantic -ffp-model=precise"
     BUILD_SYCL: "ON"
     BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OFF"
+    # static builds take too long
+    BUILD_SHARED_LIBS: "ON"
     DPCPP_SINGLE_MODE: "ON"
     ONEAPI_DEVICE_SELECTOR: "*:gpu"
     BUILD_HWLOC: "OFF"

From 17fde79a3b916b4b8e5c95f366020215d24e3b77 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 8 Apr 2024 18:04:56 +0200
Subject: [PATCH 250/448] create a separate context for each Intel device.
 Otherwise, we get a -999 Unknown PI error after the second device. Ref:
 https://github.com/intel/llvm/issues/10982

---
 dpcpp/base/executor.dp.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 863b8aec211..2ac292b4f60 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -279,7 +279,11 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
     // `wait()` would be needed after every call to a DPC++ function or kernel.
     // For example, without `in_order`, doing a copy, a kernel, and a copy, will
     // not necessarily happen in that order by default, which we need to avoid.
-    auto* queue = new sycl::queue{device, detail::get_property_list(property)};
+    // We need to create the context for each device. Otherwise, we get -999
+    // Unknown PI error after second device.
+    // Ref: https://github.com/intel/llvm/issues/10982
+    auto* queue = new sycl::queue{sycl::context(device), device,
+                                  detail::get_property_list(property)};
     queue_ = std::move(queue_manager<sycl::queue>{queue, detail::delete_queue});
 }
 

From 1dee5a9610e1f466c96caada5d35ec5a1d7075e5 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 29 Oct 2024 16:10:44 +0000
Subject: [PATCH 251/448] [sycl] don't use deprecated `release_matrix_handle`
 and `set_csr_data`

---
 benchmark/utils/dpcpp_linops.dp.cpp | 15 +++++++++------
 dpcpp/matrix/csr_kernels.dp.cpp     |  5 +++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp
index f91de85db2a..be705686546 100644
--- a/benchmark/utils/dpcpp_linops.dp.cpp
+++ b/benchmark/utils/dpcpp_linops.dp.cpp
@@ -47,12 +47,13 @@ class OnemklBase : public gko::LinOp {
         GKO_NOT_IMPLEMENTED;
     }
 
-    void initialize_mat_handle()
+    void initialize_mat_handle(std::shared_ptr<const gko::DpcppExecutor> exec)
     {
         mat_handle_ = handle_manager<oneapi::mkl::sparse::matrix_handle>(
             create_mat_handle(),
-            [](oneapi::mkl::sparse::matrix_handle_t mat_handle) {
-                oneapi::mkl::sparse::release_matrix_handle(&mat_handle);
+            [exec](oneapi::mkl::sparse::matrix_handle_t mat_handle) {
+                oneapi::mkl::sparse::release_matrix_handle(*exec->get_queue(),
+                                                           &mat_handle);
             });
     }
 
@@ -63,7 +64,8 @@ class OnemklBase : public gko::LinOp {
         if (this->get_device_exec() == nullptr) {
             GKO_NOT_IMPLEMENTED;
         }
-        this->initialize_mat_handle();
+        this->initialize_mat_handle(
+            this->get_device_exec());
     }
 
     ~OnemklBase() = default;
@@ -74,7 +76,7 @@ class OnemklBase : public gko::LinOp {
     {
         if (this != &other) {
             gko::LinOp::operator=(other);
-            this->initialize_mat_handle();
+            this->initialize_mat_handle(this->get_device_exec());
         }
         return *this;
     }
@@ -106,7 +108,8 @@ class OnemklCsr
         this->set_size(csr_->get_size());
 
         oneapi::mkl::sparse::set_csr_data(
-            this->get_mat_handle(), static_cast<int>(this->get_size()[0]),
+            *(this->get_device_exec()->get_queue()), this->get_mat_handle(),
+            static_cast<int>(this->get_size()[0]),
             static_cast<int>(this->get_size()[1]),
             oneapi::mkl::index_base::zero, csr_->get_row_ptrs(),
             csr_->get_col_idxs(), csr_->get_values());
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 7e5d0229c86..312f0e56ad3 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1398,7 +1398,7 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DpcppExecutor> exec,
         oneapi::mkl::sparse::matrix_handle_t mat_handle;
         oneapi::mkl::sparse::init_matrix_handle(&mat_handle);
         oneapi::mkl::sparse::set_csr_data(
-            mat_handle, IndexType(a->get_size()[0]),
+            *exec->get_queue(), mat_handle, IndexType(a->get_size()[0]),
             IndexType(a->get_size()[1]), oneapi::mkl::index_base::zero,
             const_cast<IndexType*>(a->get_const_row_ptrs()),
             const_cast<IndexType*>(a->get_const_col_idxs()),
@@ -1417,7 +1417,8 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DpcppExecutor> exec,
                 const_cast<ValueType*>(b->get_const_values()), b->get_size()[1],
                 b->get_stride(), host_beta, c->get_values(), c->get_stride());
         }
-        oneapi::mkl::sparse::release_matrix_handle(&mat_handle);
+        oneapi::mkl::sparse::release_matrix_handle(*exec->get_queue(),
+                                                   &mat_handle);
     }
     return try_sparselib;
 }

From 798ae28787e95e46e11bb2be3ce735463a12ec65 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 18:44:34 +0100
Subject: [PATCH 252/448] [sycl] remove deprecated `is_host()`

---
 dpcpp/base/executor.dp.cpp | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 2ac292b4f60..5abb745a91d 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -50,7 +50,7 @@ bool OmpExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const
 {
     auto device = detail::get_devices(
         dest_exec->get_device_type())[dest_exec->get_device_id()];
-    return device.is_host() || device.is_cpu();
+    return device.is_cpu();
 }
 
 
@@ -88,7 +88,7 @@ void DpcppExecutor::raw_free(void* ptr) const noexcept
 #endif  // GKO_VERBOSE_LEVEL >= 1
         // OpenCL error code use 0 for CL_SUCCESS and negative number for others
         // error. if the error is not from OpenCL, it will return CL_SUCCESS.
-        int err_code = err.get_cl_code();
+        int err_code = err.code().value();
         // if return CL_SUCCESS, exit 1 as DPCPP error.
         if (err_code == 0) {
             err_code = 1;
@@ -142,8 +142,7 @@ void DpcppExecutor::raw_copy_to(const DpcppExecutor* dest, size_type num_bytes,
         auto dest_queue = dest->get_queue();
         auto device = queue->get_device();
         auto dest_device = dest_queue->get_device();
-        if (((device.is_host() || device.is_cpu()) &&
-             (dest_device.is_host() || dest_device.is_cpu())) ||
+        if ((device.is_cpu() && dest_device.is_cpu()) ||
             (queue == dest_queue)) {
             dest->get_queue()->memcpy(dest_ptr, src_ptr, num_bytes).wait();
         } else {
@@ -183,7 +182,7 @@ bool DpcppExecutor::verify_memory_to(const OmpExecutor* dest_exec) const
 {
     auto device = detail::get_devices(
         get_exec_info().device_type)[get_exec_info().device_id];
-    return device.is_host() || device.is_cpu();
+    return device.is_cpu();
 }
 
 bool DpcppExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const
@@ -195,9 +194,7 @@ bool DpcppExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const
     auto dest_queue = dest_exec->get_queue();
     auto device = queue->get_device();
     auto dest_device = dest_queue->get_device();
-    return ((device.is_host() || device.is_cpu()) &&
-            (dest_device.is_host() || dest_device.is_cpu())) ||
-           (queue == dest_queue);
+    return (device.is_cpu() && dest_device.is_cpu()) || (queue == dest_queue);
 }
 
 
@@ -234,16 +231,14 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
            DpcppExecutor::get_num_devices(this->get_exec_info().device_type));
     auto device = detail::get_devices(
         this->get_exec_info().device_type)[this->get_exec_info().device_id];
-    if (!device.is_host()) {
-        try {
-            auto subgroup_sizes =
-                device.get_info<sycl::info::device::sub_group_sizes>();
-            for (auto& i : subgroup_sizes) {
-                this->get_exec_info().subgroup_sizes.push_back(i);
-            }
-        } catch (sycl::exception& err) {
-            GKO_NOT_SUPPORTED(device);
+    try {
+        auto subgroup_sizes =
+            device.get_info<sycl::info::device::sub_group_sizes>();
+        for (auto& i : subgroup_sizes) {
+            this->get_exec_info().subgroup_sizes.push_back(i);
         }
+    } catch (sycl::exception& err) {
+        GKO_NOT_SUPPORTED(device);
     }
     this->get_exec_info().num_computing_units = static_cast<int>(
         device.get_info<sycl::info::device::max_compute_units>());

From c3585ce1f7a79dc9448474be2283b0c75daa7413 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 18:44:49 +0100
Subject: [PATCH 253/448] [sycl] use group algorithms instead of sub_group
 member functions, which are removed in oneapi-2025.0.0

---
 dpcpp/components/cooperative_groups.dp.hpp    | 14 ++++++++------
 dpcpp/preconditioner/batch_jacobi_kernels.hpp | 15 ++++++++-------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp
index 034bf4baf28..8a2fb4d11c0 100644
--- a/dpcpp/components/cooperative_groups.dp.hpp
+++ b/dpcpp/components/cooperative_groups.dp.hpp
@@ -162,20 +162,20 @@ class thread_block_tile : public sycl::sub_group {
     __dpct_inline__ unsigned size() const noexcept { return Size; }
 
     __dpct_inline__ void sync() const noexcept { this->barrier(); }
-
 #define GKO_BIND_SHFL(ShflOpName, ShflOp)                                      \
     template <typename ValueType, typename SelectorType>                       \
     __dpct_inline__ ValueType ShflOpName(ValueType var, SelectorType selector) \
         const noexcept                                                         \
     {                                                                          \
-        return this->ShflOp(var, selector);                                    \
+        return sycl::ShflOp(static_cast<sycl::sub_group>(*this), var,          \
+                            selector);                                         \
     }                                                                          \
     static_assert(true,                                                        \
                   "This assert is used to counter the false positive extra "   \
                   "semi-colon warnings")
 
-    GKO_BIND_SHFL(shfl, shuffle);
-    GKO_BIND_SHFL(shfl_xor, shuffle_xor);
+    GKO_BIND_SHFL(shfl, select_from_group);
+    GKO_BIND_SHFL(shfl_xor, permute_group_by_xor);
 
     // the shfl_up of out-of-range value gives undefined behavior, we
     // manually set it as the original value such that give the same result as
@@ -184,7 +184,8 @@ class thread_block_tile : public sycl::sub_group {
     __dpct_inline__ ValueType shfl_up(ValueType var,
                                       SelectorType selector) const noexcept
     {
-        const auto result = this->shuffle_up(var, selector);
+        const auto result = sycl::shift_group_right(
+            static_cast<sycl::sub_group>(*this), var, selector);
         return (data_.rank < selector) ? var : result;
     }
 
@@ -195,7 +196,8 @@ class thread_block_tile : public sycl::sub_group {
     __dpct_inline__ ValueType shfl_down(ValueType var,
                                         SelectorType selector) const noexcept
     {
-        const auto result = this->shuffle_down(var, selector);
+        const auto result = sycl::shift_group_left(
+            static_cast<sycl::sub_group>(*this), var, selector);
         return (data_.rank + selector >= Size) ? var : result;
     }
 
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
index 769ebc47a57..dd684350375 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
@@ -99,15 +99,15 @@ __dpct_inline__ int choose_pivot(const int block_size,
     sg.barrier();
     int my_piv_idx = sg_tid;
     for (int a = sg_size / 2; a > 0; a /= 2) {
-        const real_type abs_ele_other = sg.shuffle_down(my_abs_ele, a);
-        const int piv_idx_other = sg.shuffle_down(my_piv_idx, a);
+        const real_type abs_ele_other = shift_group_left(sg, my_abs_ele, a);
+        const int piv_idx_other = shift_group_left(sg, my_piv_idx, a);
         if (my_abs_ele < abs_ele_other) {
             my_abs_ele = abs_ele_other;
             my_piv_idx = piv_idx_other;
         }
     }
     sg.barrier();
-    const int ipiv = sg.shuffle(my_piv_idx, 0);
+    const int ipiv = select_from_group(sg, my_piv_idx, 0);
     return ipiv;
 }
 
@@ -129,9 +129,9 @@ __dpct_inline__ void invert_dense_block(const int block_size,
             perm = k;
         }
         const ValueType d =
-            (sg.shuffle(block_row[k], ipiv) == zero<ValueType>())
+            (select_from_group(sg, block_row[k], ipiv) == zero<ValueType>())
                 ? one<ValueType>()
-                : sg.shuffle(block_row[k], ipiv);
+                : select_from_group(sg, block_row[k], ipiv);
         // scale kth col
         block_row[k] /= -d;
         if (sg_tid == ipiv) {
@@ -140,7 +140,8 @@ __dpct_inline__ void invert_dense_block(const int block_size,
         const ValueType row_val = block_row[k];
         // rank-1 update
         for (int col = 0; col < block_size; col++) {
-            const ValueType col_val = sg.shuffle(block_row[col], ipiv);
+            const ValueType col_val =
+                select_from_group(sg, block_row[col], ipiv);
             block_row[col] += row_val * col_val;
         }
         // Computations for the threads of the subwarp having local id >=
@@ -221,7 +222,7 @@ __dpct_inline__ void compute_block_jacobi_kernel(
     // array
     for (int a = 0; a < block_size; a++) {
         const int col_inv_transposed_mat = a;
-        const int col = sg.shuffle(perm, a);  // column permutation
+        const int col = select_from_group(sg, perm, a);  // column permutation
         const int row_inv_transposed_mat =
             perm;  // accumulated row swaps during pivoting
         const auto val_to_write = block_row[col];

From fda39a35099b1c5a527494d15e4439542860c5ba Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Wed, 13 Nov 2024 10:14:35 +0000
Subject: [PATCH 254/448] Format files

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 benchmark/utils/dpcpp_linops.dp.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp
index be705686546..d279f2ad027 100644
--- a/benchmark/utils/dpcpp_linops.dp.cpp
+++ b/benchmark/utils/dpcpp_linops.dp.cpp
@@ -64,8 +64,7 @@ class OnemklBase : public gko::LinOp {
         if (this->get_device_exec() == nullptr) {
             GKO_NOT_IMPLEMENTED;
         }
-        this->initialize_mat_handle(
-            this->get_device_exec());
+        this->initialize_mat_handle(this->get_device_exec());
     }
 
     ~OnemklBase() = default;

From 28fdde7b44d8f063efcc744d613d86f5d96227bd Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 13 Nov 2024 12:38:29 +0100
Subject: [PATCH 255/448] [sycl] add missing namespace

---
 dpcpp/preconditioner/batch_jacobi_kernels.hpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
index dd684350375..8ac8718c3af 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
@@ -99,15 +99,16 @@ __dpct_inline__ int choose_pivot(const int block_size,
     sg.barrier();
     int my_piv_idx = sg_tid;
     for (int a = sg_size / 2; a > 0; a /= 2) {
-        const real_type abs_ele_other = shift_group_left(sg, my_abs_ele, a);
-        const int piv_idx_other = shift_group_left(sg, my_piv_idx, a);
+        const real_type abs_ele_other =
+            sycl::shift_group_left(sg, my_abs_ele, a);
+        const int piv_idx_other = sycl::shift_group_left(sg, my_piv_idx, a);
         if (my_abs_ele < abs_ele_other) {
             my_abs_ele = abs_ele_other;
             my_piv_idx = piv_idx_other;
         }
     }
     sg.barrier();
-    const int ipiv = select_from_group(sg, my_piv_idx, 0);
+    const int ipiv = sycl::select_from_group(sg, my_piv_idx, 0);
     return ipiv;
 }
 
@@ -129,9 +130,10 @@ __dpct_inline__ void invert_dense_block(const int block_size,
             perm = k;
         }
         const ValueType d =
-            (select_from_group(sg, block_row[k], ipiv) == zero<ValueType>())
+            (sycl::select_from_group(sg, block_row[k], ipiv) ==
+             zero<ValueType>())
                 ? one<ValueType>()
-                : select_from_group(sg, block_row[k], ipiv);
+                : sycl::select_from_group(sg, block_row[k], ipiv);
         // scale kth col
         block_row[k] /= -d;
         if (sg_tid == ipiv) {
@@ -141,7 +143,7 @@ __dpct_inline__ void invert_dense_block(const int block_size,
         // rank-1 update
         for (int col = 0; col < block_size; col++) {
             const ValueType col_val =
-                select_from_group(sg, block_row[col], ipiv);
+                sycl::select_from_group(sg, block_row[col], ipiv);
             block_row[col] += row_val * col_val;
         }
         // Computations for the threads of the subwarp having local id >=
@@ -222,7 +224,8 @@ __dpct_inline__ void compute_block_jacobi_kernel(
     // array
     for (int a = 0; a < block_size; a++) {
         const int col_inv_transposed_mat = a;
-        const int col = select_from_group(sg, perm, a);  // column permutation
+        const int col =
+            sycl::select_from_group(sg, perm, a);  // column permutation
         const int row_inv_transposed_mat =
             perm;  // accumulated row swaps during pivoting
         const auto val_to_write = block_row[col];

From 87180babfa41e2ddc9dca3a8e044dab052aa3640 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 13 Nov 2024 13:44:45 +0100
Subject: [PATCH 256/448] [sycl] use atomic_ref instead of deprecated atomic

---
 dpcpp/components/atomic.dp.hpp | 35 +++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 8168421a488..1876019c4fc 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -23,6 +23,25 @@ constexpr auto local_space = sycl::access::address_space::local_space;
 constexpr auto global_space = sycl::access::address_space::global_space;
 
 
+// The defaults are based on:
+// https://github.com/intel/llvm/blob/51d92a339c7bc2ac11ec39bef42c039e1589ae3e/sycl/include/sycl/atomic.hpp#L56
+template <sycl::access::address_space addressSpace>
+struct memory_scope {};
+
+template <>
+struct memory_scope<global_space> {
+    static constexpr auto scope = sycl::memory_scope::device;
+};
+
+template <>
+struct memory_scope<local_space> {
+    static constexpr auto scope = sycl::memory_scope::work_group;
+};
+
+template <sycl::access::address_space addressSpace>
+constexpr auto memory_scope_v = memory_scope<addressSpace>::scope;
+
+
 }  // namespace atomic
 
 namespace {
@@ -35,7 +54,9 @@ T atomic_compare_exchange_strong(
     sycl::memory_order success = sycl::memory_order::relaxed,
     sycl::memory_order fail = sycl::memory_order::relaxed)
 {
-    sycl::atomic<T, addressSpace> obj(addr);
+    sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                     atomic::memory_scope_v<addressSpace>, addressSpace>
+        obj(*addr.get());
     obj.compare_exchange_strong(expected, desired, success, fail);
     return expected;
 }
@@ -59,8 +80,10 @@ inline T atomic_fetch_add(
     T* addr, T operand,
     sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
 {
-    sycl::atomic<T, addressSpace> obj((sycl::multi_ptr<T, addressSpace>(addr)));
-    return sycl::atomic_fetch_add(obj, operand, memoryOrder);
+    sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                     atomic::memory_scope_v<addressSpace>, addressSpace>
+        obj(*addr);
+    return obj.fetch_add(operand, memoryOrder);
 }
 
 
@@ -70,8 +93,10 @@ inline T atomic_fetch_max(
     T* addr, T operand,
     sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
 {
-    sycl::atomic<T, addressSpace> obj((sycl::multi_ptr<T, addressSpace>(addr)));
-    return sycl::atomic_fetch_max(obj, operand, memoryOrder);
+    sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                     atomic::memory_scope_v<addressSpace>, addressSpace>
+        obj(*addr);
+    return obj.fetch_max(operand, memoryOrder);
 }
 
 

From 9404c4ad0c298a844cda4531a3c0dd1a838433d2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 13 Nov 2024 13:45:46 +0100
Subject: [PATCH 257/448] [sycl] use local_accessor instead of deprecated
 access::target::local

---
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 5 ++---
 dpcpp/solver/batch_cg_kernels.dp.cpp       | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 7036b770f1b..74648aecf44 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -76,9 +76,8 @@ class kernel_caller {
         auto res_tol = settings_.residual_tol;
 
         exec_->get_queue()->submit([&](sycl::handler& cgh) {
-            sycl::accessor<ValueType, 1, sycl::access_mode::read_write,
-                           sycl::access::target::local>
-                slm_values(sycl::range<1>(shared_size), cgh);
+            sycl::local_accessor<ValueType, 1> slm_values(
+                sycl::range<1>(shared_size), cgh);
 
             cgh.parallel_for(
                 sycl_nd_range(grid, block),
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 9d3aa14ab2c..26f5c864187 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -76,9 +76,8 @@ class kernel_caller {
         auto res_tol = settings_.residual_tol;
 
         exec_->get_queue()->submit([&](sycl::handler& cgh) {
-            sycl::accessor<ValueType, 1, sycl::access_mode::read_write,
-                           sycl::access::target::local>
-                slm_values(sycl::range<1>(shared_size), cgh);
+            sycl::local_accessor<ValueType, 1> slm_values(
+                sycl::range<1>(shared_size), cgh);
 
             cgh.parallel_for(
                 sycl_nd_range(grid, block),

From 01072253617e9b4177f7f14455bc511e1ac884c9 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 13 Nov 2024 17:38:05 +0100
Subject: [PATCH 258/448] [sycl] use newer MKL target if available

---
 dpcpp/CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index bf65888a6ab..516e9307e30 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -117,7 +117,13 @@ endif()
 target_include_directories(ginkgo_dpcpp
     PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
 target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL)
+target_link_libraries(ginkgo_dpcpp PRIVATE oneDPL)
+# MKL 2024.0+ provides the MKL::MKL_SYCL target, while older versions provide *_DPCPP
+if(MKL_VERSION_MAJOR VERSION_GREATER_EQUAL 2024)
+    target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_SYCL)
+else ()
+    target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
+endif ()
 if (GINKGO_DPCPP_SINGLE_MODE)
     target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
 endif()

From 898c2b8c2bbced1e509dbad9713f11070061fae2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 14 Nov 2024 09:50:43 +0100
Subject: [PATCH 259/448] [sycl] use non-deprecated MKL call only after v2024

Somehow the 'correct' version of the MKL calls leads to a segfault if the MKL version is < 2024, even though the old calls were already deprecated in v2023...
---
 benchmark/utils/dpcpp_linops.dp.cpp | 13 +++++++++----
 dpcpp/matrix/csr_kernels.dp.cpp     | 12 +++++++++---
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp
index d279f2ad027..1a391a77980 100644
--- a/benchmark/utils/dpcpp_linops.dp.cpp
+++ b/benchmark/utils/dpcpp_linops.dp.cpp
@@ -52,8 +52,11 @@ class OnemklBase : public gko::LinOp {
         mat_handle_ = handle_manager<oneapi::mkl::sparse::matrix_handle>(
             create_mat_handle(),
             [exec](oneapi::mkl::sparse::matrix_handle_t mat_handle) {
-                oneapi::mkl::sparse::release_matrix_handle(*exec->get_queue(),
-                                                           &mat_handle);
+                oneapi::mkl::sparse::release_matrix_handle(
+#if INTEL_MKL_VERSION >= 20240000
+                    *exec->get_queue(),
+#endif
+                    &mat_handle);
             });
     }
 
@@ -107,8 +110,10 @@ class OnemklCsr
         this->set_size(csr_->get_size());
 
         oneapi::mkl::sparse::set_csr_data(
-            *(this->get_device_exec()->get_queue()), this->get_mat_handle(),
-            static_cast<int>(this->get_size()[0]),
+#if INTEL_MKL_VERSION >= 20240000
+            *(this->get_device_exec()->get_queue()),
+#endif
+            this->get_mat_handle(), static_cast<int>(this->get_size()[0]),
             static_cast<int>(this->get_size()[1]),
             oneapi::mkl::index_base::zero, csr_->get_row_ptrs(),
             csr_->get_col_idxs(), csr_->get_values());
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 312f0e56ad3..44e9c5e16e6 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1398,7 +1398,10 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DpcppExecutor> exec,
         oneapi::mkl::sparse::matrix_handle_t mat_handle;
         oneapi::mkl::sparse::init_matrix_handle(&mat_handle);
         oneapi::mkl::sparse::set_csr_data(
-            *exec->get_queue(), mat_handle, IndexType(a->get_size()[0]),
+#if INTEL_MKL_VERSION >= 20240000
+            *exec->get_queue(),
+#endif
+            mat_handle, IndexType(a->get_size()[0]),
             IndexType(a->get_size()[1]), oneapi::mkl::index_base::zero,
             const_cast<IndexType*>(a->get_const_row_ptrs()),
             const_cast<IndexType*>(a->get_const_col_idxs()),
@@ -1417,8 +1420,11 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DpcppExecutor> exec,
                 const_cast<ValueType*>(b->get_const_values()), b->get_size()[1],
                 b->get_stride(), host_beta, c->get_values(), c->get_stride());
         }
-        oneapi::mkl::sparse::release_matrix_handle(*exec->get_queue(),
-                                                   &mat_handle);
+        oneapi::mkl::sparse::release_matrix_handle(
+#if INTEL_MKL_VERSION >= 20240000
+            *exec->get_queue(),
+#endif
+            &mat_handle);
     }
     return try_sparselib;
 }

From 6df50ffcdd5d25326f3dae4aec8fdc577abee138 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 20 Jun 2024 16:43:11 +0200
Subject: [PATCH 260/448] [batch] provide default index type for matrix device
 types

---
 core/matrix/batch_struct.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index a3604fd9b99..593f9c99781 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -22,7 +22,7 @@ namespace csr {
 /**
  * Encapsulates one matrix from a batch of csr matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct batch_item {
     using value_type = ValueType;
     using index_type = IndexType;
@@ -44,7 +44,7 @@ struct batch_item {
 /**
  * A 'simple' structure to store a global uniform batch of csr matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct uniform_batch {
     using value_type = ValueType;
     using index_type = IndexType;
@@ -119,7 +119,7 @@ namespace ell {
 /**
  * Encapsulates one matrix from a batch of ell matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct batch_item {
     using value_type = ValueType;
     using index_type = IndexType;
@@ -141,7 +141,7 @@ struct batch_item {
 /**
  * A 'simple' structure to store a global uniform batch of ell matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct uniform_batch {
     using value_type = ValueType;
     using index_type = IndexType;

From 27d95e8985cd862a20d7528becc94373dfec8c11 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 20 Jun 2024 16:43:44 +0200
Subject: [PATCH 261/448] [batch] handle constness of index type same as value
 type

---
 core/matrix/batch_struct.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index 593f9c99781..13543ccb624 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -28,8 +28,8 @@ struct batch_item {
     using index_type = IndexType;
 
     ValueType* values;
-    const index_type* col_idxs;
-    const index_type* row_ptrs;
+    index_type* col_idxs;
+    index_type* row_ptrs;
     index_type num_rows;
     index_type num_cols;
     index_type num_nnz_per_item;
@@ -51,8 +51,8 @@ struct uniform_batch {
     using entry_type = batch_item<value_type, index_type>;
 
     ValueType* values;
-    const index_type* col_idxs;
-    const index_type* row_ptrs;
+    index_type* col_idxs;
+    index_type* row_ptrs;
     size_type num_batch_items;
     index_type num_rows;
     index_type num_cols;
@@ -125,7 +125,7 @@ struct batch_item {
     using index_type = IndexType;
 
     ValueType* values;
-    const index_type* col_idxs;
+    index_type* col_idxs;
     index_type stride;
     index_type num_rows;
     index_type num_cols;
@@ -148,7 +148,7 @@ struct uniform_batch {
     using entry_type = batch_item<value_type, index_type>;
 
     ValueType* values;
-    const index_type* col_idxs;
+    index_type* col_idxs;
     size_type num_batch_items;
     index_type stride;
     index_type num_rows;

From 48fd553a2c6ec23098197d00150396f1c845476d Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 20 Jun 2024 16:44:20 +0200
Subject: [PATCH 262/448] [batch] add macro to instantiate batched solver

---
 core/solver/batch_dispatch.hpp | 35 ++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 018a6674df5..ff5bb3f5390 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -164,6 +164,41 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
+#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                          \
+    macro(__VA_ARGS__,                                                  \
+          ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
+    template macro(                                                     \
+        __VA_ARGS__,                                                    \
+        ::gko::batch::solver::device::batch_stop::SimpleRelResidual)
+
+#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...)                   \
+    GKO_BATCH_INSTANTIATE_STOP(                                            \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::Identity);     \
+    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
+    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)
+
+#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \
+    GKO_BATCH_INSTANTIATE_PRECONDITIONER(        \
+        macro, __VA_ARGS__,                      \
+        ::gko::batch::solver::device::batch_log::SimpleFinalLogger)
+
+#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...)                     \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                 \
+                                 batch::matrix::ell::uniform_batch); \
+    template GKO_BATCH_INSTANTIATE_LOGGER(                           \
+        macro, __VA_ARGS__, batch::matrix::dense::uniform_batch);    \
+    template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,        \
+                                          batch::matrix::csr::uniform_batch)
+
+#define GKO_BATCH_INSTANTIATE(macro, ...) \
+    GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__)
+
+
 /**
  * Handles dispatching to the correct instantiation of a batched solver
  * depending on runtime parameters.

From 57e066cfa9d0f30abfc4ea84a40680dfd01e43cb Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 15:40:56 +0200
Subject: [PATCH 263/448] [batch] split bicgstab compilation (hip)

---
 .../solver/batch_bicgstab_kernels.hpp         |   2 +
 hip/CMakeLists.txt                            |   2 +
 hip/solver/batch_bicgstab_kernels.hip.cpp     | 193 +++++++-----------
 hip/solver/batch_bicgstab_launch.hip.hpp      |  78 +++++++
 .../batch_bicgstab_launch.instantiate.hip.cpp |  67 ++++++
 5 files changed, 227 insertions(+), 115 deletions(-)
 create mode 100644 hip/solver/batch_bicgstab_launch.hip.hpp
 create mode 100644 hip/solver/batch_bicgstab_launch.instantiate.hip.cpp

diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
index 8ea31358ed5..6bce1b53bb8 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -5,6 +5,8 @@
 #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 7d914d57a81..c91a8609313 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
+add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
@@ -18,6 +19,7 @@ set(GINKGO_HIP_SOURCES
     ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
+    ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
     solver/upper_trs_kernels.hip.cpp
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 17199d2cd19..697bcb94551 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -5,19 +5,13 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "hip/solver/batch_bicgstab_launch.hip.hpp"
 
 
 namespace gko {
@@ -51,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-template <typename T>
-using settings = gko::kernels::batch_bicgstab::settings<T>;
-
-
-template <typename HipValueType>
+template <typename ValueType>
 class kernel_caller {
 public:
-    using value_type = HipValueType;
+    using hip_value_type = hip_type<ValueType>;
 
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
+                  const settings<remove_complex<ValueType>> settings)
         : exec_{exec}, settings_{settings}
     {}
 
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_bicgstab::storage_config& sconf,
-        LogType& logger, PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
-            <<<mat.num_batch_items, block_size, shared_size,
-               exec_->get_stream()>>>(sconf, settings_.max_iterations,
-                                      settings_.residual_tol, logger, prec, mat,
-                                      b_values, x_values, workspace_data);
-    }
-
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+        const gko::batch::multi_vector::uniform_batch<const hip_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<hip_value_type>& x) const
     {
-        using real_type = gko::remove_complex<value_type>;
+        using real_type = gko::remove_complex<hip_value_type>;
         const size_type num_batch_items = mat.num_batch_items;
         constexpr int align_multiple = 8;
         const int padded_num_rows =
@@ -102,99 +73,92 @@ class kernel_caller {
             exec_->get_device_id()));
         const int block_size =
             get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
-        bool is_block_size_aligned = block_size % config::warp_size == 0;
         GKO_ASSERT(block_size >= 2 * config::warp_size);
-        GKO_ASSERT(is_block_size_aligned);
+        GKO_ASSERT(block_size % config::warp_size == 0);
 
         // Returns amount required in bytes
         const size_t prec_size = PrecType::dynamic_work_size(
             padded_num_rows, mat.get_single_item_num_nnz());
-        const auto sconf =
-            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
-                                                                 value_type>(
-                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
-                b.num_rhs);
+        const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage<
+            PrecType, hip_value_type>(shmem_per_blk, padded_num_rows,
+                                      mat.get_single_item_num_nnz(), b.num_rhs);
         const size_t shared_size =
-            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            sconf.n_shared * padded_num_rows * sizeof(hip_value_type) +
             (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<value_type>(
+        auto workspace = gko::array<hip_value_type>(
             exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        bool is_stride_aligned =
-            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
-        GKO_ASSERT(is_stride_aligned);
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0);
 
-        value_type* const workspace_data = workspace.get_data();
+        hip_value_type* const workspace_data = workspace.get_data();
 
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
-        // if (sconf.prec_shared) {
-        //     launch_apply_kernel<StopType, 9, true>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        //     case 0:
-        launch_apply_kernel<StopType, 0, false>(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 1, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 2, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 3, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 4, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 5, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 6:
-        //         launch_apply_kernel<StopType, 6, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 7:
-        //         launch_apply_kernel<StopType, 7, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 8:
-        //         launch_apply_kernel<StopType, 8, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 9:
-        //         launch_apply_kernel<StopType, 9, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+        if (sconf.prec_shared) {
+            launch_apply_kernel<ValueType, 9, true, StopType>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<ValueType, 0, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<ValueType, 1, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<ValueType, 2, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<ValueType, 3, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<ValueType, 4, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<ValueType, 5, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<ValueType, 6, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<ValueType, 7, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<ValueType, 8, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<ValueType, 9, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
     std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
+    const settings<remove_complex<ValueType>> settings_;
 };
 
 
@@ -207,9 +171,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    using hip_value_type = hip_type<ValueType>;
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<hip_value_type>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
     dispatcher.apply(b, x, logdata);
 }
 
diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp
new file mode 100644
index 00000000000..08d39b8fd5e
--- /dev/null
+++ b/hip/solver/batch_bicgstab_launch.hip.hpp
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_bicgstab {
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const hip_type<ValueType>* const __restrict__ b_values,
+    hip_type<ValueType>* const __restrict__ x_values,
+    hip_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size);
+
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
+                                          mat_t, log_t, pre_t, stop_t)     \
+    void launch_apply_kernel<_vtype, _n_shared, _prec_shared,              \
+                             stop_t<hip_type<_vtype>>>(                    \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const gko::kernels::batch_bicgstab::storage_config& sconf,         \
+        const settings<remove_complex<_vtype>>& settings,                  \
+        log_t<hip_type<gko::remove_complex<_vtype>>>& logger,              \
+        pre_t<hip_type<_vtype>>& prec,                                     \
+        const mat_t<const hip_type<_vtype>>& mat,                          \
+        const hip_type<_vtype>* const __restrict__ b_values,               \
+        hip_type<_vtype>* const __restrict__ x_values,                     \
+        hip_type<_vtype>* const __restrict__ workspace_data,               \
+        const int& block_size, const size_t& shared_size)
+
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true)
+
+
+}  // namespace batch_bicgstab
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp
new file mode 100644
index 00000000000..fb26c562a94
--- /dev/null
+++ b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "hip/solver/batch_bicgstab_launch.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_bicgstab {
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const hip_type<ValueType>* const __restrict__ b_values,
+    hip_type<ValueType>* const __restrict__ x_values,
+    hip_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size)
+{
+    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
+        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
+            sconf, settings.max_iterations, as_hip_type(settings.residual_tol),
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
+// end
+
+
+}  // namespace batch_bicgstab
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko

From e60660e26d104b23bd608e1332aa772c5171dd46 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 15:49:51 +0200
Subject: [PATCH 264/448] [batch] split bicgstab compilation (cuda)

---
 cuda/CMakeLists.txt                           |   2 +
 cuda/solver/batch_bicgstab_kernels.cu         | 249 ++++++------------
 cuda/solver/batch_bicgstab_launch.cuh         | 112 ++++++++
 .../batch_bicgstab_launch.instantiate.cu      | 120 +++++++++
 4 files changed, 320 insertions(+), 163 deletions(-)
 create mode 100644 cuda/solver/batch_bicgstab_launch.cuh
 create mode 100644 cuda/solver/batch_bicgstab_launch.instantiate.cu

diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 000cb7b215f..9529222c540 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
+add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -21,6 +22,7 @@ target_sources(ginkgo_cuda
     matrix/fft_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     solver/batch_bicgstab_kernels.cu
+    ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.cu
     solver/lower_trs_kernels.cu
     solver/upper_trs_kernels.cu
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 8a5eee6b196..bd07259f771 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -5,16 +5,13 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "cuda/solver/batch_bicgstab_launch.cuh"
 
 
 namespace gko {
@@ -23,194 +20,121 @@ namespace cuda {
 namespace batch_bicgstab {
 
 
-template <typename StopType, typename PrecType, typename LogType,
-          typename BatchMatrixType, typename ValueType>
-int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
-                              const int num_rows)
-{
-    int num_warps = std::max(num_rows / 4, 2);
-    constexpr int warp_sz = static_cast<int>(config::warp_size);
-    const int min_block_size = 2 * warp_sz;
-    const int device_max_threads =
-        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    const int num_regs_used = funcattr.numRegs;
-    int max_regs_blk = 0;
-    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
-                           exec->get_device_id());
-    const int max_threads_regs =
-        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
-    int max_threads = std::min(max_threads_regs, device_max_threads);
-    max_threads = max_threads <= 1024 ? max_threads : 1024;
-    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
-}
-
-
-template <typename StopType, typename PrecType, typename LogType,
-          typename BatchMatrixType, typename ValueType>
-int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
-{
-    int shmem_per_sm = 0;
-    cudaDeviceGetAttribute(&shmem_per_sm,
-                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                           exec->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>,
-        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    return funcattr.maxDynamicSharedSizeBytes;
-}
-
-
-template <typename T>
-using settings = gko::kernels::batch_bicgstab::settings<T>;
-
-
-template <typename CuValueType>
+template <typename ValueType>
 class kernel_caller {
 public:
-    using value_type = CuValueType;
+    using cuda_value_type = cuda_type<ValueType>;
 
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
+                  const settings<remove_complex<ValueType>> settings)
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_bicgstab::storage_config& sconf,
-        LogType& logger, PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
-            <<<mat.num_batch_items, block_size, shared_size,
-               exec_->get_stream()>>>(sconf, settings_.max_iterations,
-                                      settings_.residual_tol, logger, prec, mat,
-                                      b_values, x_values, workspace_data);
-    }
-
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+        const gko::batch::multi_vector::uniform_batch<const cuda_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<cuda_value_type>& x) const
     {
-        using real_type = gko::remove_complex<value_type>;
+        using real_type = gko::remove_complex<cuda_value_type>;
         const size_type num_batch_items = mat.num_batch_items;
         constexpr int align_multiple = 8;
         const int padded_num_rows =
             ceildiv(mat.num_rows, align_multiple) * align_multiple;
         const int shmem_per_blk =
             get_max_dynamic_shared_memory<StopType, PrecType, LogType,
-                                          BatchMatrixType, value_type>(exec_);
-        // TODO
-        const int block_size = 256;
-        // get_num_threads_per_block<StopType, PrecType, LogType,
-        //                           BatchMatrixType, value_type>(
-        //     exec_, mat.num_rows);
+                                          BatchMatrixType, cuda_value_type>(
+                exec_);
+        const int block_size =
+            get_num_threads_per_block<StopType, PrecType, LogType,
+                                      BatchMatrixType, cuda_value_type>(
+                exec_, mat.num_rows);
         GKO_ASSERT(block_size >= 2 * config::warp_size);
 
         const size_t prec_size = PrecType::dynamic_work_size(
             padded_num_rows, mat.get_single_item_num_nnz());
-        const auto sconf =
-            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
-                                                                 value_type>(
-                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
-                b.num_rhs);
+        const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage<
+            PrecType, cuda_value_type>(shmem_per_blk, padded_num_rows,
+                                       mat.get_single_item_num_nnz(),
+                                       b.num_rhs);
         const size_t shared_size =
-            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) +
             (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<value_type>(
-            exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+        auto workspace = gko::array<cuda_value_type>(
+            exec_, sconf.gmem_stride_bytes * num_batch_items /
+                       sizeof(cuda_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0);
 
-        value_type* const workspace_data = workspace.get_data();
+        cuda_value_type* const workspace_data = workspace.get_data();
 
-        // TODO: split compilation
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
-        // if (sconf.prec_shared) {
-        //     launch_apply_kernel<StopType, 9, true>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        // case 0:
-        launch_apply_kernel<StopType, 0, false>(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 1, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 2, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 3, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 4, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 5, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 6:
-        //         launch_apply_kernel<StopType, 6, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 7:
-        //         launch_apply_kernel<StopType, 7, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 8:
-        //         launch_apply_kernel<StopType, 8, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 9:
-        //         launch_apply_kernel<StopType, 9, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+        if (sconf.prec_shared) {
+            launch_apply_kernel<cuda_value_type, 9, true, StopType>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<cuda_value_type, 0, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<cuda_value_type, 1, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<cuda_value_type, 2, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<cuda_value_type, 3, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<cuda_value_type, 4, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<cuda_value_type, 5, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<cuda_value_type, 6, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<cuda_value_type, 7, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<cuda_value_type, 8, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<cuda_value_type, 9, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
     std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
+    const settings<remove_complex<ValueType>> settings_;
 };
 
 
@@ -223,9 +147,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    using cu_value_type = cuda_type<ValueType>;
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
     dispatcher.apply(b, x, logdata);
 }
 
diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
new file mode 100644
index 00000000000..6c56b6456a8
--- /dev/null
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_bicgstab {
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows);
+
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(              \
+    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
+    int get_num_threads_per_block<                                          \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
+        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
+        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec,     \
+                           const int num_rows)
+
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \
+    GKO_BATCH_INSTANTIATE(                                           \
+        GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype)
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
+
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(          \
+    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
+    int get_max_dynamic_shared_memory<                                      \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
+        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
+        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
+
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \
+    GKO_BATCH_INSTANTIATE(                                               \
+        GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype)
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const ValueType* const __restrict__ b_values,
+    ValueType* const __restrict__ x_values,
+    ValueType* const __restrict__ workspace_data, const int& block_size,
+    const size_t& shared_size);
+
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
+                                          mat_t, log_t, pre_t, stop_t)     \
+    void launch_apply_kernel<cuda_type<_vtype>, _n_shared, _prec_shared,   \
+                             stop_t<cuda_type<_vtype>>>(                   \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const gko::kernels::batch_bicgstab::storage_config& sconf,         \
+        const settings<remove_complex<cuda_type<_vtype>>>& settings,       \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>& logger,             \
+        pre_t<cuda_type<_vtype>>& prec,                                    \
+        const mat_t<const cuda_type<_vtype>>& mat,                         \
+        const cuda_type<_vtype>* const __restrict__ b_values,              \
+        cuda_type<_vtype>* const __restrict__ x_values,                    \
+        cuda_type<_vtype>* const __restrict__ workspace_data,              \
+        const int& block_size, const size_t& shared_size)
+
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true)
+
+
+}  // namespace batch_bicgstab
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu
new file mode 100644
index 00000000000..b88b19abb0f
--- /dev/null
+++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu
@@ -0,0 +1,120 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cuda/solver/batch_bicgstab_launch.cuh"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_bicgstab {
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 9, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
+{
+    int shmem_per_sm = 0;
+    cudaDeviceGetAttribute(&shmem_per_sm,
+                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                           exec->get_device_id());
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        apply_kernel<StopType, 9, true, PrecType, LogType, BatchMatrixType,
+                     ValueType>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 9, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    return funcattr.maxDynamicSharedSizeBytes;
+}
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const ValueType* const __restrict__ b_values,
+    ValueType* const __restrict__ x_values,
+    ValueType* const __restrict__ workspace_data, const int& block_size,
+    const size_t& shared_size)
+{
+    apply_kernel<StopType, n_shared, prec_shared>
+        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
+            sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
+// end
+
+
+}  // namespace batch_bicgstab
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko

From 3fc0db986837ead02072fbb44b96c27e3ab20349 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 16:11:49 +0200
Subject: [PATCH 265/448] [batch] split cg compilation (hip)

---
 common/cuda_hip/solver/batch_cg_kernels.hpp   |   2 +
 .../batch_bicgstab_launch.instantiate.cu      |  20 +--
 hip/CMakeLists.txt                            |   2 +
 hip/solver/batch_cg_kernels.hip.cpp           | 153 +++++++-----------
 hip/solver/batch_cg_launch.hip.hpp            |  70 ++++++++
 .../batch_cg_launch.instantiate.hip.cpp       |  59 +++++++
 6 files changed, 204 insertions(+), 102 deletions(-)
 create mode 100644 hip/solver/batch_cg_launch.hip.hpp
 create mode 100644 hip/solver/batch_cg_launch.instantiate.hip.cpp

diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp
index 7ccdc5f9926..c8502e28b1f 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp
@@ -6,6 +6,8 @@
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_
 
 
+#include "core/solver/batch_cg_kernels.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu
index b88b19abb0f..ec88cc17c85 100644
--- a/cuda/solver/batch_bicgstab_launch.instantiate.cu
+++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu
@@ -29,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 9, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -53,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                            cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                            exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 9, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 9, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }
 
@@ -76,7 +78,7 @@ void launch_apply_kernel(
     ValueType* const __restrict__ workspace_data, const int& block_size,
     const size_t& shared_size)
 {
-    apply_kernel<StopType, n_shared, prec_shared>
+    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
         <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
             sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
             logger, prec, mat, b_values, x_values, workspace_data);
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index c91a8609313..4a540046322 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -3,6 +3,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(. solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
@@ -21,6 +22,7 @@ set(GINKGO_HIP_SOURCES
     solver/batch_bicgstab_kernels.hip.cpp
     ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.hip.cpp
+    ${BATCH_CG_INSTANTIATE}
     solver/lower_trs_kernels.hip.cpp
     solver/upper_trs_kernels.hip.cpp
     ${GKO_UNIFIED_COMMON_SOURCES}
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 6d5e3bff3b3..25ebd667a7e 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -5,18 +5,13 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "hip/solver/batch_cg_launch.hip.hpp"
 
 
 namespace gko {
@@ -50,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-template <typename T>
-using settings = gko::kernels::batch_cg::settings<T>;
-
-
-template <typename HipValueType>
+template <typename ValueType>
 class kernel_caller {
 public:
-    using value_type = HipValueType;
+    using hip_value_type = hip_type<ValueType>;
 
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
+                  const settings<remove_complex<ValueType>> settings)
         : exec_{exec}, settings_{settings}
     {}
 
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
-            <<<mat.num_batch_items, block_size, shared_size,
-               exec_->get_stream()>>>(sconf, settings_.max_iterations,
-                                      settings_.residual_tol, logger, prec, mat,
-                                      b_values, x_values, workspace_data);
-    }
-
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+        const gko::batch::multi_vector::uniform_batch<const hip_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<hip_value_type>& x) const
     {
-        using real_type = gko::remove_complex<value_type>;
+        using real_type = gko::remove_complex<hip_value_type>;
         const size_type num_batch_items = mat.num_batch_items;
         constexpr int align_multiple = 8;
         const int padded_num_rows =
@@ -101,79 +73,74 @@ class kernel_caller {
             exec_->get_device_id()));
         const int block_size =
             get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
-        bool is_block_size_aligned = block_size % config::warp_size == 0;
         GKO_ASSERT(block_size >= 2 * config::warp_size);
-        GKO_ASSERT(is_block_size_aligned);
+        GKO_ASSERT(block_size % config::warp_size == 0);
 
         // Returns amount required in bytes
         const size_t prec_size = PrecType::dynamic_work_size(
             padded_num_rows, mat.get_single_item_num_nnz());
         const auto sconf =
             gko::kernels::batch_cg::compute_shared_storage<PrecType,
-                                                           value_type>(
+                                                           hip_value_type>(
                 shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
                 b.num_rhs);
         const size_t shared_size =
-            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            sconf.n_shared * padded_num_rows * sizeof(hip_value_type) +
             (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<value_type>(
+        auto workspace = gko::array<hip_value_type>(
             exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        bool is_stride_aligned =
-            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
-        GKO_ASSERT(is_stride_aligned);
-
-        value_type* const workspace_data = workspace.get_data();
-
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-        // Template parameters launch_apply_kernel<StopType, n_shared,
-        // prec_shared)
-        // if (sconf.prec_shared) {
-        //     launch_apply_kernel<StopType, 5, true>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        //     case 0:
-        launch_apply_kernel<StopType, 0, false>(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 1, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 2, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 3, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 4, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 5, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0);
+
+        hip_value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<ValueType, n_shared,
+        // prec_shared, StopType>
+        if (sconf.prec_shared) {
+            launch_apply_kernel<ValueType, 5, true, StopType>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<ValueType, 0, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<ValueType, 1, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<ValueType, 2, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<ValueType, 3, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<ValueType, 4, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<ValueType, 5, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
     std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
+    const settings<remove_complex<ValueType>> settings_;
 };
 
 
diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp
new file mode 100644
index 00000000000..a1e41310b8b
--- /dev/null
+++ b/hip/solver/batch_cg_launch.hip.hpp
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_cg {
+
+
+template <typename T>
+using settings = gko::kernels::batch_cg::settings<T>;
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
+                         const gko::kernels::batch_cg::storage_config& sconf,
+                         const settings<remove_complex<ValueType>>& settings,
+                         LogType& logger, PrecType& prec,
+                         const BatchMatrixType& mat,
+                         const hip_type<ValueType>* const __restrict__ b_values,
+                         hip_type<ValueType>* const __restrict__ x_values,
+                         hip_type<ValueType>* const __restrict__ workspace_data,
+                         const int& block_size, const size_t& shared_size);
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
+                                    log_t, pre_t, stop_t)                   \
+    void launch_apply_kernel<hip_type<_vtype>, _n_shared, _prec_shared,     \
+                             stop_t<hip_type<_vtype>>>(                     \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const gko::kernels::batch_cg::storage_config& sconf,                \
+        const settings<remove_complex<_vtype>>& settings,                   \
+        log_t<hip_type<gko::remove_complex<hip_type<_vtype>>>>& logger,     \
+        pre_t<hip_type<_vtype>>& prec,                                      \
+        const mat_t<const hip_type<_vtype>>& mat,                           \
+        const hip_type<_vtype>* const __restrict__ b_values,                \
+        hip_type<_vtype>* const __restrict__ x_values,                      \
+        hip_type<_vtype>* const __restrict__ workspace_data,                \
+        const int& block_size, const size_t& shared_size)
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)
+
+
+}  // namespace batch_cg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/hip/solver/batch_cg_launch.instantiate.hip.cpp
new file mode 100644
index 00000000000..3605a88651d
--- /dev/null
+++ b/hip/solver/batch_cg_launch.instantiate.hip.cpp
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "hip/solver/batch_cg_launch.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_cg {
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
+                         const gko::kernels::batch_cg::storage_config& sconf,
+                         const settings<remove_complex<ValueType>>& settings,
+                         LogType& logger, PrecType& prec,
+                         const BatchMatrixType& mat,
+                         const hip_type<ValueType>* const __restrict__ b_values,
+                         hip_type<ValueType>* const __restrict__ x_values,
+                         hip_type<ValueType>* const __restrict__ workspace_data,
+                         const int& block_size, const size_t& shared_size)
+{
+    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
+        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
+            sconf, settings.max_iterations, as_hip_type(settings.residual_tol),
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko

From 23849f001d23382b098b58ba0f20dd176cd87a22 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 16:14:38 +0200
Subject: [PATCH 266/448] [batch] split cg compilation (cuda)

---
 cuda/CMakeLists.txt                        |   2 +
 cuda/solver/batch_cg_kernels.cu            | 204 +++++++--------------
 cuda/solver/batch_cg_launch.cuh            | 104 +++++++++++
 cuda/solver/batch_cg_launch.instantiate.cu | 114 ++++++++++++
 4 files changed, 284 insertions(+), 140 deletions(-)
 create mode 100644 cuda/solver/batch_cg_launch.cuh
 create mode 100644 cuda/solver/batch_cg_launch.instantiate.cu

diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 9529222c540..bfa65eee79b 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -24,6 +25,7 @@ target_sources(ginkgo_cuda
     solver/batch_bicgstab_kernels.cu
     ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.cu
+    ${BATCH_CG_INSTANTIATE}
     solver/lower_trs_kernels.cu
     solver/upper_trs_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 32e66d7ee54..126a62006cf 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -5,16 +5,13 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
 
-#include "common/cuda_hip/base/batch_struct.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "cuda/solver/batch_cg_launch.cuh"
 
 
 namespace gko {
@@ -23,104 +20,35 @@ namespace cuda {
 namespace batch_cg {
 
 
-template <typename StopType, typename PrecType, typename LogType,
-          typename BatchMatrixType, typename ValueType>
-int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
-                              const int num_rows)
-{
-    int num_warps = std::max(num_rows / 4, 2);
-    constexpr int warp_sz = static_cast<int>(config::warp_size);
-    const int min_block_size = 2 * warp_sz;
-    const int device_max_threads =
-        (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    const int num_regs_used = funcattr.numRegs;
-    int max_regs_blk = 0;
-    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
-                           exec->get_device_id());
-    const int max_threads_regs =
-        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
-    int max_threads = std::min(max_threads_regs, device_max_threads);
-    max_threads = max_threads <= 1024 ? max_threads : 1024;
-    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
-}
-
-
-template <typename StopType, typename PrecType, typename LogType,
-          typename BatchMatrixType, typename ValueType>
-int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
-{
-    int shmem_per_sm = 0;
-    cudaDeviceGetAttribute(&shmem_per_sm,
-                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                           exec->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>,
-        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    return funcattr.maxDynamicSharedSizeBytes;
-}
-
-
-template <typename T>
-using settings = gko::kernels::batch_cg::settings<T>;
-
-
-template <typename CuValueType>
+template <typename ValueType>
 class kernel_caller {
 public:
-    using value_type = CuValueType;
+    using cuda_value_type = cuda_type<ValueType>;
 
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
+                  const settings<remove_complex<ValueType>> settings)
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared_bool>
-            <<<mat.num_batch_items, block_size, shared_size,
-               exec_->get_stream()>>>(sconf, settings_.max_iterations,
-                                      settings_.residual_tol, logger, prec, mat,
-                                      b_values, x_values, workspace_data);
-    }
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+        const gko::batch::multi_vector::uniform_batch<const cuda_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<cuda_value_type>& x) const
     {
-        using real_type = gko::remove_complex<value_type>;
+        using real_type = gko::remove_complex<cuda_value_type>;
         const size_type num_batch_items = mat.num_batch_items;
         constexpr int align_multiple = 8;
         const int padded_num_rows =
             ceildiv(mat.num_rows, align_multiple) * align_multiple;
         const int shmem_per_blk =
             get_max_dynamic_shared_memory<StopType, PrecType, LogType,
-                                          BatchMatrixType, value_type>(exec_);
+                                          BatchMatrixType, cuda_value_type>(
+                exec_);
         const int block_size =
             get_num_threads_per_block<StopType, PrecType, LogType,
-                                      BatchMatrixType, value_type>(
+                                      BatchMatrixType, cuda_value_type>(
                 exec_, mat.num_rows);
         GKO_ASSERT(block_size >= 2 * config::warp_size);
 
@@ -128,69 +56,66 @@ public:
             padded_num_rows, mat.get_single_item_num_nnz());
         const auto sconf =
             gko::kernels::batch_cg::compute_shared_storage<PrecType,
-                                                           value_type>(
+                                                           cuda_value_type>(
                 shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
                 b.num_rhs);
         const size_t shared_size =
-            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) +
             (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<value_type>(
-            exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
-
-        value_type* const workspace_data = workspace.get_data();
-
-        // TODO: split compilation
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-        // Template parameters launch_apply_kernel<StopType, n_shared,
-        // prec_shared>
-        // if (sconf.prec_shared) {
-        //     launch_apply_kernel<StopType, 5, true>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        //     case 0:
-        launch_apply_kernel<StopType, 0, false>(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 1, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 2, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 3, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 4, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 5, false>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+        auto workspace = gko::array<cuda_value_type>(
+            exec_, sconf.gmem_stride_bytes * num_batch_items /
+                       sizeof(cuda_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0);
+
+        cuda_value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<ValueType, n_shared,
+        // prec_shared, StopType>
+        if (sconf.prec_shared) {
+            launch_apply_kernel<cuda_value_type, 5, true, StopType>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<cuda_value_type, 0, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<cuda_value_type, 1, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<cuda_value_type, 2, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<cuda_value_type, 3, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<cuda_value_type, 4, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<cuda_value_type, 5, false, StopType>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
     std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
+    const settings<remove_complex<ValueType>> settings_;
 };
 
 
@@ -203,9 +128,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    using cu_value_type = cuda_type<ValueType>;
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
     dispatcher.apply(b, x, logdata);
 }
 
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
new file mode 100644
index 00000000000..7196d6f8366
--- /dev/null
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_cg {
+
+
+template <typename T>
+using settings = gko::kernels::batch_cg::settings<T>;
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows);
+
+#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \
+                                                        pre_t, stop_t)        \
+    int get_num_threads_per_block<                                            \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                  \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>,                        \
+        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(                   \
+        std::shared_ptr<const DefaultExecutor> exec, const int num_rows)
+
+#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype)             \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \
+                          _vtype)
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
+
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(                \
+    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
+    int get_max_dynamic_shared_memory<                                      \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
+        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
+        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
+
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype)             \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \
+                          _vtype)
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const cuda_type<ValueType>* const __restrict__ b_values,
+    cuda_type<ValueType>* const __restrict__ x_values,
+    cuda_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size);
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
+                                    log_t, pre_t, stop_t)                   \
+    void launch_apply_kernel<cuda_type<_vtype>, _n_shared, _prec_shared,    \
+                             stop_t<cuda_type<_vtype>>>(                    \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const gko::kernels::batch_cg::storage_config& sconf,                \
+        const settings<remove_complex<_vtype>>& settings,                   \
+        log_t<cuda_type<gko::remove_complex<cuda_type<_vtype>>>>& logger,   \
+        pre_t<cuda_type<_vtype>>& prec,                                     \
+        const mat_t<const cuda_type<_vtype>>& mat,                          \
+        const cuda_type<_vtype>* const __restrict__ b_values,               \
+        cuda_type<_vtype>* const __restrict__ x_values,                     \
+        cuda_type<_vtype>* const __restrict__ workspace_data,               \
+        const int& block_size, const size_t& shared_size)
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)
+
+
+}  // namespace batch_cg
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu
new file mode 100644
index 00000000000..9fca587f33e
--- /dev/null
+++ b/cuda/solver/batch_cg_launch.instantiate.cu
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cuda/solver/batch_cg_launch.cuh"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_cg {
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+// Returns maxDynamicSharedSizeBytes as reported for the batch CG kernel.
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
+{
+    // Only the <5, true> instantiation (n_shared = 5, prec_shared = true) is
+    // queried. `exec` is intentionally unused; it is kept so the signature
+    // matches the declaration and the explicit instantiations.
+    const auto kernel =
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>;
+    // Prefer a 99% shared-memory carveout before reading the attributes.
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    // Zero-initialize and check the query so a failure cannot return garbage.
+    cudaFuncAttributes funcattr{};
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncGetAttributes(&funcattr, kernel));
+    return funcattr.maxDynamicSharedSizeBytes;
+}
+
+
+template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
+          typename PrecType, typename LogType, typename BatchMatrixType>
+void launch_apply_kernel(  // ValueType is instantiated with the device type
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const cuda_type<ValueType>* const __restrict__ b_values,
+    cuda_type<ValueType>* const __restrict__ x_values,
+    cuda_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size)
+{  // one thread block per batch item, enqueued on the executor's stream
+    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
+        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
+            sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko

From 5c43846837627670d9ae242e5551b86785db2631 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 1 Oct 2024 11:53:30 +0200
Subject: [PATCH 267/448] [batch] review updates:

- adds header guard

Co-authored-by: Pratik Nayak <pratik.nayak@kit.edu>
---
 cuda/solver/batch_bicgstab_launch.cuh    | 7 +++++++
 cuda/solver/batch_cg_launch.cuh          | 7 +++++++
 hip/solver/batch_bicgstab_launch.hip.hpp | 7 +++++++
 hip/solver/batch_cg_launch.hip.hpp       | 7 +++++++
 4 files changed, 28 insertions(+)

diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
index 6c56b6456a8..5106b21251e 100644
--- a/cuda/solver/batch_bicgstab_launch.cuh
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_
+#define GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_
+
+
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -110,3 +114,6 @@ void launch_apply_kernel(
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index 7196d6f8366..9cb470eb51b 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_
+#define GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_
+
+
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -102,3 +106,6 @@ void launch_apply_kernel(
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif
diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp
index 08d39b8fd5e..0f62a9487a3 100644
--- a/hip/solver/batch_bicgstab_launch.hip.hpp
+++ b/hip/solver/batch_bicgstab_launch.hip.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_
+#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_
+
+
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -76,3 +80,6 @@ void launch_apply_kernel(
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif
diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp
index a1e41310b8b..7071c5c4065 100644
--- a/hip/solver/batch_cg_launch.hip.hpp
+++ b/hip/solver/batch_cg_launch.hip.hpp
@@ -2,6 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HIP_HPP_
+#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HIP_HPP_
+
+
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -68,3 +72,6 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
+
+
+#endif

From d1b6b7b3b8d0d864c9a39ff67ca2f0e410754a83 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 2 Oct 2024 13:16:28 +0200
Subject: [PATCH 268/448] [batch] add launch bounds and fix register check

---
 .../solver/batch_bicgstab_kernels.hpp         | 19 +++++++++-----
 common/cuda_hip/solver/batch_cg_kernels.hpp   | 21 ++++++++++------
 .../batch_bicgstab_launch.instantiate.cu      | 25 +++++++++++++------
 cuda/solver/batch_cg_launch.instantiate.cu    | 22 ++++++++++------
 4 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
index 6bce1b53bb8..9aa14243de3 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -27,6 +27,11 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
+
+
+constexpr int max_bicgstab_threads = 1024;  // launch bound of apply_kernel
+
+
 namespace batch_single_kernels {
 
 
@@ -170,12 +175,14 @@ __device__ __forceinline__ void update_x_middle(
 template <typename StopType, int n_shared, bool prec_shared_bool,
           typename PrecType, typename LogType, typename BatchMatrixType,
           typename ValueType>
-__global__ void apply_kernel(
-    const gko::kernels::batch_bicgstab::storage_config sconf,
-    const int max_iter, const gko::remove_complex<ValueType> tol,
-    LogType logger, PrecType prec_shared, const BatchMatrixType mat,
-    const ValueType* const __restrict__ b, ValueType* const __restrict__ x,
-    ValueType* const __restrict__ workspace = nullptr)
+__global__ void __launch_bounds__(max_bicgstab_threads)
+    apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
+                 const int max_iter, const gko::remove_complex<ValueType> tol,
+                 LogType logger, PrecType prec_shared,
+                 const BatchMatrixType mat,
+                 const ValueType* const __restrict__ b,
+                 ValueType* const __restrict__ x,
+                 ValueType* const __restrict__ workspace = nullptr)
 {
     using real_type = typename gko::remove_complex<ValueType>;
     const auto num_batch_items = mat.num_batch_items;
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp
index c8502e28b1f..2c42d359fff 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp
@@ -29,6 +29,11 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
+
+
+constexpr int max_cg_threads = 1024;  // launch bound of apply_kernel
+
+
 namespace batch_single_kernels {
 
 
@@ -115,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r(
 template <typename StopType, const int n_shared, const bool prec_shared_bool,
           typename PrecType, typename LogType, typename BatchMatrixType,
           typename ValueType>
-__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
-                             const int max_iter,
-                             const gko::remove_complex<ValueType> tol,
-                             LogType logger, PrecType prec_shared,
-                             const BatchMatrixType mat,
-                             const ValueType* const __restrict__ b,
-                             ValueType* const __restrict__ x,
-                             ValueType* const __restrict__ workspace = nullptr)
+__global__ void __launch_bounds__(max_cg_threads)
+    apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
+                 const int max_iter, const gko::remove_complex<ValueType> tol,
+                 LogType logger, PrecType prec_shared,
+                 const BatchMatrixType mat,
+                 const ValueType* const __restrict__ b,
+                 ValueType* const __restrict__ x,
+                 ValueType* const __restrict__ workspace = nullptr)
 {
     using real_type = typename gko::remove_complex<ValueType>;
     const auto num_batch_items = mat.num_batch_items;
diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu
index ec88cc17c85..ad17394c4a9 100644
--- a/cuda/solver/batch_bicgstab_launch.instantiate.cu
+++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu
@@ -27,20 +27,29 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     constexpr int warp_sz = static_cast<int>(config::warp_size);
     const int min_block_size = 2 * warp_sz;
     const int device_max_threads =
-        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 9, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    const int num_regs_used = funcattr.numRegs;
+        (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
+    auto get_num_regs = [](const auto func) {
+        cudaFuncAttributes funcattr;
+        cudaFuncGetAttributes(&funcattr, func);
+        return funcattr.numRegs;
+    };
+    const int num_regs_used = std::max(
+        get_num_regs(
+            batch_single_kernels::apply_kernel<StopType, 9, true, PrecType,
+                                               LogType, BatchMatrixType,
+                                               ValueType>),
+        get_num_regs(
+            batch_single_kernels::apply_kernel<StopType, 0, false, PrecType,
+                                               LogType, BatchMatrixType,
+                                               ValueType>));
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
                            exec->get_device_id());
     const int max_threads_regs =
         ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
     int max_threads = std::min(max_threads_regs, device_max_threads);
-    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    max_threads = max_threads <= max_bicgstab_threads ? max_threads
+                                                      : max_bicgstab_threads;
     return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
 }
 
diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu
index 9fca587f33e..89e96e85ace 100644
--- a/cuda/solver/batch_cg_launch.instantiate.cu
+++ b/cuda/solver/batch_cg_launch.instantiate.cu
@@ -28,19 +28,27 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int min_block_size = 2 * warp_sz;
     const int device_max_threads =
         (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(
-        &funcattr,
-        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
-                                           BatchMatrixType, ValueType>);
-    const int num_regs_used = funcattr.numRegs;
+    auto get_num_regs = [](const auto func) {
+        cudaFuncAttributes funcattr;
+        cudaFuncGetAttributes(&funcattr, func);
+        return funcattr.numRegs;
+    };
+    const int num_regs_used = std::max(
+        get_num_regs(
+            batch_single_kernels::apply_kernel<StopType, 5, true, PrecType,
+                                               LogType, BatchMatrixType,
+                                               ValueType>),
+        get_num_regs(
+            batch_single_kernels::apply_kernel<StopType, 0, false, PrecType,
+                                               LogType, BatchMatrixType,
+                                               ValueType>));
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
                            exec->get_device_id());
     const int max_threads_regs =
         ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
     int max_threads = std::min(max_threads_regs, device_max_threads);
-    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    max_threads = max_threads <= max_cg_threads ? max_threads : max_cg_threads;
     return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
 }
 

From b0b98ba2171b60aecd4b1ff78a176dd07dd58ca0 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 21 Oct 2024 11:19:25 +0200
Subject: [PATCH 269/448] [batch] add macro indirection

Co-authored-by: Tobias Ribizel <mail@ribizel.de>
---
 core/solver/batch_dispatch.hpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index ff5bb3f5390..3e3fd01a03c 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -164,12 +164,16 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
-#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                          \
-    macro(__VA_ARGS__,                                                  \
-          ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
-    template macro(                                                     \
-        __VA_ARGS__,                                                    \
-        ::gko::batch::solver::device::batch_stop::SimpleRelResidual)
+#define GKO_INDIRECT(...) __VA_ARGS__
+
+
+#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                               \
+    GKO_INDIRECT(                                                            \
+        macro(__VA_ARGS__,                                                   \
+              ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \
+    template GKO_INDIRECT(                                                   \
+        macro(__VA_ARGS__,                                                   \
+              ::gko::batch::solver::device::batch_stop::SimpleRelResidual))
 
 #define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...)                   \
     GKO_BATCH_INSTANTIATE_STOP(                                            \

From 546d77b77d44b69e7f44f2e3699d7ad3224cffae Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 7 Nov 2024 09:33:02 +0000
Subject: [PATCH 270/448] [batch] unify batch solver

---
 .../cuda_hip/solver/batch_bicgstab_launch.hpp | 35 +++++-------
 .../batch_bicgstab_launch.instantiate.cpp     | 18 +++---
 .../cuda_hip/solver/batch_cg_launch.hpp       | 55 +++++++++----------
 .../solver/batch_cg_launch.instantiate.cpp    | 30 +++++-----
 cuda/CMakeLists.txt                           | 15 +++--
 cuda/solver/batch_bicgstab_launch.cuh         | 52 +-----------------
 .../batch_bicgstab_launch.instantiate.cu      | 41 --------------
 cuda/solver/batch_cg_launch.cuh               | 44 +--------------
 cuda/solver/batch_cg_launch.instantiate.cu    | 33 -----------
 hip/CMakeLists.txt                            |  4 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     | 24 ++++----
 hip/solver/batch_cg_kernels.hip.cpp           | 19 +++----
 12 files changed, 101 insertions(+), 269 deletions(-)
 rename hip/solver/batch_bicgstab_launch.hip.hpp => common/cuda_hip/solver/batch_bicgstab_launch.hpp (76%)
 rename hip/solver/batch_bicgstab_launch.instantiate.hip.cpp => common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp (81%)
 rename hip/solver/batch_cg_launch.hip.hpp => common/cuda_hip/solver/batch_cg_launch.hpp (58%)
 rename hip/solver/batch_cg_launch.instantiate.hip.cpp => common/cuda_hip/solver/batch_cg_launch.instantiate.cpp (60%)

diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
similarity index 76%
rename from hip/solver/batch_bicgstab_launch.hip.hpp
rename to common/cuda_hip/solver/batch_bicgstab_launch.hpp
index 0f62a9487a3..3db03db0409 100644
--- a/hip/solver/batch_bicgstab_launch.hip.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
@@ -2,9 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_
-#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_
-
+#pragma once
 
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
@@ -17,7 +15,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_bicgstab {
 
 
@@ -32,24 +30,24 @@ void launch_apply_kernel(
     const gko::kernels::batch_bicgstab::storage_config& sconf,
     const settings<remove_complex<ValueType>>& settings, LogType& logger,
     PrecType& prec, const BatchMatrixType& mat,
-    const hip_type<ValueType>* const __restrict__ b_values,
-    hip_type<ValueType>* const __restrict__ x_values,
-    hip_type<ValueType>* const __restrict__ workspace_data,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace_data,
     const int& block_size, const size_t& shared_size);
 
 #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
                                           mat_t, log_t, pre_t, stop_t)     \
-    void launch_apply_kernel<_vtype, _n_shared, _prec_shared,              \
-                             stop_t<hip_type<_vtype>>>(                    \
+    void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
+                             stop_t<device_type<_vtype>>>(                 \
         std::shared_ptr<const DefaultExecutor> exec,                       \
         const gko::kernels::batch_bicgstab::storage_config& sconf,         \
-        const settings<remove_complex<_vtype>>& settings,                  \
-        log_t<hip_type<gko::remove_complex<_vtype>>>& logger,              \
-        pre_t<hip_type<_vtype>>& prec,                                     \
-        const mat_t<const hip_type<_vtype>>& mat,                          \
-        const hip_type<_vtype>* const __restrict__ b_values,               \
-        hip_type<_vtype>* const __restrict__ x_values,                     \
-        hip_type<_vtype>* const __restrict__ workspace_data,               \
+        const settings<remove_complex<device_type<_vtype>>>& settings,     \
+        log_t<gko::remove_complex<device_type<_vtype>>>& logger,           \
+        pre_t<device_type<_vtype>>& prec,                                  \
+        const mat_t<const device_type<_vtype>>& mat,                       \
+        const device_type<_vtype>* const __restrict__ b_values,            \
+        device_type<_vtype>* const __restrict__ x_values,                  \
+        device_type<_vtype>* const __restrict__ workspace_data,            \
         const int& block_size, const size_t& shared_size)
 
 #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
@@ -77,9 +75,6 @@ void launch_apply_kernel(
 
 
 }  // namespace batch_bicgstab
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
-
-
-#endif
diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
similarity index 81%
rename from hip/solver/batch_bicgstab_launch.instantiate.hip.cpp
rename to common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
index fb26c562a94..bff6babb446 100644
--- a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
@@ -2,18 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/solver/batch_bicgstab_launch.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_bicgstab {
 
 
@@ -24,15 +25,16 @@ void launch_apply_kernel(
     const gko::kernels::batch_bicgstab::storage_config& sconf,
     const settings<remove_complex<ValueType>>& settings, LogType& logger,
     PrecType& prec, const BatchMatrixType& mat,
-    const hip_type<ValueType>* const __restrict__ b_values,
-    hip_type<ValueType>* const __restrict__ x_values,
-    hip_type<ValueType>* const __restrict__ workspace_data,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace_data,
     const int& block_size, const size_t& shared_size)
 {
     batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
         <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
-            sconf, settings.max_iterations, as_hip_type(settings.residual_tol),
-            logger, prec, mat, b_values, x_values, workspace_data);
+            sconf, settings.max_iterations,
+            as_device_type(settings.residual_tol), logger, prec, mat, b_values,
+            x_values, workspace_data);
 }
 
 
@@ -62,6 +64,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
 
 
 }  // namespace batch_bicgstab
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/solver/batch_cg_launch.hip.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp
similarity index 58%
rename from hip/solver/batch_cg_launch.hip.hpp
rename to common/cuda_hip/solver/batch_cg_launch.hpp
index 7071c5c4065..6fa144ba35e 100644
--- a/hip/solver/batch_cg_launch.hip.hpp
+++ b/common/cuda_hip/solver/batch_cg_launch.hpp
@@ -2,9 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
-#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
-
+#pragma once
 
 #include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
@@ -17,7 +15,7 @@
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_cg {
 
 
@@ -27,29 +25,29 @@ using settings = gko::kernels::batch_cg::settings<T>;
 
 template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
           typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
-                         const gko::kernels::batch_cg::storage_config& sconf,
-                         const settings<remove_complex<ValueType>>& settings,
-                         LogType& logger, PrecType& prec,
-                         const BatchMatrixType& mat,
-                         const hip_type<ValueType>* const __restrict__ b_values,
-                         hip_type<ValueType>* const __restrict__ x_values,
-                         hip_type<ValueType>* const __restrict__ workspace_data,
-                         const int& block_size, const size_t& shared_size);
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size);
 
-#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
-                                    log_t, pre_t, stop_t)                   \
-    void launch_apply_kernel<hip_type<_vtype>, _n_shared, _prec_shared,     \
-                             stop_t<hip_type<_vtype>>>(                     \
-        std::shared_ptr<const DefaultExecutor> exec,                        \
-        const gko::kernels::batch_cg::storage_config& sconf,                \
-        const settings<remove_complex<_vtype>>& settings,                   \
-        log_t<hip_type<gko::remove_complex<hip_type<_vtype>>>>& logger,     \
-        pre_t<hip_type<_vtype>>& prec,                                      \
-        const mat_t<const hip_type<_vtype>>& mat,                           \
-        const hip_type<_vtype>* const __restrict__ b_values,                \
-        hip_type<_vtype>* const __restrict__ x_values,                      \
-        hip_type<_vtype>* const __restrict__ workspace_data,                \
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t,   \
+                                    log_t, pre_t, stop_t)                     \
+    void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared,    \
+                             stop_t<device_type<_vtype>>>(                    \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const gko::kernels::batch_cg::storage_config& sconf,                  \
+        const settings<remove_complex<_vtype>>& settings,                     \
+        log_t<device_type<gko::remove_complex<device_type<_vtype>>>>& logger, \
+        pre_t<device_type<_vtype>>& prec,                                     \
+        const mat_t<const device_type<_vtype>>& mat,                          \
+        const device_type<_vtype>* const __restrict__ b_values,               \
+        device_type<_vtype>* const __restrict__ x_values,                     \
+        device_type<_vtype>* const __restrict__ workspace_data,               \
         const int& block_size, const size_t& shared_size)
 
 #define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
@@ -69,9 +67,6 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
 
 
 }  // namespace batch_cg
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
-
-
-#endif
diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
similarity index 60%
rename from hip/solver/batch_cg_launch.instantiate.hip.cpp
rename to common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
index 3605a88651d..eef120df196 100644
--- a/hip/solver/batch_cg_launch.instantiate.hip.cpp
+++ b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
@@ -2,37 +2,39 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/solver/batch_cg_launch.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/solver/batch_cg_launch.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_cg {
 
 
 template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
           typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
-                         const gko::kernels::batch_cg::storage_config& sconf,
-                         const settings<remove_complex<ValueType>>& settings,
-                         LogType& logger, PrecType& prec,
-                         const BatchMatrixType& mat,
-                         const hip_type<ValueType>* const __restrict__ b_values,
-                         hip_type<ValueType>* const __restrict__ x_values,
-                         hip_type<ValueType>* const __restrict__ workspace_data,
-                         const int& block_size, const size_t& shared_size)
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size)
 {
     batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
         <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
-            sconf, settings.max_iterations, as_hip_type(settings.residual_tol),
-            logger, prec, mat, b_values, x_values, workspace_data);
+            sconf, settings.max_iterations,
+            as_device_type(settings.residual_tol), logger, prec, mat, b_values,
+            x_values, workspace_data);
 }
 
 
@@ -54,6 +56,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
 
 
 }  // namespace batch_cg
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bfa65eee79b..7567a1adf3c 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -3,8 +3,10 @@ add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
-add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
-add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_bicgstab_launch.instantiate.cpp BATCH_BICGSTAB_INSTANTIATE1)
+add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE2)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_cg_launch.instantiate.cpp BATCH_CG_INSTANTIATE1)
+add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE2)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -23,9 +25,11 @@ target_sources(ginkgo_cuda
     matrix/fft_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     solver/batch_bicgstab_kernels.cu
-    ${BATCH_BICGSTAB_INSTANTIATE}
+    ${BATCH_BICGSTAB_INSTANTIATE1}
+    ${BATCH_BICGSTAB_INSTANTIATE2}
     solver/batch_cg_kernels.cu
-    ${BATCH_CG_INSTANTIATE}
+    ${BATCH_CG_INSTANTIATE1}
+    ${BATCH_CG_INSTANTIATE2}
     solver/lower_trs_kernels.cu
     solver/upper_trs_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}
@@ -41,7 +45,8 @@ else()
 endif()
 jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
 # override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES CSR_INSTANTIATE FBCSR_INSTANTIATE)
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES
+                             CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_CG_INSTANTIATE1)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
index 5106b21251e..76528c84670 100644
--- a/cuda/solver/batch_bicgstab_launch.cuh
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -10,6 +10,7 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
@@ -59,57 +60,6 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
         GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype)
 
 
-template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
-          typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const gko::kernels::batch_bicgstab::storage_config& sconf,
-    const settings<remove_complex<ValueType>>& settings, LogType& logger,
-    PrecType& prec, const BatchMatrixType& mat,
-    const ValueType* const __restrict__ b_values,
-    ValueType* const __restrict__ x_values,
-    ValueType* const __restrict__ workspace_data, const int& block_size,
-    const size_t& shared_size);
-
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
-                                          mat_t, log_t, pre_t, stop_t)     \
-    void launch_apply_kernel<cuda_type<_vtype>, _n_shared, _prec_shared,   \
-                             stop_t<cuda_type<_vtype>>>(                   \
-        std::shared_ptr<const DefaultExecutor> exec,                       \
-        const gko::kernels::batch_bicgstab::storage_config& sconf,         \
-        const settings<remove_complex<cuda_type<_vtype>>>& settings,       \
-        log_t<gko::remove_complex<cuda_type<_vtype>>>& logger,             \
-        pre_t<cuda_type<_vtype>>& prec,                                    \
-        const mat_t<const cuda_type<_vtype>>& mat,                         \
-        const cuda_type<_vtype>* const __restrict__ b_values,              \
-        cuda_type<_vtype>* const __restrict__ x_values,                    \
-        cuda_type<_vtype>* const __restrict__ workspace_data,              \
-        const int& block_size, const size_t& shared_size)
-
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true)
-
-
 }  // namespace batch_bicgstab
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu
index ad17394c4a9..629b4f9c6ad 100644
--- a/cuda/solver/batch_bicgstab_launch.instantiate.cu
+++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu
@@ -75,53 +75,12 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
 }
 
 
-template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
-          typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const gko::kernels::batch_bicgstab::storage_config& sconf,
-    const settings<remove_complex<ValueType>>& settings, LogType& logger,
-    PrecType& prec, const BatchMatrixType& mat,
-    const ValueType* const __restrict__ b_values,
-    ValueType* const __restrict__ x_values,
-    ValueType* const __restrict__ workspace_data, const int& block_size,
-    const size_t& shared_size)
-{
-    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
-        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
-            sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
-            logger, prec, mat, b_values, x_values, workspace_data);
-}
-
-
 // begin
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK);
 // split
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
 // end
 
 
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index 9cb470eb51b..dafaaf19a9f 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -10,6 +10,7 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
+#include "common/cuda_hip/solver/batch_cg_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
@@ -59,49 +60,6 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
                           _vtype)
 
 
-template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
-          typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const gko::kernels::batch_cg::storage_config& sconf,
-    const settings<remove_complex<ValueType>>& settings, LogType& logger,
-    PrecType& prec, const BatchMatrixType& mat,
-    const cuda_type<ValueType>* const __restrict__ b_values,
-    cuda_type<ValueType>* const __restrict__ x_values,
-    cuda_type<ValueType>* const __restrict__ workspace_data,
-    const int& block_size, const size_t& shared_size);
-
-#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
-                                    log_t, pre_t, stop_t)                   \
-    void launch_apply_kernel<cuda_type<_vtype>, _n_shared, _prec_shared,    \
-                             stop_t<cuda_type<_vtype>>>(                    \
-        std::shared_ptr<const DefaultExecutor> exec,                        \
-        const gko::kernels::batch_cg::storage_config& sconf,                \
-        const settings<remove_complex<_vtype>>& settings,                   \
-        log_t<cuda_type<gko::remove_complex<cuda_type<_vtype>>>>& logger,   \
-        pre_t<cuda_type<_vtype>>& prec,                                     \
-        const mat_t<const cuda_type<_vtype>>& mat,                          \
-        const cuda_type<_vtype>* const __restrict__ b_values,               \
-        cuda_type<_vtype>* const __restrict__ x_values,                     \
-        cuda_type<_vtype>* const __restrict__ workspace_data,               \
-        const int& block_size, const size_t& shared_size)
-
-#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)
-
-
 }  // namespace batch_cg
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu
index 89e96e85ace..70c5cecb6f5 100644
--- a/cuda/solver/batch_cg_launch.instantiate.cu
+++ b/cuda/solver/batch_cg_launch.instantiate.cu
@@ -74,45 +74,12 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
 }
 
 
-template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
-          typename PrecType, typename LogType, typename BatchMatrixType>
-void launch_apply_kernel(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const gko::kernels::batch_cg::storage_config& sconf,
-    const settings<remove_complex<ValueType>>& settings, LogType& logger,
-    PrecType& prec, const BatchMatrixType& mat,
-    const cuda_type<ValueType>* const __restrict__ b_values,
-    cuda_type<ValueType>* const __restrict__ x_values,
-    cuda_type<ValueType>* const __restrict__ workspace_data,
-    const int& block_size, const size_t& shared_size)
-{
-    batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
-        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
-            sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
-            logger, prec, mat, b_values, x_values, workspace_data);
-}
-
-
 // begin
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK);
 // split
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
-// split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
 // end
 
 
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 4a540046322..68be287a722 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
-add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE)
-add_instantiation_files(. solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_bicgstab_launch.instantiate.cpp BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_cg_launch.instantiate.cpp BATCH_CG_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 697bcb94551..f3e770c609d 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -9,9 +9,9 @@
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/solver/batch_bicgstab_launch.hip.hpp"
 
 
 namespace gko {
@@ -95,58 +95,58 @@ class kernel_caller {
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
         if (sconf.prec_shared) {
-            launch_apply_kernel<ValueType, 9, true, StopType>(
+            launch_apply_kernel<hip_value_type, 9, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<ValueType, 0, false, StopType>(
+                launch_apply_kernel<hip_value_type, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<ValueType, 1, false, StopType>(
+                launch_apply_kernel<hip_value_type, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<ValueType, 2, false, StopType>(
+                launch_apply_kernel<hip_value_type, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<ValueType, 3, false, StopType>(
+                launch_apply_kernel<hip_value_type, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<ValueType, 4, false, StopType>(
+                launch_apply_kernel<hip_value_type, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<ValueType, 5, false, StopType>(
+                launch_apply_kernel<hip_value_type, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 6:
-                launch_apply_kernel<ValueType, 6, false, StopType>(
+                launch_apply_kernel<hip_value_type, 6, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 7:
-                launch_apply_kernel<ValueType, 7, false, StopType>(
+                launch_apply_kernel<hip_value_type, 7, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 8:
-                launch_apply_kernel<ValueType, 8, false, StopType>(
+                launch_apply_kernel<hip_value_type, 8, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 9:
-                launch_apply_kernel<ValueType, 9, false, StopType>(
+                launch_apply_kernel<hip_value_type, 9, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 25ebd667a7e..457dfcdefcf 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -9,9 +9,9 @@
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
+#include "common/cuda_hip/solver/batch_cg_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/solver/batch_cg_launch.hip.hpp"
 
 
 namespace gko {
@@ -97,38 +97,38 @@ class kernel_caller {
         // Template parameters launch_apply_kernel<ValueType, n_shared,
         // prec_shared, StopType>
         if (sconf.prec_shared) {
-            launch_apply_kernel<ValueType, 5, true, StopType>(
+            launch_apply_kernel<hip_value_type, 5, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<ValueType, 0, false, StopType>(
+                launch_apply_kernel<hip_value_type, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<ValueType, 1, false, StopType>(
+                launch_apply_kernel<hip_value_type, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<ValueType, 2, false, StopType>(
+                launch_apply_kernel<hip_value_type, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<ValueType, 3, false, StopType>(
+                launch_apply_kernel<hip_value_type, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<ValueType, 4, false, StopType>(
+                launch_apply_kernel<hip_value_type, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<ValueType, 5, false, StopType>(
+                launch_apply_kernel<hip_value_type, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
@@ -153,9 +153,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    using hip_value_type = hip_type<ValueType>;
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<hip_value_type>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
     dispatcher.apply(b, x, logdata);
 }
 

From bb2685434921717641c5910923660458d5130944 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Thu, 7 Nov 2024 16:42:18 +0000
Subject: [PATCH 271/448] [batch] split batch solver (sycl)

---
 dpcpp/CMakeLists.txt                          |   6 +-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    | 208 ++++++------------
 dpcpp/solver/batch_bicgstab_launch.hpp        |  85 +++++++
 .../batch_bicgstab_launch.instantiate.dp.cpp  | 111 ++++++++++
 dpcpp/solver/batch_cg_kernels.dp.cpp          | 157 ++++---------
 dpcpp/solver/batch_cg_launch.hpp              |  74 +++++++
 .../solver/batch_cg_launch.instantiate.dp.cpp | 110 +++++++++
 7 files changed, 503 insertions(+), 248 deletions(-)
 create mode 100644 dpcpp/solver/batch_bicgstab_launch.hpp
 create mode 100644 dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
 create mode 100644 dpcpp/solver/batch_cg_launch.hpp
 create mode 100644 dpcpp/solver/batch_cg_launch.instantiate.dp.cpp

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 516e9307e30..fcf123a513b 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -1,11 +1,13 @@
 find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}" "$ENV{MKL_ROOT}")
 find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}")
-# use the parameter from cmake 
+# use the parameter from cmake
 set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE)
 set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE)
 
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE)
+add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.dp.cpp BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(. solver/batch_cg_launch.instantiate.dp.cpp BATCH_CG_INSTANTIATE)
 add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
 target_sources(ginkgo_dpcpp
     PRIVATE
@@ -59,7 +61,9 @@ target_sources(ginkgo_dpcpp
     preconditioner/sor_kernels.dp.cpp
     reorder/rcm_kernels.dp.cpp
     solver/batch_bicgstab_kernels.dp.cpp
+    ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.dp.cpp
+    ${BATCH_CG_INSTANTIATE}
     solver/cb_gmres_kernels.dp.cpp
     solver/idr_kernels.dp.cpp
     solver/lower_trs_kernels.dp.cpp
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 74648aecf44..2aa98c26ed1 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -6,28 +6,14 @@
 
 #include <CL/sycl.hpp>
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
-#include "dpcpp/base/config.hpp"
-#include "dpcpp/base/dim3.dp.hpp"
-#include "dpcpp/base/dpct.hpp"
-#include "dpcpp/base/helper.hpp"
-#include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
-#include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/matrix/batch_csr_kernels.hpp"
-#include "dpcpp/matrix/batch_dense_kernels.hpp"
-#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
-#include "dpcpp/solver/batch_bicgstab_kernels.hpp"
+#include "dpcpp/solver/batch_bicgstab_launch.hpp"
 
 
 namespace gko {
@@ -40,8 +26,7 @@ template <typename T>
 using settings = gko::kernels::batch_bicgstab::settings<T>;
 
 
-__dpct_inline__ int get_group_size(int value,
-                                   int subgroup_size = config::warp_size)
+int get_group_size(int value, int subgroup_size = config::warp_size)
 {
     int num_sg = ceildiv(value, subgroup_size);
     return num_sg * subgroup_size;
@@ -56,53 +41,6 @@ class kernel_caller {
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename StopType, const int subgroup_size,
-              const int n_shared_total, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    __dpct_inline__ void launch_apply_kernel(
-        const gko::kernels::batch_bicgstab::storage_config& sconf,
-        LogType& logger, PrecType& prec, const BatchMatrixType mat,
-        const ValueType* const __restrict__ b_values,
-        ValueType* const __restrict__ x_values,
-        ValueType* const __restrict__ workspace, const int& group_size,
-        const int& shared_size) const
-    {
-        auto num_rows = mat.num_rows;
-
-        const dim3 block(group_size);
-        const dim3 grid(mat.num_batch_items);
-
-        auto max_iters = settings_.max_iterations;
-        auto res_tol = settings_.residual_tol;
-
-        exec_->get_queue()->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<ValueType, 1> slm_values(
-                sycl::range<1>(shared_size), cgh);
-
-            cgh.parallel_for(
-                sycl_nd_range(grid, block),
-                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
-                    subgroup_size)]] [[intel::kernel_args_restrict]] {
-                    auto batch_id = item_ct1.get_group_linear_id();
-                    const auto mat_global_entry =
-                        gko::batch::matrix::extract_batch_item(mat, batch_id);
-                    const ValueType* const b_global_entry =
-                        gko::batch::multi_vector::batch_item_ptr(
-                            b_values, 1, num_rows, batch_id);
-                    ValueType* const x_global_entry =
-                        gko::batch::multi_vector::batch_item_ptr(
-                            x_values, 1, num_rows, batch_id);
-                    batch_single_kernels::apply_kernel<StopType,
-                                                       n_shared_total>(
-                        sconf, max_iters, res_tol, logger, prec,
-                        mat_global_entry, b_global_entry, x_global_entry,
-                        num_rows, mat.get_single_item_num_nnz(),
-                        static_cast<ValueType*>(slm_values.get_pointer()),
-                        item_ct1, workspace);
-                });
-        });
-    }
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
@@ -152,80 +90,76 @@ class kernel_caller {
         ValueType* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
-        // TODO: split compilation
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-        // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
-        // if (num_rows <= 32 && n_shared_total == 10) {
-        //     launch_apply_kernel<StopType, 32, 10>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         group_size, shared_size);
-        // } else if (num_rows <= 256 && n_shared_total == 10) {
-        //     launch_apply_kernel<StopType, 32, 10>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         group_size, shared_size);
-        // } else {
-        //     switch (n_shared_total) {
-        //     case 0:
-        launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
-                                             x.values, workspace_data,
-                                             group_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 32, 1>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 32, 2>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 32, 3>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 32, 4>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 32, 5>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 6:
-        //         launch_apply_kernel<StopType, 32, 6>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 7:
-        //         launch_apply_kernel<StopType, 32, 7>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 8:
-        //         launch_apply_kernel<StopType, 32, 8>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 9:
-        //         launch_apply_kernel<StopType, 32, 9>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 10:
-        //         launch_apply_kernel<StopType, 32, 10>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+        if (num_rows <= 32 && n_shared_total == 10) {
+            launch_apply_kernel<ValueType, StopType, 16, 10>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, group_size, shared_size);
+        } else if (num_rows <= 256 && n_shared_total == 10) {
+            launch_apply_kernel<ValueType, StopType, 32, 10>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, group_size, shared_size);
+        } else {
+            switch (n_shared_total) {
+            case 0:
+                launch_apply_kernel<ValueType, StopType, 32, 0>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<ValueType, StopType, 32, 1>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<ValueType, StopType, 32, 2>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<ValueType, StopType, 32, 3>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<ValueType, StopType, 32, 4>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<ValueType, StopType, 32, 5>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<ValueType, StopType, 32, 6>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<ValueType, StopType, 32, 7>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<ValueType, StopType, 32, 8>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<ValueType, StopType, 32, 9>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 10:
+                launch_apply_kernel<ValueType, StopType, 32, 10>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp
new file mode 100644
index 00000000000..e4b1917ee9b
--- /dev/null
+++ b/dpcpp/solver/batch_bicgstab_launch.hpp
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <CL/sycl.hpp>
+
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_bicgstab {
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename ValueType, typename StopType, const int subgroup_size,
+          const int n_shared_total, typename PrecType, typename LogType,
+          typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const ValueType* const __restrict__ b_values,
+    ValueType* const __restrict__ x_values,
+    ValueType* const __restrict__ workspace, const int& group_size,
+    const int& shared_size);  // defined in the .instantiate.dp.cpp file
+
+
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _subgroup_size, _n_shared, \
+                                          mat_t, log_t, pre_t, stop_t)       \
+    void /* signature of one launch_apply_kernel specialization */           \
+    launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>(  \
+        std::shared_ptr<const DefaultExecutor> exec,                         \
+        const gko::kernels::batch_bicgstab::storage_config& sconf,           \
+        const settings<remove_complex<_vtype>>& settings,                    \
+        log_t<gko::remove_complex<_vtype>>& logger, pre_t<_vtype>& prec,     \
+        const mat_t<const _vtype>& mat,                                      \
+        const _vtype* const __restrict__ b_values,                           \
+        _vtype* const __restrict__ x_values,                                 \
+        _vtype* const __restrict__ workspace, const int& group_size,         \
+        const int& shared_size)
+// One macro per (subgroup_size, n_shared_total) pair: 32 x 0..10, 16 x 10.
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 0)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 1)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 2)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 3)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 4)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 5)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 6)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 7)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 8)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 9)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 10)
+#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 16, 10)
+
+
+}  // namespace batch_bicgstab
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
new file mode 100644
index 00000000000..94c5e7462ce
--- /dev/null
+++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
@@ -0,0 +1,111 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "dpcpp/solver/batch_bicgstab_launch.hpp"
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/solver/batch_bicgstab_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_bicgstab {
+
+// Enqueues the batch_bicgstab apply kernel on the executor's SYCL queue.
+template <typename ValueType, typename StopType, const int subgroup_size,
+          const int n_shared_total, typename PrecType, typename LogType,
+          typename BatchMatrixType>
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_bicgstab::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const ValueType* const __restrict__ b_values,
+    ValueType* const __restrict__ x_values,
+    ValueType* const __restrict__ workspace, const int& group_size,
+    const int& shared_size)
+{
+    auto num_rows = mat.num_rows;
+    // Launch geometry: one group per batch item (see batch_id in the kernel).
+    const dim3 block(group_size);
+    const dim3 grid(mat.num_batch_items);
+    // Plain-value copies of the settings, captured by value in the kernel.
+    auto max_iters = settings.max_iterations;
+    auto res_tol = settings.residual_tol;
+    // Enqueue only: the event returned by submit() is not waited on here.
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<ValueType, 1> slm_values(
+            sycl::range<1>(shared_size), cgh);  // work-group local memory
+        // Each work-group processes the batch item given by its group id.
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
+                subgroup_size)]] [[intel::kernel_args_restrict]] {
+                auto batch_id = item_ct1.get_group_linear_id();
+                const auto mat_global_entry =
+                    gko::batch::matrix::extract_batch_item(mat, batch_id);
+                const ValueType* const b_global_entry =
+                    gko::batch::multi_vector::batch_item_ptr(
+                        b_values, 1, num_rows, batch_id);
+                ValueType* const x_global_entry =
+                    gko::batch::multi_vector::batch_item_ptr(
+                        x_values, 1, num_rows, batch_id);
+                batch_single_kernels::apply_kernel<StopType, n_shared_total>(
+                    sconf, max_iters, res_tol, logger, prec, mat_global_entry,
+                    b_global_entry, x_global_entry, num_rows,
+                    mat.get_single_item_num_nnz(),
+                    static_cast<ValueType*>(slm_values.get_pointer()), item_ct1,
+                    workspace);
+            });
+    });
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16);
+// end
+
+
+}  // namespace batch_bicgstab
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 26f5c864187..43807583754 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -6,28 +6,15 @@
 
 #include <CL/sycl.hpp>
 
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/solver/batch_cg.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
-#include "dpcpp/base/config.hpp"
-#include "dpcpp/base/dim3.dp.hpp"
-#include "dpcpp/base/dpct.hpp"
-#include "dpcpp/base/helper.hpp"
-#include "dpcpp/components/cooperative_groups.dp.hpp"
-#include "dpcpp/components/intrinsics.dp.hpp"
-#include "dpcpp/components/reduction.dp.hpp"
-#include "dpcpp/components/thread_ids.dp.hpp"
-#include "dpcpp/matrix/batch_csr_kernels.hpp"
-#include "dpcpp/matrix/batch_dense_kernels.hpp"
-#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 #include "dpcpp/solver/batch_cg_kernels.hpp"
+#include "dpcpp/solver/batch_cg_launch.hpp"
 
 
 namespace gko {
@@ -40,8 +27,7 @@ template <typename T>
 using settings = gko::kernels::batch_cg::settings<T>;
 
 
-__dpct_inline__ int get_group_size(int value,
-                                   int subgroup_size = config::warp_size)
+int get_group_size(int value, int subgroup_size = config::warp_size)
 {
     int num_sg = ceildiv(value, subgroup_size);
     return num_sg * subgroup_size;
@@ -56,53 +42,6 @@ class kernel_caller {
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename StopType, const int subgroup_size,
-              const int n_shared_total, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType mat,
-        const ValueType* const __restrict__ b_values,
-        ValueType* const __restrict__ x_values,
-        ValueType* const __restrict__ workspace, const int& group_size,
-        const int& shared_size) const
-    {
-        auto num_rows = mat.num_rows;
-
-        const dim3 block(group_size);
-        const dim3 grid(mat.num_batch_items);
-
-        auto max_iters = settings_.max_iterations;
-        auto res_tol = settings_.residual_tol;
-
-        exec_->get_queue()->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<ValueType, 1> slm_values(
-                sycl::range<1>(shared_size), cgh);
-
-            cgh.parallel_for(
-                sycl_nd_range(grid, block),
-                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
-                    subgroup_size)]] [[intel::kernel_args_restrict]] {
-                    auto batch_id = item_ct1.get_group_linear_id();
-                    const auto mat_global_entry =
-                        gko::batch::matrix::extract_batch_item(mat, batch_id);
-                    const ValueType* const b_global_entry =
-                        gko::batch::multi_vector::batch_item_ptr(
-                            b_values, 1, num_rows, batch_id);
-                    ValueType* const x_global_entry =
-                        gko::batch::multi_vector::batch_item_ptr(
-                            x_values, 1, num_rows, batch_id);
-                    batch_single_kernels::apply_kernel<StopType,
-                                                       n_shared_total>(
-                        sconf, max_iters, res_tol, logger, prec,
-                        mat_global_entry, b_global_entry, x_global_entry,
-                        num_rows, mat.get_single_item_num_nnz(),
-                        static_cast<ValueType*>(slm_values.get_pointer()),
-                        item_ct1, workspace);
-                });
-        });
-    }
-
     template <typename BatchMatrixType, typename PrecType, typename StopType,
               typename LogType>
     void call_kernel(
@@ -151,55 +90,53 @@ class kernel_caller {
         ValueType* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
         // template
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
-        // if (num_rows <= 32 && n_shared_total == 6) {
-        //     launch_apply_kernel<StopType, 16, 6>(
-        //         sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //         group_size, shared_size);
-        // } else {
-        //     switch (n_shared_total) {
-        //     case 0:
-        launch_apply_kernel<StopType, 32, 0>(sconf, logger, prec, mat, b.values,
-                                             x.values, workspace_data,
-                                             group_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel<StopType, 32, 1>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel<StopType, 32, 2>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel<StopType, 32, 3>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel<StopType, 32, 4>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel<StopType, 32, 5>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     case 6:
-        //         launch_apply_kernel<StopType, 32, 6>(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, group_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+        if (num_rows <= 32 && n_shared_total == 6) {
+            launch_apply_kernel<ValueType, StopType, 16, 6>(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, group_size, shared_size);
+        } else {
+            switch (n_shared_total) {
+            case 0:
+                launch_apply_kernel<ValueType, StopType, 32, 0>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<ValueType, StopType, 32, 1>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<ValueType, StopType, 32, 2>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<ValueType, StopType, 32, 3>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<ValueType, StopType, 32, 4>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<ValueType, StopType, 32, 5>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<ValueType, StopType, 32, 6>(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, group_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 
 private:
diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp
new file mode 100644
index 00000000000..e756bf60c2e
--- /dev/null
+++ b/dpcpp/solver/batch_cg_launch.hpp
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <CL/sycl.hpp>
+
+#include <ginkgo/core/solver/batch_cg.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_cg {
+
+
+template <typename T>
+using settings = gko::kernels::batch_cg::settings<T>;
+
+// Defined and explicitly instantiated in batch_cg_launch.instantiate.dp.cpp.
+template <typename ValueType, typename StopType, const int subgroup_size,
+          const int n_shared_total, typename PrecType, typename LogType,
+          typename BatchMatrixType>
+void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
+                         const gko::kernels::batch_cg::storage_config& sconf,
+                         const settings<remove_complex<ValueType>>& settings,
+                         LogType& logger, PrecType& prec,
+                         const BatchMatrixType& mat,
+                         const ValueType* const __restrict__ b_values,
+                         ValueType* const __restrict__ x_values,
+                         ValueType* const __restrict__ workspace,
+                         const int& group_size, const int& shared_size);
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _subgroup_size, _n_shared, mat_t, \
+                                    log_t, pre_t, stop_t)                     \
+    void                                                                      \
+    launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>(   \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const gko::kernels::batch_cg::storage_config& sconf,                  \
+        const settings<remove_complex<_vtype>>& settings,                     \
+        log_t<gko::remove_complex<_vtype>>& logger, pre_t<_vtype>& prec,      \
+        const mat_t<const _vtype>& mat,                                       \
+        const _vtype* const __restrict__ b_values,                            \
+        _vtype* const __restrict__ x_values,                                  \
+        _vtype* const __restrict__ workspace_data, const int& block_size,     \
+        const int& shared_size)
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH_0(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 0)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_1(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 1)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_2(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 2)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_3(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 3)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_4(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 4)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 5)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_6(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 6)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_6_16(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 16, 6)
+
+
+}  // namespace batch_cg
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
new file mode 100644
index 00000000000..a45150b0d6c
--- /dev/null
+++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
@@ -0,0 +1,110 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "dpcpp/solver/batch_cg_launch.hpp"
+
+#include <CL/sycl.hpp>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/solver/batch_cg.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+#include "dpcpp/solver/batch_cg_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_cg {
+
+// Enqueues the batch_cg apply kernel on the executor's SYCL queue.
+template <typename ValueType, typename StopType, const int subgroup_size,
+          const int n_shared_total, typename PrecType, typename LogType,
+          typename BatchMatrixType>
+void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
+                         const gko::kernels::batch_cg::storage_config& sconf,
+                         const settings<remove_complex<ValueType>>& settings,
+                         LogType& logger, PrecType& prec,
+                         const BatchMatrixType& mat,
+                         const ValueType* const __restrict__ b_values,
+                         ValueType* const __restrict__ x_values,
+                         ValueType* const __restrict__ workspace,
+                         const int& group_size, const int& shared_size)
+{
+    auto num_rows = mat.num_rows;
+    // Launch geometry: one group per batch item (see batch_id in the kernel).
+    const dim3 block(group_size);
+    const dim3 grid(mat.num_batch_items);
+    // Plain-value copies of the settings, captured by value in the kernel.
+    auto max_iters = settings.max_iterations;
+    auto res_tol = settings.residual_tol;
+    // Enqueue only: the event returned by submit() is not waited on here.
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<ValueType, 1> slm_values(
+            sycl::range<1>(shared_size), cgh);  // work-group local memory
+        // Each work-group processes the batch item given by its group id.
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
+                subgroup_size)]] [[intel::kernel_args_restrict]] {
+                auto batch_id = item_ct1.get_group_linear_id();
+                const auto mat_global_entry =
+                    gko::batch::matrix::extract_batch_item(mat, batch_id);
+                const ValueType* const b_global_entry =
+                    gko::batch::multi_vector::batch_item_ptr(
+                        b_values, 1, num_rows, batch_id);
+                ValueType* const x_global_entry =
+                    gko::batch::multi_vector::batch_item_ptr(
+                        x_values, 1, num_rows, batch_id);
+                batch_single_kernels::apply_kernel<StopType, n_shared_total>(
+                    sconf, max_iters, res_tol, logger, prec, mat_global_entry,
+                    b_global_entry, x_global_entry, num_rows,
+                    mat.get_single_item_num_nnz(),
+                    static_cast<ValueType*>(slm_values.get_pointer()), item_ct1,
+                    workspace);
+            });
+    });
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6_16);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko

From 1178a68f5e8fd7abb24820caba853e2814ad2ba2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 12 Nov 2024 11:35:46 +0000
Subject: [PATCH 272/448] [core] add instantiation macro with variable args

---
 include/ginkgo/core/base/types.hpp | 48 ++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index e375da15f9c..ceffec9b275 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -442,6 +442,54 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
+/**
+ * Instantiates a template for each non-complex value type compiled by Ginkgo.
+ *
+ * @param _macro  A macro which expands the template instantiation
+ *                (not including the leading `template` specifier).
+ *                Should take at least two arguments, of which the first one
+ *                is the value type.
+ *
+ * @note This won't be necessary after upgrading to C++20
+ */
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
+    template _macro(float, __VA_ARGS__);                                   \
+    template <>                                                            \
+    _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+#else
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
+    template _macro(float, __VA_ARGS__);                                   \
+    template _macro(double, __VA_ARGS__)
+#endif
+
+
+/**
+ * Instantiates a template for each value type compiled by Ginkgo.
+ *
+ * @param _macro  A macro which expands the template instantiation
+ *                (not including the leading `template` specifier).
+ *                Should take at least two arguments, of which the first one
+ *                is the value type.
+ *
+ * @note This won't be necessary after upgrading to C++20
+ */
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
+                                                          __VA_ARGS__); \
+    template _macro(std::complex<float>, __VA_ARGS__);                  \
+    template <>                                                         \
+    _macro(std::complex<double>, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+#else
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
+                                                          __VA_ARGS__); \
+    template _macro(std::complex<float>, __VA_ARGS__);                  \
+    template _macro(std::complex<double>, __VA_ARGS__)
+#endif
+
+
 /**
  * Instantiates a template for each value and scalar type compiled by Ginkgo.
  * This means all value and scalar type combinations for which

From 292e45d585bf78960cbafc822dddddeaab6cb0f5 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 12 Nov 2024 12:02:50 +0000
Subject: [PATCH 273/448] [batch] switch order of batch dispatch and value
 instantiation macros

---
 .../cuda_hip/solver/batch_bicgstab_launch.hpp | 48 +++++++++--------
 .../batch_bicgstab_launch.instantiate.cpp     | 22 ++++----
 common/cuda_hip/solver/batch_cg_launch.hpp    | 32 +++++++-----
 .../solver/batch_cg_launch.instantiate.cpp    | 14 ++---
 core/solver/batch_dispatch.hpp                | 44 +++++++++++-----
 cuda/CMakeLists.txt                           |  2 +-
 cuda/solver/batch_bicgstab_launch.cuh         | 21 +++++---
 .../batch_bicgstab_launch.instantiate.cu      |  6 +--
 cuda/solver/batch_cg_launch.cuh               | 20 ++++---
 cuda/solver/batch_cg_launch.instantiate.cu    |  6 +--
 dpcpp/solver/batch_bicgstab_launch.hpp        | 52 ++++++++++---------
 .../batch_bicgstab_launch.instantiate.dp.cpp  | 24 ++++-----
 dpcpp/solver/batch_cg_launch.hpp              | 36 +++++++------
 .../solver/batch_cg_launch.instantiate.dp.cpp | 16 +++---
 14 files changed, 196 insertions(+), 147 deletions(-)

diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
index 3db03db0409..696e11b5899 100644
--- a/common/cuda_hip/solver/batch_bicgstab_launch.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
@@ -50,28 +50,32 @@ void launch_apply_kernel(
         device_type<_vtype>* const __restrict__ workspace_data,            \
         const int& block_size, const size_t& shared_size)
 
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(     \
+        GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 0, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 1, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 2, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 3, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 4, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 5, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 6, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 7, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 8, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, false)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, true)
 
 
 }  // namespace batch_bicgstab
diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
index bff6babb446..181fd925c4c 100644
--- a/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
@@ -39,27 +39,27 @@ void launch_apply_kernel(
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE;
 // end
 
 
diff --git a/common/cuda_hip/solver/batch_cg_launch.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp
index 6fa144ba35e..fe5d96c8a21 100644
--- a/common/cuda_hip/solver/batch_cg_launch.hpp
+++ b/common/cuda_hip/solver/batch_cg_launch.hpp
@@ -50,20 +50,24 @@ void launch_apply_kernel(
         device_type<_vtype>* const __restrict__ workspace_data,               \
         const int& block_size, const size_t& shared_size)
 
-#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...)                               \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \
+                                              __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 0, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 1, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 2, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 3, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 4, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, false)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, true)
 
 
 }  // namespace batch_cg
diff --git a/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
index eef120df196..bedc0bab9a5 100644
--- a/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
+++ b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
@@ -39,19 +39,19 @@ void launch_apply_kernel(
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE;
 // end
 
 
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 3e3fd01a03c..0580325d67b 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -171,7 +171,7 @@ enum class log_type { simple_convergence_completion };
     GKO_INDIRECT(                                                            \
         macro(__VA_ARGS__,                                                   \
               ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \
-    template GKO_INDIRECT(                                                   \
+    GKO_INDIRECT(                                                            \
         macro(__VA_ARGS__,                                                   \
               ::gko::batch::solver::device::batch_stop::SimpleRelResidual))
 
@@ -179,10 +179,10 @@ enum class log_type { simple_convergence_completion };
     GKO_BATCH_INSTANTIATE_STOP(                                            \
         macro, __VA_ARGS__,                                                \
         ::gko::batch::solver::device::batch_preconditioner::Identity);     \
-    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+    GKO_BATCH_INSTANTIATE_STOP(                                            \
         macro, __VA_ARGS__,                                                \
         ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
-    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+    GKO_BATCH_INSTANTIATE_STOP(                                            \
         macro, __VA_ARGS__,                                                \
         ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)
 
@@ -191,16 +191,36 @@ enum class log_type { simple_convergence_completion };
         macro, __VA_ARGS__,                      \
         ::gko::batch::solver::device::batch_log::SimpleFinalLogger)
 
-#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...)                     \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                 \
-                                 batch::matrix::ell::uniform_batch); \
-    template GKO_BATCH_INSTANTIATE_LOGGER(                           \
-        macro, __VA_ARGS__, batch::matrix::dense::uniform_batch);    \
-    template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,        \
-                                          batch::matrix::csr::uniform_batch)
+#define GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, ...)                 \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
+                                 batch::matrix::ell::uniform_batch);   \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
+                                 batch::matrix::dense::uniform_batch); \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
+                                 batch::matrix::csr::uniform_batch)
+
+/**
+ * Passes each valid configuration of batch solver template parameters to a
+ * macro. The order of template parameters is: macro(..., <matrix>, <logger>,
+ * <precond>, <stop>). Any additional macro parameters passed to
+ * GKO_BATCH_INSTANTIATE_VARGS will be prepended to the batch solver template
+ * parameters.
+ */
+#define GKO_BATCH_INSTANTIATE_VARGS(macro, ...) \
+    GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, __VA_ARGS__)
+
+
+/**
+ * Passes each valid configuration of batch solver template parameters to a
+ * macro. The order of template parameters is: macro(<matrix>, <logger>,
+ * <precond>, <stop>).
+ */
+#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...)                              \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::ell::uniform_batch);   \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::dense::uniform_batch); \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::csr::uniform_batch)
 
-#define GKO_BATCH_INSTANTIATE(macro, ...) \
-    GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__)
+#define GKO_BATCH_INSTANTIATE(macro) GKO_BATCH_INSTANTIATE_MATRIX(macro)
 
 
 /**
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 7567a1adf3c..5316c4c623c 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -46,7 +46,7 @@ endif()
 jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
 # override the default language mapping for the common files, set them to CUDA
 foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES
-                             CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_CG_INSTANTIATE1)
+                             CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_BICGSTAB_INSTANTIATE2 BATCH_CG_INSTANTIATE1 BATCH_CG_INSTANTIATE2)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
index 76528c84670..737f2a923b0 100644
--- a/cuda/solver/batch_bicgstab_launch.cuh
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -39,9 +39,13 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec,     \
                            const int num_rows)
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \
-    GKO_BATCH_INSTANTIATE(                                           \
-        GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                         \
+        GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK \
+    GKO_BATCH_INSTANTIATE(                                       \
+        GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_)
 
 
 template <typename StopType, typename PrecType, typename LogType,
@@ -55,9 +59,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
         log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \
-    GKO_BATCH_INSTANTIATE(                                               \
-        GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                             \
+        GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_,         \
+        __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY \
+    GKO_BATCH_INSTANTIATE(                                           \
+        GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_)
 
 
 }  // namespace batch_bicgstab
diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu
index 629b4f9c6ad..0b2e6e15cdf 100644
--- a/cuda/solver/batch_bicgstab_launch.instantiate.cu
+++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu
@@ -76,11 +76,9 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK);
+GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY);
+GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY;
 // end
 
 
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index dafaaf19a9f..e803e15fe80 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -39,9 +39,12 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
         mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(                   \
         std::shared_ptr<const DefaultExecutor> exec, const int num_rows)
 
-#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype)             \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \
-                          _vtype)
+#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                   \
+        GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK \
+    GKO_BATCH_INSTANTIATE(GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_)
 
 
 template <typename StopType, typename PrecType, typename LogType,
@@ -55,9 +58,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
         log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
 
-#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype)             \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \
-                          _vtype)
+
+#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                       \
+        GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY \
+    GKO_BATCH_INSTANTIATE(                                     \
+        GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_)
 
 
 }  // namespace batch_cg
diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu
index 70c5cecb6f5..087746e6146 100644
--- a/cuda/solver/batch_cg_launch.instantiate.cu
+++ b/cuda/solver/batch_cg_launch.instantiate.cu
@@ -75,11 +75,9 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK);
+GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY);
+GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY;
 // end
 
 
diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp
index e4b1917ee9b..06ba8531b42 100644
--- a/dpcpp/solver/batch_bicgstab_launch.hpp
+++ b/dpcpp/solver/batch_bicgstab_launch.hpp
@@ -53,30 +53,34 @@ void launch_apply_kernel(
         _vtype* const __restrict__ workspace_data, const int& block_size,    \
         const int& shared_size)
 
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 0)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 1)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 2)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 3)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 4)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 5)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 6)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 7)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 8)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 9)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 10)
-#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 16, 10)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(     \
+        GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 0)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 1)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 2)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 3)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 4)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 5)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 6)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 7)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 8)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 9)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 10)
+#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10_16 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 16, 10)
 
 
 }  // namespace batch_bicgstab
diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
index 94c5e7462ce..b45d6409575 100644
--- a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
@@ -79,29 +79,29 @@ void launch_apply_kernel(
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16);
+GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10_16;
 // end
 
 
diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp
index e756bf60c2e..3fe1e704963 100644
--- a/dpcpp/solver/batch_cg_launch.hpp
+++ b/dpcpp/solver/batch_cg_launch.hpp
@@ -50,22 +50,26 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
         _vtype* const __restrict__ workspace_data, const int& block_size,     \
         const int& shared_size)
 
-#define GKO_DECLARE_BATCH_CG_LAUNCH_0(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 0)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_1(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 1)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_2(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 2)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_3(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 3)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_4(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 4)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_5(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 5)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_6(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 6)
-#define GKO_DECLARE_BATCH_CG_LAUNCH_6_16(_vtype) \
-    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 16, 6)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...)                               \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \
+                                              __VA_ARGS__)
+
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 0)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 1)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 2)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 3)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 4)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 5)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 6)
+#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6_16 \
+    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 16, 6)
 
 
 }  // namespace batch_cg
diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
index a45150b0d6c..ba887c8aeb5 100644
--- a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
@@ -86,21 +86,21 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_0;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_1;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_2;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_3;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_4;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_5;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_6;
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6_16);
+GKO_INSTANTIATE_BATCH_CG_LAUNCH_6_16;
 // end
 
 

From 4b681c056b1c56ea961c68a040cabbc8d3142b3e Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 12 Nov 2024 15:17:36 +0000
Subject: [PATCH 274/448] [batch] fix windows build

---
 core/solver/batch_dispatch.hpp     | 15 +++++----------
 include/ginkgo/core/base/types.hpp | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 0580325d67b..5a37b12cf11 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -164,16 +164,11 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
-#define GKO_INDIRECT(...) __VA_ARGS__
-
-
-#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                               \
-    GKO_INDIRECT(                                                            \
-        macro(__VA_ARGS__,                                                   \
-              ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \
-    GKO_INDIRECT(                                                            \
-        macro(__VA_ARGS__,                                                   \
-              ::gko::batch::solver::device::batch_stop::SimpleRelResidual))
+#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                          \
+    macro(__VA_ARGS__,                                                  \
+          ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
+    macro(__VA_ARGS__,                                                  \
+          ::gko::batch::solver::device::batch_stop::SimpleRelResidual)
 
 #define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...)                   \
     GKO_BATCH_INSTANTIATE_STOP(                                            \
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index ceffec9b275..4b06b494707 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -442,6 +442,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
+// Identity macro (expands to its arguments) needed to make Windows builds work
+#define GKO_INDIRECT(...) __VA_ARGS__
+
+
 /**
  * Instantiates a template for each non-complex value type compiled by Ginkgo.
  *
@@ -454,13 +458,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  */
 #if GINKGO_DPCPP_SINGLE_MODE
 #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
-    template _macro(float, __VA_ARGS__);                                   \
+    template GKO_INDIRECT(_macro(float, __VA_ARGS__));                     \
     template <>                                                            \
-    _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+    GKO_INDIRECT(_macro(double, __VA_ARGS__))                              \
+    GKO_NOT_IMPLEMENTED
 #else
 #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
-    template _macro(float, __VA_ARGS__);                                   \
-    template _macro(double, __VA_ARGS__)
+    template GKO_INDIRECT(_macro(float, __VA_ARGS__));                     \
+    template GKO_INDIRECT(_macro(double, __VA_ARGS__))
 #endif
 
 
@@ -478,15 +483,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
                                                           __VA_ARGS__); \
-    template _macro(std::complex<float>, __VA_ARGS__);                  \
+    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));    \
     template <>                                                         \
-    _macro(std::complex<double>, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+    GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))             \
+    GKO_NOT_IMPLEMENTED
 #else
 #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
                                                           __VA_ARGS__); \
-    template _macro(std::complex<float>, __VA_ARGS__);                  \
-    template _macro(std::complex<double>, __VA_ARGS__)
+    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));    \
+    template GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))
 #endif
 
 

From ae3c0567280ab06193a7a0345235ca1b9831289f Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 19 Nov 2024 12:57:44 +0000
Subject: [PATCH 275/448] [fact] fix cuda 11.0 namespace issue

---
 .../factorization/factorization_helpers.hpp    |  6 +-----
 .../factorization/factorization_kernels.cpp    | 18 ++++++++++--------
 common/cuda_hip/preconditioner/sor_kernels.cpp | 14 +++++++++-----
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/common/cuda_hip/factorization/factorization_helpers.hpp b/common/cuda_hip/factorization/factorization_helpers.hpp
index 87248740867..95973146e08 100644
--- a/common/cuda_hip/factorization/factorization_helpers.hpp
+++ b/common/cuda_hip/factorization/factorization_helpers.hpp
@@ -16,9 +16,6 @@ namespace factorization {
 namespace helpers {
 
 
-using namespace ::gko::factorization;
-
-
 constexpr int default_block_size{512};
 
 
@@ -107,6 +104,5 @@ __global__ __launch_bounds__(default_block_size) void initialize_l(
 }  // namespace helpers
 }  // namespace factorization
 }  // namespace GKO_DEVICE_NAMESPACE
-
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index e790cf19540..36f2f6eb4c5 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -399,7 +399,12 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
+    using namespace gko::factorization;
+
     if (grid_dim > 0) {
+        auto l_closure = triangular_mtx_closure(
+            [] __device__(auto val) { return one(val); }, identity{});
+        auto u_closure = triangular_mtx_closure(identity{}, identity{});
         helpers::
             initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
                 num_rows, system_matrix->get_const_row_ptrs(),
@@ -408,12 +413,7 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
                 csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
                 as_device_type(csr_l->get_values()),
                 csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(),
-                as_device_type(csr_u->get_values()),
-                helpers::triangular_mtx_closure(
-                    [] __device__(auto val) { return one(val); },
-                    helpers::identity{}),
-                helpers::triangular_mtx_closure(helpers::identity{},
-                                                helpers::identity{}));
+                as_device_type(csr_u->get_values()), l_closure, u_closure);
     }
 }
 
@@ -460,13 +460,15 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
+        using namespace gko::factorization;
+
         helpers::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
             num_rows, system_matrix->get_const_row_ptrs(),
             system_matrix->get_const_col_idxs(),
             as_device_type(system_matrix->get_const_values()),
             csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
             as_device_type(csr_l->get_values()),
-            helpers::triangular_mtx_closure(
+            triangular_mtx_closure(
                 [diag_sqrt] __device__(auto val) {
                     if (diag_sqrt) {
                         val = sqrt(val);
@@ -476,7 +478,7 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
                     }
                     return val;
                 },
-                helpers::identity{}));
+                identity{}));
     }
 }
 
diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
index a415953915f..4805eca3ab3 100644
--- a/common/cuda_hip/preconditioner/sor_kernels.cpp
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -29,6 +29,8 @@ void initialize_weighted_l(
     auto inv_weight = one(weight) / weight;
 
     if (grid_dim > 0) {
+        using namespace gko::factorization;
+
         factorization::helpers::
             initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
                 num_rows, system_matrix->get_const_row_ptrs(),
@@ -36,11 +38,11 @@ void initialize_weighted_l(
                 as_device_type(system_matrix->get_const_values()),
                 l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(),
                 as_device_type(l_mtx->get_values()),
-                factorization::helpers::triangular_mtx_closure(
+                triangular_mtx_closure(
                     [inv_weight] __device__(auto val) {
                         return val * inv_weight;
                     },
-                    factorization::helpers::identity{}));
+                    identity{}));
     }
 }
 
@@ -65,6 +67,8 @@ void initialize_weighted_l_u(
         one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
 
     if (grid_dim > 0) {
+        using namespace gko::factorization;
+
         factorization::helpers::
             initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
                 num_rows, system_matrix->get_const_row_ptrs(),
@@ -74,12 +78,12 @@ void initialize_weighted_l_u(
                 as_device_type(l_mtx->get_values()),
                 u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
                 as_device_type(u_mtx->get_values()),
-                factorization::helpers::triangular_mtx_closure(
+                triangular_mtx_closure(
                     [inv_weight] __device__(auto val) {
                         return val * inv_weight;
                     },
-                    factorization::helpers::identity{}),
-                factorization::helpers::triangular_mtx_closure(
+                    identity{}),
+                triangular_mtx_closure(
                     [inv_two_minus_weight] __device__(auto val) {
                         return val * inv_two_minus_weight;
                     },

From bca9d54c4136e7f735c195afaa582d674eb08266 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 19 Nov 2024 13:16:01 +0000
Subject: [PATCH 276/448] [ci] use oldest cuda version in regular CI

---
 .gitlab-ci.yml    | 6 +++---
 .gitlab/image.yml | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 445f15d3c86..b0209e67dc5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -92,12 +92,12 @@ trigger_pipeline:
 
 
 # cuda 11.4 and friends
-build/cuda114/nompi/gcc/cuda/release/shared:
+build/cuda110/nompi/gcc/cuda/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko_cuda114-openmpi-gnu10-llvm12
+    - .use_gko_cuda110-mvapich-gnu9-llvm9
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -648,7 +648,7 @@ cudamemcheck:
     - .before_script_template
     - .default_variables
     - .deploy_condition
-  image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
+    - .use_gko_cuda110-mvapich-gnu9-llvm9
   tags:
     - private_ci
     - nvidia-gpu
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index c894d439723..fde85ed6df4 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -17,8 +17,8 @@
     - cpu
     - amdci
 
-.use_gko_cuda114-openmpi-gnu10-llvm12:
-  image: ginkgohub/cuda:114-openmpi-gnu10-llvm12
+.use_gko_cuda110-mvapich-gnu9-llvm9:
+  image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020
   tags:
     - private_ci
     - nvidia-gpu

From 7e87aca9062364384c0e4e19339c1545e1fe2ce1 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 19 Nov 2024 14:57:51 +0000
Subject: [PATCH 277/448] [ci] keep cuda 11.4 image for sonarcloud

---
 .gitlab/image.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index fde85ed6df4..56d229d5f8a 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -23,6 +23,12 @@
     - private_ci
     - nvidia-gpu
 
+.use_gko_cuda114-openmpi-gnu10-llvm12:
+  image: ginkgohub/cuda:114-openmpi-gnu10-llvm12
+  tags:
+    - private_ci
+    - nvidia-gpu
+
 .use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16:
   image: ginkgohub/nvhpc:233-cuda120-openmpi-gnu12-llvm16
   tags:

From 227519d713f756432dee3698a43345b8a3032d59 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 13 Nov 2024 22:29:53 +0100
Subject: [PATCH 278/448] [ci] add Windows ClangCL (MSVC toolset) GitHub workflow

---
 .github/workflows/msvc_clang.yml | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 .github/workflows/msvc_clang.yml

diff --git a/.github/workflows/msvc_clang.yml b/.github/workflows/msvc_clang.yml
new file mode 100644
index 00000000000..92b7f6fe518
--- /dev/null
+++ b/.github/workflows/msvc_clang.yml
@@ -0,0 +1,31 @@
+name: Windows-MSVC-CLANG
+
+on:
+  push:
+    branches:
+      - 'master'
+      - 'develop'
+      - 'release/**'
+    tags:
+      - '**'
+  pull_request:
+    types: [opened,synchronize]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  windows_ref:
+    name: msvc/clang
+    runs-on: [windows-latest]
+    steps:
+    - name: Checkout the latest code (shallow clone)
+      uses: actions/checkout@v3
+    - name: configure, build and test
+      run: |
+        mkdir build
+        cd build
+        cmake -T ClangCL -DBUILD_SHARED_LIBS=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
+        cmake --build . -j4 --config Release
+        ctest . -C Release --no-tests=error --output-on-failure  # fail if configure/build left no tests to run

From 6538fd9a58c49a143e3d6d7f3d1f32d5eec8d93d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 14 Nov 2024 00:00:36 +0100
Subject: [PATCH 279/448] make class/struct keywords consistent across declarations

---
 accessor/scaled_reduced_row_major.hpp | 2 +-
 benchmark/utils/iteration_control.hpp | 2 +-
 core/base/dense_cache.cpp             | 2 +-
 core/base/device_matrix_data.cpp      | 2 +-
 core/base/segmented_array.cpp         | 2 +-
 core/distributed/index_map.cpp        | 2 +-
 core/log/batch_logger.cpp             | 2 +-
 core/solver/direct.cpp                | 2 +-
 core/solver/multigrid.cpp             | 3 ++-
 9 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/accessor/scaled_reduced_row_major.hpp b/accessor/scaled_reduced_row_major.hpp
index 9d9f986b0fe..9369300ecca 100644
--- a/accessor/scaled_reduced_row_major.hpp
+++ b/accessor/scaled_reduced_row_major.hpp
@@ -156,7 +156,7 @@ class scaled_reduced_row_major
                   "Only Dimensionality <= 32 is currently supported");
 
     // Allow access to both `scalar_` and `compute_mask_scalar_index()`
-    friend class detail::enable_write_scalar<
+    friend struct detail::enable_write_scalar<
         dimensionality, scaled_reduced_row_major, scalar_type>;
     friend class range<scaled_reduced_row_major>;
 
diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp
index f70d0c88719..bddaef99efb 100644
--- a/benchmark/utils/iteration_control.hpp
+++ b/benchmark/utils/iteration_control.hpp
@@ -63,7 +63,7 @@
 class IterationControl {
     using IndexType = unsigned int;  //!< to be compatible with GFLAGS type
 
-    class run_control;
+    struct run_control;
 
 public:
     /**
diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp
index 6adbb6107c9..38a0decfa46 100644
--- a/core/base/dense_cache.cpp
+++ b/core/base/dense_cache.cpp
@@ -32,7 +32,7 @@ void DenseCache<ValueType>::init_from(
 }
 
 
-#define GKO_DECLARE_DENSE_CACHE(_type) class DenseCache<_type>
+#define GKO_DECLARE_DENSE_CACHE(_type) struct DenseCache<_type>
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE);
 
 
diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index adbd5af8e60..4c71fffe275 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -156,7 +156,7 @@ device_matrix_data<ValueType, IndexType>::empty_out()
 
 
 #define GKO_DECLARE_DEVICE_MATRIX_DATA(ValueType, IndexType) \
-    struct device_matrix_data<ValueType, IndexType>
+    class device_matrix_data<ValueType, IndexType>
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA);
 
 
diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp
index 4a88d42128f..d113139f8e2 100644
--- a/core/base/segmented_array.cpp
+++ b/core/base/segmented_array.cpp
@@ -178,7 +178,7 @@ segmented_array<T>& segmented_array<T>::operator=(segmented_array&& other)
 }
 
 
-#define GKO_DECLARE_SEGMENTED_ARRAY(_type) class segmented_array<_type>
+#define GKO_DECLARE_SEGMENTED_ARRAY(_type) struct segmented_array<_type>
 
 GKO_INSTANTIATE_FOR_EACH_POD_TYPE(GKO_DECLARE_SEGMENTED_ARRAY);
 
diff --git a/core/distributed/index_map.cpp b/core/distributed/index_map.cpp
index 9f0ed8137ba..01717546bc0 100644
--- a/core/distributed/index_map.cpp
+++ b/core/distributed/index_map.cpp
@@ -176,7 +176,7 @@ index_map<LocalIndexType, GlobalIndexType>::index_map(
 }
 
 
-#define GKO_DECLARE_INDEX_MAP(_ltype, _gtype) class index_map<_ltype, _gtype>
+#define GKO_DECLARE_INDEX_MAP(_ltype, _gtype) struct index_map<_ltype, _gtype>
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_INDEX_MAP);
 
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index 286803c0ae1..f274019016f 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -63,7 +63,7 @@ log_data<ValueType>::log_data(std::shared_ptr<const Executor> exec,
     }
 }
 
-#define GKO_DECLARE_LOG_DATA(_type) class log_data<_type>
+#define GKO_DECLARE_LOG_DATA(_type) struct log_data<_type>
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_LOG_DATA);
 
diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp
index c999fdea4fc..cf15bc4a9ae 100644
--- a/core/solver/direct.cpp
+++ b/core/solver/direct.cpp
@@ -280,7 +280,7 @@ std::vector<int> workspace_traits<gko::experimental::solver::Direct<
 
 
 #define GKO_DECLARE_DIRECT_TRAITS(ValueType, IndexType) \
-    class workspace_traits<                             \
+    struct workspace_traits<                            \
         gko::experimental::solver::Direct<ValueType, IndexType>>
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT_TRAITS);
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 35ad7b5d1fe..2ecd3dd74c4 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -200,7 +200,8 @@ namespace detail {
  *
  * @note it should only be used internally
  */
-struct MultigridState {
+class MultigridState {
+public:
     MultigridState() : nrhs{static_cast<size_type>(-1)} {}
 
     /**

From c0b3275e7ecc75bc5b00e5d1fc377ca32ea4e18d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 14 Nov 2024 11:27:37 +0100
Subject: [PATCH 280/448] fix order of constructor member initializers

---
 benchmark/spmv/spmv_common.hpp                |  4 +--
 core/base/perturbation.cpp                    |  4 +--
 core/matrix/dense.cpp                         |  4 +--
 core/matrix/ell.cpp                           | 10 +++---
 core/preconditioner/batch_jacobi.cpp          |  2 +-
 core/stop/residual_norm.cpp                   |  2 +-
 include/ginkgo/core/base/batch_dim.hpp        |  6 ++--
 include/ginkgo/core/base/executor.hpp         |  2 +-
 .../preconditioner/batch_block_jacobi.hpp     |  4 +--
 .../multigrid/fixed_coarsening_kernels.cpp    | 24 +++++++-------
 reference/test/reorder/mc64_kernels.cpp       | 28 ++++++++--------
 reference/test/reorder/rcm.cpp                |  2 +-
 reference/test/solver/bicg_kernels.cpp        | 24 +++++++-------
 reference/test/solver/cb_gmres_kernels.cpp    | 33 ++++++++++---------
 reference/test/solver/gcr_kernels.cpp         | 32 +++++++++---------
 15 files changed, 90 insertions(+), 91 deletions(-)

diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
index 4ac777479b2..f540eef1367 100644
--- a/benchmark/spmv/spmv_common.hpp
+++ b/benchmark/spmv/spmv_common.hpp
@@ -44,8 +44,8 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
                   bool do_print = true)
         : name{"spmv"},
           formats{std::move(formats)},
-          generator{generator},
-          do_print{do_print}
+          do_print{do_print},
+          generator{generator}
     {}
 
     const std::string& get_name() const override { return name; }
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index 686c54e5b2d..87501361c05 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -89,9 +89,9 @@ Perturbation<ValueType>::Perturbation(std::shared_ptr<const LinOp> scalar,
                                       std::shared_ptr<const LinOp> projector)
     : EnableLinOp<Perturbation>(basis->get_executor(),
                                 gko::dim<2>{basis->get_size()[0]}),
-      scalar_{std::move(scalar)},
       basis_{std::move(basis)},
-      projector_{std::move(projector)}
+      projector_{std::move(projector)},
+      scalar_{std::move(scalar)}
 {
     this->validate_perturbation();
 }
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 171ff007b4a..a9c551857ef 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -2019,8 +2019,8 @@ Dense<ValueType>::Dense(std::shared_ptr<const Executor> exec,
                         const dim<2>& size, array<value_type> values,
                         size_type stride)
     : EnableLinOp<Dense>(exec, size),
-      values_{exec, std::move(values)},
-      stride_{stride}
+      stride_{stride},
+      values_{exec, std::move(values)}
 {
     if (size[0] > 0 && size[1] > 0) {
         GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1,
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 87b74c7f417..600c2ceb9d2 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -375,10 +375,10 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
                                size_type num_stored_elements_per_row,
                                size_type stride)
     : EnableLinOp<Ell>(exec, size),
+      num_stored_elements_per_row_(num_stored_elements_per_row),
       stride_(stride == 0 ? size[0] : stride),
       values_(exec, stride_ * num_stored_elements_per_row),
-      col_idxs_(exec, stride_ * num_stored_elements_per_row),
-      num_stored_elements_per_row_(num_stored_elements_per_row)
+      col_idxs_(exec, stride_ * num_stored_elements_per_row)
 {}
 
 
@@ -389,10 +389,10 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
                                size_type num_stored_elements_per_row,
                                size_type stride)
     : EnableLinOp<Ell>(exec, size),
-      values_{exec, std::move(values)},
-      col_idxs_{exec, std::move(col_idxs)},
       num_stored_elements_per_row_{num_stored_elements_per_row},
-      stride_{stride}
+      stride_{stride},
+      values_{exec, std::move(values)},
+      col_idxs_{exec, std::move(col_idxs)}
 {
     GKO_ASSERT_EQ(num_stored_elements_per_row_ * stride_, values_.get_size());
     GKO_ASSERT_EQ(num_stored_elements_per_row_ * stride_, col_idxs_.get_size());
diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp
index f92ccd18cfc..e4382de38ec 100644
--- a/core/preconditioner/batch_jacobi.cpp
+++ b/core/preconditioner/batch_jacobi.cpp
@@ -44,8 +44,8 @@ size_type Jacobi<ValueType, IndexType>::compute_storage_space(
 template <typename ValueType, typename IndexType>
 Jacobi<ValueType, IndexType>::Jacobi(std::shared_ptr<const Executor> exec)
     : EnableBatchLinOp<Jacobi>(exec),
-      num_blocks_{0},
       block_pointers_(exec),
+      num_blocks_{0},
       blocks_(exec),
       map_block_to_row_(exec),
       blocks_cumulative_offsets_(exec)
diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index 4e73cc8d56a..c962784033a 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -92,8 +92,8 @@ ResidualNormBase<ValueType>::ResidualNormBase(
     std::shared_ptr<const gko::Executor> exec, const CriterionArgs& args,
     remove_complex<ValueType> reduction_factor, mode baseline)
     : EnablePolymorphicObject<ResidualNormBase, Criterion>(exec),
-      device_storage_{exec, 2},
       reduction_factor_{reduction_factor},
+      device_storage_{exec, 2},
       baseline_{baseline},
       system_matrix_{args.system_matrix},
       b_{args.b},
diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp
index e34c9a4c2c4..e502bb2704e 100644
--- a/include/ginkgo/core/base/batch_dim.hpp
+++ b/include/ginkgo/core/base/batch_dim.hpp
@@ -82,8 +82,8 @@ struct batch_dim {
      * The default constructor
      */
     batch_dim()
-        : common_size_(dim<dimensionality, dimension_type>{}),
-          num_batch_items_(0)
+        : num_batch_items_(0),
+          common_size_(dim<dimensionality, dimension_type>{})
     {}
 
     /**
@@ -97,7 +97,7 @@ struct batch_dim {
      */
     explicit batch_dim(const size_type num_batch_items,
                        const dim<dimensionality, dimension_type>& common_size)
-        : common_size_(common_size), num_batch_items_(num_batch_items)
+        : num_batch_items_(num_batch_items), common_size_(common_size)
     {}
 
 private:
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 963e30bfddd..63793157278 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1715,7 +1715,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     CudaExecutor(int device_id, std::shared_ptr<Executor> master,
                  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
-        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
+        : master_(master), alloc_{std::move(alloc)}, stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
         this->get_exec_info().num_computing_units = 0;
diff --git a/reference/preconditioner/batch_block_jacobi.hpp b/reference/preconditioner/batch_block_jacobi.hpp
index 0ca4807cd3a..ed9766117e4 100644
--- a/reference/preconditioner/batch_block_jacobi.hpp
+++ b/reference/preconditioner/batch_block_jacobi.hpp
@@ -43,8 +43,8 @@ class BlockJacobi final {
         : num_blocks_{num_blocks},
           blocks_cumulative_offsets_{blocks_cumulative_offsets},
           blocks_arr_batch_{blocks_arr_batch},
-          block_ptrs_arr_{block_ptrs_arr},
-          blocks_arr_entry_{}
+          blocks_arr_entry_{},
+          block_ptrs_arr_{block_ptrs_arr}
     {}
 
     /**
diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp
index b79b1b578dd..582950b4e17 100644
--- a/reference/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp
@@ -43,18 +43,19 @@ class FixedCoarsening : public ::testing::Test {
     using real_type = gko::remove_complex<value_type>;
     FixedCoarsening()
         : exec(gko::ReferenceExecutor::create()),
+          mtx(Mtx::create(exec, gko::dim<2>(5, 5), 15,
+                          std::make_shared<typename Mtx::classical>())),
+          coarse(Mtx::create(exec, gko::dim<2>(3, 3), 5,
+                             std::make_shared<typename Mtx::classical>())),
           coarse_rows(exec, {0, 2, 3}),
-          fixed_coarsening_factory(MgLevel::build()
-                                       .with_coarse_rows(coarse_rows)
-                                       .with_skip_sorting(true)
-                                       .on(exec)),
+          gen_coarse_rows(exec, 5),
+          coarse_b(gko::initialize<Vec>(
+              {I<VT>({2.0, -1.0}), I<VT>({3.0, 1.0}), I<VT>({0.0, -1.0})},
+              exec)),
           fine_b(gko::initialize<Vec>(
               {I<VT>({2.0, -1.0}), I<VT>({-1.0, 2.0}), I<VT>({0.0, -1.0}),
                I<VT>({3.0, -2.0}), I<VT>({-2.0, 1.0})},
               exec)),
-          coarse_b(gko::initialize<Vec>(
-              {I<VT>({2.0, -1.0}), I<VT>({3.0, 1.0}), I<VT>({0.0, -1.0})},
-              exec)),
           restrict_ans(gko::initialize<Vec>(
               {I<VT>({2.0, -1.0}), I<VT>({0.0, -1.0}), I<VT>({3.0, -2.0})},
               exec)),
@@ -66,11 +67,10 @@ class FixedCoarsening : public ::testing::Test {
               {I<VT>({-2.0, -1.0}), I<VT>({1.0, -1.0}), I<VT>({-1.0, -1.0}),
                I<VT>({0.0, 0.0}), I<VT>({0.0, 2.0})},
               exec)),
-          mtx(Mtx::create(exec, gko::dim<2>(5, 5), 15,
-                          std::make_shared<typename Mtx::classical>())),
-          coarse(Mtx::create(exec, gko::dim<2>(3, 3), 5,
-                             std::make_shared<typename Mtx::classical>())),
-          gen_coarse_rows(exec, 5)
+          fixed_coarsening_factory(MgLevel::build()
+                                       .with_coarse_rows(coarse_rows)
+                                       .with_skip_sorting(true)
+                                       .on(exec))
     {
         this->create_mtx(mtx.get(), &gen_coarse_rows, coarse.get());
         mg_level = fixed_coarsening_factory->generate(mtx);
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
index 15f90839e1b..fb20d4af2c8 100644
--- a/reference/test/reorder/mc64_kernels.cpp
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -46,13 +46,6 @@ class Mc64 : public ::testing::Test {
     Mc64()
         : ref(gko::ReferenceExecutor::create()),
           tmp{ref},
-          mtx(gko::initialize<matrix_type>({{1., 2., 0., 0., 3., 0.},
-                                            {5., 1., 0., 0., 0., 0.},
-                                            {0., 0., 0., 6., 0., 4.},
-                                            {0., 0., 4., 0., 0., 3.},
-                                            {0., 0., 0., 4., 2., 0.},
-                                            {0., 5., 8., 0., 0., 0.}},
-                                           ref)),
           weights{ref, 13},
           dual_u{ref, 6},
           distance{ref, 6},
@@ -96,28 +89,35 @@ class Mc64 : public ::testing::Test {
                                 static_cast<real_type>(std::log2(8.))}},
           initialized_distance{
               ref, I<real_type>{inf(), inf(), inf(), inf(), inf(), inf()}},
+          final_weights{ref, I<real_type>{2., 1., 0., 0., 4., 0., 2., 0., 1.,
+                                          0., 2., 3., 0.}},
+          final_dual_u{ref, I<real_type>{0., 1., -1., -2., 0., 0.}},
+          final_distance{ref, I<real_type>{inf(), inf(), 1., 0., inf(), 1.}},
           empty_permutation{ref, I<index_type>{-1, -1, -1, -1, -1, -1}},
           empty_inverse_permutation{ref, I<index_type>{-1, -1, -1, -1, -1, -1}},
           empty_matched_idxs{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
           empty_unmatched_rows{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          initial_matching_permutation{ref, I<index_type>{1, 0, 3, 5, -1, 2}},
+          initial_matching_inverse_permutation{
+              ref, I<index_type>{1, 0, 5, 2, -1, 3}},
           initial_parents{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
           initial_generation{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
           initial_marked_cols{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
           initial_matched_idxs{ref, I<index_type>{1, 3, 5, 8, 0, 12}},
           initial_unmatched_rows{ref, I<index_type>{4, -1, 0, 0, 0, 0}},
-          initial_matching_permutation{ref, I<index_type>{1, 0, 3, 5, -1, 2}},
-          initial_matching_inverse_permutation{
-              ref, I<index_type>{1, 0, 5, 2, -1, 3}},
           final_permutation{ref, I<index_type>{1, 0, 3, 5, 4, 2}},
           final_inverse_permutation{ref, I<index_type>{1, 0, 5, 2, 4, 3}},
           final_parents{ref, I<index_type>{0, 0, 3, 4, 4, 2}},
           final_generation{ref, I<index_type>{0, 0, -4, -4, 0, -4}},
           final_marked_cols{ref, I<index_type>{3, 5, 2, 0, 0, 0}},
           final_matched_idxs{ref, I<index_type>{1, 3, 5, 8, 10, 12}},
-          final_weights{ref, I<real_type>{2., 1., 0., 0., 4., 0., 2., 0., 1.,
-                                          0., 2., 3., 0.}},
-          final_dual_u{ref, I<real_type>{0., 1., -1., -2., 0., 0.}},
-          final_distance{ref, I<real_type>{inf(), inf(), 1., 0., inf(), 1.}},
+          mtx(gko::initialize<matrix_type>({{1., 2., 0., 0., 3., 0.},
+                                            {5., 1., 0., 0., 0., 0.},
+                                            {0., 0., 0., 6., 0., 4.},
+                                            {0., 0., 4., 0., 0., 3.},
+                                            {0., 0., 0., 4., 2., 0.},
+                                            {0., 5., 8., 0., 0., 0.}},
+                                           ref)),
           zero_tol{1e-14}
     {}
 
diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp
index f8a18e5b6ec..ec547c141e3 100644
--- a/reference/test/reorder/rcm.cpp
+++ b/reference/test/reorder/rcm.cpp
@@ -33,7 +33,6 @@ class Rcm : public ::testing::Test {
     using CsrMtx = gko::matrix::Csr<v_type, i_type>;
     Rcm()
         : exec(gko::ReferenceExecutor::create()),
-          rcm_factory(reorder_type::build().on(exec)),
           // clang-format off
           id3_mtx(gko::initialize<CsrMtx>(
               {{1.0, 0.0, 0.0},
@@ -43,6 +42,7 @@ class Rcm : public ::testing::Test {
               {{1.0, 0.0, 1.0},
               {0.0, 1.0, 0.0},
               {1.0, 0.0, 1.0}}, exec)),
+              rcm_factory(reorder_type::build().on(exec)),
           // clang-format on
           reorder_op(rcm_factory->generate(id3_mtx))
     {}
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index 837920ec520..fd24c52bcc8 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -31,6 +31,16 @@ class Bicg : public ::testing::Test {
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
+          mtx_big(gko::initialize<Mtx>(
+              {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
+               {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
+               {4150.0, 1805.0, 6472.5, 2656.0, 2409.5, 3836.5},
+               {-3139.5, 73.0, 2656.0, 6048.0, 665.0, -132.0},
+               {3829.5, 1966.0, 2409.5, 665.0, 4240.5, 4373.5},
+               {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}},
+              exec)),
+          mtx_non_symmetric(gko::initialize<Mtx>(
+              {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
           stopped{},
           non_stopped{},
           bicg_factory(Solver::build()
@@ -41,14 +51,6 @@ class Bicg : public ::testing::Test {
                                gko::stop::ResidualNorm<value_type>::build()
                                    .with_reduction_factor(r<value_type>::value))
                            .on(exec)),
-          mtx_big(gko::initialize<Mtx>(
-              {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
-               {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
-               {4150.0, 1805.0, 6472.5, 2656.0, 2409.5, 3836.5},
-               {-3139.5, 73.0, 2656.0, 6048.0, 665.0, -132.0},
-               {3829.5, 1966.0, 2409.5, 665.0, 4240.5, 4373.5},
-               {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}},
-              exec)),
           bicg_factory_big(
               Solver::build()
                   .with_criteria(
@@ -62,11 +64,7 @@ class Bicg : public ::testing::Test {
                       gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
                           .with_reduction_factor(r<value_type>::value))
-                  .on(exec)),
-          mtx_non_symmetric(gko::initialize<Mtx>(
-              {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec))
-
-
+                  .on(exec))
     {
         auto small_size = gko::dim<2>{2, 2};
         auto small_scalar_size = gko::dim<2>{1, small_size[1]};
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index a027c02705b..ae3ecce9963 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -39,6 +39,21 @@ class CbGmres : public ::testing::Test {
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
           mtx2(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {4.0, 2.0, 1.0}, {0.0, 1.0, 2.0}}, exec)),
+          mtx_medium(
+              gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
+                                    {7.70, -77.00, 3.30, -149.20, 74.80},
+                                    {-121.40, 37.10, 55.30, -74.20, -19.20},
+                                    {-111.40, -22.60, 110.10, -106.20, 88.90},
+                                    {-0.70, 111.70, 154.40, 235.00, -76.50}},
+                                   exec)),
+          mtx_big(gko::initialize<Mtx>(
+              {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
+               {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8},
+               {138.3, 78.2, 485.5, -899.9, 392.9, 1408.9},
+               {-1907.1, 2106.6, 1026.0, 634.7, 194.6, -534.1},
+               {-365.0, -715.8, 870.7, 67.5, 279.8, 1927.8},
+               {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}},
+              exec)),
           storage_prec{storage_helper_type::value},
           cb_gmres_factory(
               gmres_type::build()
@@ -51,14 +66,6 @@ class CbGmres : public ::testing::Test {
                           .with_baseline(gko::stop::mode::initial_resnorm)
                           .with_reduction_factor(this->reduction_factor()))
                   .on(exec)),
-          mtx_big(gko::initialize<Mtx>(
-              {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
-               {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8},
-               {138.3, 78.2, 485.5, -899.9, 392.9, 1408.9},
-               {-1907.1, 2106.6, 1026.0, 634.7, 194.6, -534.1},
-               {-365.0, -715.8, 870.7, 67.5, 279.8, 1927.8},
-               {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}},
-              exec)),
           cb_gmres_factory_big(
               gmres_type::build()
                   .with_storage_precision(storage_prec)
@@ -67,14 +74,8 @@ class CbGmres : public ::testing::Test {
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
                           .with_reduction_factor(this->reduction_factor()))
-                  .on(exec)),
-          mtx_medium(
-              gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
-                                    {7.70, -77.00, 3.30, -149.20, 74.80},
-                                    {-121.40, 37.10, 55.30, -74.20, -19.20},
-                                    {-111.40, -22.60, 110.10, -106.20, 88.90},
-                                    {-0.70, 111.70, 154.40, 235.00, -76.50}},
-                                   exec))
+                  .on(exec))
+
     {}
 
     constexpr nc_value_type reduction_factor() const noexcept
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index a81c3ce4285..7ca885cfab8 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -40,6 +40,21 @@ class Gcr : public ::testing::Test {
           non_stopped{},
           mtx(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
+          mtx_medium(
+              gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
+                                    {7.70, -77.00, 3.30, -149.20, 74.80},
+                                    {-121.40, 37.10, 55.30, -74.20, -19.20},
+                                    {-111.40, -22.60, 110.10, -106.20, 88.90},
+                                    {-0.70, 111.70, 154.40, 235.00, -76.50}},
+                                   exec)),
+          mtx_big(gko::initialize<Mtx>(
+              {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
+               {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8},
+               {138.3, 78.2, 485.5, -899.9, 392.9, 1408.9},
+               {-1907.1, 2106.6, 1026.0, 634.7, 194.6, -534.1},
+               {-365.0, -715.8, 870.7, 67.5, 279.8, 1927.8},
+               {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}},
+              exec)),
           gcr_factory(Solver::build()
                           .with_criteria(
                               gko::stop::Iteration::build().with_max_iters(4u),
@@ -49,14 +64,6 @@ class Gcr : public ::testing::Test {
                                   .with_reduction_factor(r<value_type>::value))
                           .with_krylov_dim(3u)
                           .on(exec)),
-          mtx_big(gko::initialize<Mtx>(
-              {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
-               {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8},
-               {138.3, 78.2, 485.5, -899.9, 392.9, 1408.9},
-               {-1907.1, 2106.6, 1026.0, 634.7, 194.6, -534.1},
-               {-365.0, -715.8, 870.7, 67.5, 279.8, 1927.8},
-               {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}},
-              exec)),
           gcr_factory_big(
               Solver::build()
                   .with_criteria(
@@ -70,14 +77,7 @@ class Gcr : public ::testing::Test {
                       gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
                           .with_reduction_factor(r<value_type>::value))
-                  .on(exec)),
-          mtx_medium(
-              gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
-                                    {7.70, -77.00, 3.30, -149.20, 74.80},
-                                    {-121.40, 37.10, 55.30, -74.20, -19.20},
-                                    {-111.40, -22.60, 110.10, -106.20, 88.90},
-                                    {-0.70, 111.70, 154.40, 235.00, -76.50}},
-                                   exec))
+                  .on(exec))
     {
         auto small_size = gko::dim<2>{3, 2};
         small_b = gko::initialize<Mtx>(

From a714cbc94f83c9a6a98bf0a893582df16bd4f1c2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 14 Nov 2024 13:00:47 +0100
Subject: [PATCH 281/448] friend class is not in the nearest enclosing namespace

---
 include/ginkgo/core/base/executor.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 63793157278..224860b72b7 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1270,8 +1270,13 @@ namespace detail {
 
 template <typename ConcreteExecutor>
 class ExecutorBase : public Executor {
-    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
-    friend class ReferenceExecutor;
+    // friend class is not in the nearest enclosing namesace, so we write the
+    // full name
+    friend class ::gko::OmpExecutor;
+    friend class ::gko::HipExecutor;
+    friend class ::gko::DpcppExecutor;
+    friend class ::gko::CudaExecutor;
+    friend class ::gko::ReferenceExecutor;
 
 public:
     void run(const Operation& op) const override

From 64e081e949321ad8b34238abd54e1260e335cf5c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 14 Nov 2024 18:03:28 +0100
Subject: [PATCH 282/448] fix function definitions that differ from their
 declarations in instantiation

---
 .../base/batch_multi_vector_kernels.cpp       | 14 +++----
 .../cuda_hip/matrix/csr_kernels.template.cpp  | 12 +++---
 .../matrix/fbcsr_kernels.template.cpp         | 38 +++++++++---------
 common/cuda_hip/reorder/rcm_kernels.cpp       |  8 ++--
 .../unified/matrix/dense_kernels.template.cpp |  6 +--
 core/matrix/csr.cpp                           |  4 +-
 core/matrix/dense.cpp                         |  3 +-
 core/matrix/fbcsr.cpp                         | 27 +++++--------
 cuda/preconditioner/batch_jacobi_kernels.cu   | 22 +++++-----
 cuda/solver/batch_bicgstab_kernels.cu         |  7 ++--
 cuda/solver/batch_cg_kernels.cu               |  7 ++--
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  | 26 ++++++------
 dpcpp/matrix/csr_kernels.dp.cpp               | 12 +++---
 dpcpp/matrix/fbcsr_kernels.dp.cpp             |  6 +--
 .../batch_jacobi_kernels.dp.cpp               | 22 +++++-----
 dpcpp/reorder/rcm_kernels.dp.cpp              |  4 +-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |  7 ++--
 dpcpp/solver/batch_cg_kernels.dp.cpp          |  7 ++--
 .../batch_jacobi_kernels.hip.cpp              | 22 +++++-----
 hip/solver/batch_bicgstab_kernels.hip.cpp     |  7 ++--
 hip/solver/batch_cg_kernels.hip.cpp           |  7 ++--
 omp/base/batch_multi_vector_kernels.cpp       | 26 ++++++------
 omp/components/prefix_sum_kernels.cpp         |  3 +-
 omp/matrix/csr_kernels.cpp                    | 12 +++---
 omp/matrix/fbcsr_kernels.cpp                  | 40 +++++++++----------
 omp/preconditioner/batch_jacobi_kernels.cpp   | 23 +++++------
 omp/reorder/rcm_kernels.cpp                   |  5 +--
 omp/solver/batch_bicgstab_kernels.cpp         |  7 ++--
 omp/solver/batch_cg_kernels.cpp               |  7 ++--
 reference/matrix/csr_kernels.cpp              | 12 +++---
 reference/matrix/dense_kernels.cpp            |  6 +--
 reference/matrix/fbcsr_kernels.cpp            | 40 +++++++++----------
 .../preconditioner/batch_jacobi_kernels.cpp   | 23 +++++------
 reference/reorder/rcm_kernels.cpp             |  6 +--
 reference/solver/batch_bicgstab_kernels.cpp   |  7 ++--
 reference/solver/batch_cg_kernels.cpp         |  7 ++--
 36 files changed, 228 insertions(+), 264 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 17f65487464..8154dc440df 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -30,8 +30,8 @@ constexpr auto default_block_size = 256;
 
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
-           const batch::MultiVector<ValueType>* const alpha,
-           batch::MultiVector<ValueType>* const x)
+           const batch::MultiVector<ValueType>* alpha,
+           batch::MultiVector<ValueType>* x)
 {
     const auto num_blocks = x->get_num_batch_items();
     const auto alpha_ub = get_batch_struct(alpha);
@@ -61,9 +61,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
-                const batch::MultiVector<ValueType>* const alpha,
-                const batch::MultiVector<ValueType>* const x,
-                batch::MultiVector<ValueType>* const y)
+                const batch::MultiVector<ValueType>* alpha,
+                const batch::MultiVector<ValueType>* x,
+                batch::MultiVector<ValueType>* y)
 {
     const auto num_blocks = x->get_num_batch_items();
     const size_type nrhs = x->get_common_size()[1];
@@ -127,8 +127,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
-                   const batch::MultiVector<ValueType>* const x,
-                   batch::MultiVector<remove_complex<ValueType>>* const result)
+                   const batch::MultiVector<ValueType>* x,
+                   batch::MultiVector<remove_complex<ValueType>>* result)
 {
     const auto num_blocks = x->get_num_batch_items();
     const auto num_rhs = x->get_common_size()[1];
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index 757e689a777..909349ed7ab 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -1823,9 +1823,9 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+void check_diagonal_entries_exist(std::shared_ptr<const DefaultExecutor> exec,
+                                  const matrix::Csr<ValueType, IndexType>* mtx,
+                                  bool& has_all_diags)
 {
     const auto num_diag = static_cast<IndexType>(
         std::min(mtx->get_size()[0], mtx->get_size()[1]));
@@ -1846,9 +1846,9 @@ void check_diagonal_entries_exist(
 
 template <typename ValueType, typename IndexType>
 void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
+                         const matrix::Dense<ValueType>* alpha,
+                         const matrix::Dense<ValueType>* beta,
+                         matrix::Csr<ValueType, IndexType>* mtx)
 {
     const auto nrows = mtx->get_size()[0];
     if (nrows == 0) {
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
index 960708378e1..23f865b6ace 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
@@ -294,8 +294,8 @@ __global__ void __launch_bounds__(default_block_size)
 
 template <typename ValueType, typename IndexType>
 void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Fbcsr<ValueType, IndexType>* const input,
-                        matrix::Fbcsr<ValueType, IndexType>* const output)
+                        const matrix::Fbcsr<ValueType, IndexType>* input,
+                        matrix::Fbcsr<ValueType, IndexType>* output)
 {
     const auto in_num_row_blocks = input->get_num_block_rows();
     const auto out_num_row_blocks = output->get_num_block_rows();
@@ -353,8 +353,8 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* const source,
-                    matrix::Csr<ValueType, IndexType>* const result)
+                    const matrix::Fbcsr<ValueType, IndexType>* source,
+                    matrix::Csr<ValueType, IndexType>* result)
 {
     constexpr auto warps_per_block = default_block_size / config::warp_size;
     const auto num_blocks =
@@ -373,8 +373,7 @@ void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Fbcsr<ValueType, IndexType>* const to_check,
-    bool* const is_sorted)
+    const matrix::Fbcsr<ValueType, IndexType>* to_check, bool* is_sorted)
 {
     *is_sorted = true;
     auto gpu_array = array<bool>(exec, 1);
@@ -396,7 +395,7 @@ void is_sorted_by_column_index(
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::Fbcsr<ValueType, IndexType>* const to_sort)
+                          matrix::Fbcsr<ValueType, IndexType>* to_sort)
     GKO_NOT_IMPLEMENTED;
 
 
@@ -412,8 +411,8 @@ namespace {
 template <typename ValueType>
 void dense_transpose(std::shared_ptr<const DefaultExecutor> exec,
                      const size_type nrows, const size_type ncols,
-                     const size_type orig_stride, const ValueType* const orig,
-                     const size_type trans_stride, ValueType* const trans)
+                     const size_type orig_stride, const ValueType* orig,
+                     const size_type trans_stride, ValueType* trans)
 {
     if (nrows == 0) {
         return;
@@ -439,9 +438,8 @@ void dense_transpose(std::shared_ptr<const DefaultExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void spmv(std::shared_ptr<const DefaultExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
+          const matrix::Fbcsr<ValueType, IndexType>* a,
+          const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
@@ -494,11 +492,11 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
+                   const matrix::Dense<ValueType>* alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* a,
+                   const matrix::Dense<ValueType>* b,
+                   const matrix::Dense<ValueType>* beta,
+                   matrix::Dense<ValueType>* c)
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
@@ -556,7 +554,7 @@ namespace {
 template <int mat_blk_sz, typename ValueType, typename IndexType>
 void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
                            std::shared_ptr<const DefaultExecutor> exec,
-                           matrix::Fbcsr<ValueType, IndexType>* const mat)
+                           matrix::Fbcsr<ValueType, IndexType>* mat)
 {
     constexpr int subwarp_size = config::warp_size;
     const auto nbnz = mat->get_num_stored_blocks();
@@ -579,8 +577,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
 
 template <typename ValueType, typename IndexType>
 void transpose(const std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
+               const matrix::Fbcsr<ValueType, IndexType>* orig,
+               matrix::Fbcsr<ValueType, IndexType>* trans)
 {
 #ifdef GKO_COMPILING_CUDA
     if (sparselib::is_supported<ValueType, IndexType>::value) {
diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp
index 72729db30f1..4315a9ed702 100644
--- a/common/cuda_hip/reorder/rcm_kernels.cpp
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -614,11 +614,9 @@ void sort_levels(std::shared_ptr<const DefaultExecutor> exec,
 
 template <typename IndexType>
 void compute_permutation(std::shared_ptr<const DefaultExecutor> exec,
-                         const IndexType num_rows,
-                         const IndexType* const row_ptrs,
-                         const IndexType* const col_idxs,
-                         IndexType* const permutation,
-                         IndexType* const inv_permutation,
+                         const IndexType num_rows, const IndexType* row_ptrs,
+                         const IndexType* col_idxs, IndexType* permutation,
+                         IndexType* inv_permutation,
                          const gko::reorder::starting_strategy strategy)
 {
     if (num_rows == 0) {
diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp
index f469bd997aa..16630233578 100644
--- a/common/unified/matrix/dense_kernels.template.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -730,9 +730,9 @@ void get_imag(std::shared_ptr<const DefaultExecutor> exec,
 
 template <typename ValueType, typename ScalarType>
 void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Dense<ScalarType>* const alpha,
-                         const matrix::Dense<ScalarType>* const beta,
-                         matrix::Dense<ValueType>* const mtx)
+                         const matrix::Dense<ScalarType>* alpha,
+                         const matrix::Dense<ScalarType>* beta,
+                         matrix::Dense<ValueType>* mtx)
 {
     run_kernel(
         exec,
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index e50732a3be9..897eb1a48db 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -1029,8 +1029,8 @@ void Csr<ValueType, IndexType>::inv_scale_impl(const LinOp* alpha)
 
 
 template <typename ValueType, typename IndexType>
-void Csr<ValueType, IndexType>::add_scaled_identity_impl(const LinOp* const a,
-                                                         const LinOp* const b)
+void Csr<ValueType, IndexType>::add_scaled_identity_impl(const LinOp* a,
+                                                         const LinOp* b)
 {
     bool has_diags{false};
     this->get_executor()->run(
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index a9c551857ef..367b0232969 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -1906,8 +1906,7 @@ void Dense<ValueType>::get_imag(ptr_param<real_type> result) const
 
 
 template <typename ValueType>
-void Dense<ValueType>::add_scaled_identity_impl(const LinOp* const a,
-                                                const LinOp* const b)
+void Dense<ValueType>::add_scaled_identity_impl(const LinOp* a, const LinOp* b)
 {
     precision_dispatch_real_complex<ValueType>(
         [this](auto dense_alpha, auto dense_beta, auto dense_x) {
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index a48e32be088..8ed9b117280 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -103,8 +103,7 @@ Fbcsr<ValueType, IndexType>::Fbcsr(Fbcsr&& other) : Fbcsr{other.get_executor()}
 
 
 template <typename ValueType, typename IndexType>
-void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* const b,
-                                             LinOp* const x) const
+void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
 {
     if (auto b_fbcsr = dynamic_cast<const Fbcsr<ValueType, IndexType>*>(b)) {
         // if b is a FBCSR matrix, we need an SpGeMM
@@ -122,10 +121,8 @@ void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* const b,
 
 
 template <typename ValueType, typename IndexType>
-void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* const alpha,
-                                             const LinOp* const b,
-                                             const LinOp* const beta,
-                                             LinOp* const x) const
+void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
+                                             const LinOp* beta, LinOp* x) const
 {
     if (auto b_fbcsr = dynamic_cast<const Fbcsr<ValueType, IndexType>*>(b)) {
         // if b is a FBCSR matrix, we need an SpGeMM
@@ -148,7 +145,7 @@ void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* const alpha,
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    Fbcsr<next_precision<ValueType>, IndexType>* const result) const
+    Fbcsr<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -161,15 +158,14 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::move_to(
-    Fbcsr<next_precision<ValueType>, IndexType>* const result)
+    Fbcsr<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
 template <typename ValueType, typename IndexType>
-void Fbcsr<ValueType, IndexType>::convert_to(
-    Dense<ValueType>* const result) const
+void Fbcsr<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
     auto exec = this->get_executor();
     auto tmp_result = make_temporary_output_clone(exec, result);
@@ -180,7 +176,7 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 
 template <typename ValueType, typename IndexType>
-void Fbcsr<ValueType, IndexType>::move_to(Dense<ValueType>* const result)
+void Fbcsr<ValueType, IndexType>::move_to(Dense<ValueType>* result)
 {
     this->convert_to(result);
 }
@@ -188,7 +184,7 @@ void Fbcsr<ValueType, IndexType>::move_to(Dense<ValueType>* const result)
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    Csr<ValueType, IndexType>* const result) const
+    Csr<ValueType, IndexType>* result) const
 {
     auto exec = this->get_executor();
     {
@@ -204,8 +200,7 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 
 template <typename ValueType, typename IndexType>
-void Fbcsr<ValueType, IndexType>::move_to(
-    Csr<ValueType, IndexType>* const result)
+void Fbcsr<ValueType, IndexType>::move_to(Csr<ValueType, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -213,7 +208,7 @@ void Fbcsr<ValueType, IndexType>::move_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    SparsityCsr<ValueType, IndexType>* const result) const
+    SparsityCsr<ValueType, IndexType>* result) const
 {
     result->set_size(
         gko::dim<2>{static_cast<size_type>(this->get_num_block_rows()),
@@ -227,7 +222,7 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::move_to(
-    SparsityCsr<ValueType, IndexType>* const result)
+    SparsityCsr<ValueType, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 8768937dc6d..2ac5717308a 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -45,8 +45,7 @@ using batch_jacobi_cuda_compiled_max_block_sizes =
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
-    const IndexType* const block_pointers,
-    IndexType* const blocks_cumulative_offsets)
+    const IndexType* block_pointers, IndexType* blocks_cumulative_offsets)
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
@@ -66,8 +65,8 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename IndexType>
 void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                         const size_type num_blocks,
-                        const IndexType* const block_pointers,
-                        IndexType* const map_block_to_row)
+                        const IndexType* block_pointers,
+                        IndexType* map_block_to_row)
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
@@ -83,10 +82,10 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void extract_common_blocks_pattern(
     std::shared_ptr<const DefaultExecutor> exec,
-    const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const map_block_to_row, IndexType* const blocks_pattern)
+    const gko::matrix::Csr<ValueType, IndexType>* first_sys_csr,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType* map_block_to_row,
+    IndexType* blocks_pattern)
 {
     const auto nrows = first_sys_csr->get_size()[0];
     dim3 block(default_block_size);
@@ -143,11 +142,10 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_block_jacobi_helper,
 template <typename ValueType, typename IndexType>
 void compute_block_jacobi(
     std::shared_ptr<const DefaultExecutor> exec,
-    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
+    const batch::matrix::Csr<ValueType, IndexType>* sys_csr,
     const uint32 max_block_size, const size_type num_blocks,
-    const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const blocks_pattern, ValueType* const blocks)
+    const IndexType* cumulative_block_storage, const IndexType* block_pointers,
+    const IndexType* blocks_pattern, ValueType* blocks)
 {
     select_compute_block_jacobi_helper(
         batch_jacobi_cuda_compiled_max_block_sizes(),
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index bd07259f771..e8052637763 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -141,10 +141,9 @@ private:
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 126a62006cf..e45e1baf03b 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -122,10 +122,9 @@ private:
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 0d2662bdccd..1d38a165956 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -35,8 +35,8 @@ namespace batch_multi_vector {
 
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
-           const batch::MultiVector<ValueType>* const alpha,
-           batch::MultiVector<ValueType>* const x)
+           const batch::MultiVector<ValueType>* alpha,
+           batch::MultiVector<ValueType>* x)
 {
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
@@ -108,9 +108,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
-                const batch::MultiVector<ValueType>* const alpha,
-                const batch::MultiVector<ValueType>* const x,
-                batch::MultiVector<ValueType>* const y)
+                const batch::MultiVector<ValueType>* alpha,
+                const batch::MultiVector<ValueType>* x,
+                batch::MultiVector<ValueType>* y)
 {
     constexpr int max_subgroup_size = config::warp_size;
     const int num_rows = x->get_common_size()[0];
@@ -167,9 +167,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
-                 const batch::MultiVector<ValueType>* const x,
-                 const batch::MultiVector<ValueType>* const y,
-                 batch::MultiVector<ValueType>* const result)
+                 const batch::MultiVector<ValueType>* x,
+                 const batch::MultiVector<ValueType>* y,
+                 batch::MultiVector<ValueType>* result)
 {
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
@@ -236,9 +236,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
-                      const batch::MultiVector<ValueType>* const x,
-                      const batch::MultiVector<ValueType>* const y,
-                      batch::MultiVector<ValueType>* const result)
+                      const batch::MultiVector<ValueType>* x,
+                      const batch::MultiVector<ValueType>* y,
+                      batch::MultiVector<ValueType>* result)
 {
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
@@ -281,8 +281,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
-                   const batch::MultiVector<ValueType>* const x,
-                   batch::MultiVector<remove_complex<ValueType>>* const result)
+                   const batch::MultiVector<ValueType>* x,
+                   batch::MultiVector<remove_complex<ValueType>>* result)
 {
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 44e9c5e16e6..4dce0aa6ac2 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -2652,9 +2652,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const DpcppExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+void check_diagonal_entries_exist(std::shared_ptr<const DpcppExecutor> exec,
+                                  const matrix::Csr<ValueType, IndexType>* mtx,
+                                  bool& has_all_diags)
 {
     const auto num_diag = static_cast<IndexType>(
         std::min(mtx->get_size()[0], mtx->get_size()[1]));
@@ -2678,9 +2678,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
+                         const matrix::Dense<ValueType>* alpha,
+                         const matrix::Dense<ValueType>* beta,
+                         matrix::Csr<ValueType, IndexType>* mtx)
 {
     const auto nrows = mtx->get_size()[0];
     if (nrows == 0) {
diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp
index bf858be51e3..e9eb02f5fb2 100644
--- a/dpcpp/matrix/fbcsr_kernels.dp.cpp
+++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp
@@ -69,8 +69,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const DpcppExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* const source,
-                    matrix::Csr<ValueType, IndexType>* const result)
+                    const matrix::Fbcsr<ValueType, IndexType>* source,
+                    matrix::Csr<ValueType, IndexType>* result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -108,7 +108,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const DpcppExecutor> exec,
-                          matrix::Fbcsr<ValueType, IndexType>* const to_sort)
+                          matrix::Fbcsr<ValueType, IndexType>* to_sort)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
index d85f93e74f2..7721359716c 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
@@ -37,8 +37,7 @@ using batch_jacobi_dpcpp_compiled_max_block_sizes =
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
-    const IndexType* const block_pointers,
-    IndexType* const blocks_cumulative_offsets)
+    const IndexType* block_pointers, IndexType* blocks_cumulative_offsets)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(num_blocks, [=](auto id) {
@@ -57,8 +56,8 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename IndexType>
 void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                         const size_type num_blocks,
-                        const IndexType* const block_pointers,
-                        IndexType* const map_block_to_row)
+                        const IndexType* block_pointers,
+                        IndexType* map_block_to_row)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(num_blocks, [=](auto id) {
@@ -75,10 +74,10 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void extract_common_blocks_pattern(
     std::shared_ptr<const DefaultExecutor> exec,
-    const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers, const IndexType* const map_block_row,
-    IndexType* const blocks_pattern)
+    const gko::matrix::Csr<ValueType, IndexType>* first_sys_csr,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType* map_block_row,
+    IndexType* blocks_pattern)
 {
     const auto nrows = first_sys_csr->get_size()[0];
     constexpr int subgroup_size = config::warp_size;
@@ -160,11 +159,10 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_block_jacobi_helper,
 template <typename ValueType, typename IndexType>
 void compute_block_jacobi(
     std::shared_ptr<const DefaultExecutor> exec,
-    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
+    const batch::matrix::Csr<ValueType, IndexType>* sys_csr,
     const uint32 user_given_max_block_size, const size_type num_blocks,
-    const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const blocks_pattern, ValueType* const blocks)
+    const IndexType* cumulative_block_storage, const IndexType* block_pointers,
+    const IndexType* blocks_pattern, ValueType* blocks)
 {
     select_compute_block_jacobi_helper(
         batch_jacobi_dpcpp_compiled_max_block_sizes(),
diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp
index 2985f1a0dc7..b1cd6fc1319 100644
--- a/dpcpp/reorder/rcm_kernels.dp.cpp
+++ b/dpcpp/reorder/rcm_kernels.dp.cpp
@@ -27,8 +27,8 @@ namespace rcm {
 template <typename IndexType>
 void compute_permutation(
     std::shared_ptr<const DpcppExecutor> exec, const IndexType num_vertices,
-    const IndexType* const row_ptrs, const IndexType* const col_idxs,
-    IndexType* const permutation, IndexType* const inv_permutation,
+    const IndexType* row_ptrs, const IndexType* col_idxs,
+    IndexType* permutation, IndexType* inv_permutation,
     const gko::reorder::starting_strategy strategy) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL);
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 2aa98c26ed1..c02ca02e1d8 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -171,10 +171,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precond,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 43807583754..d94019125b1 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -148,10 +148,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precond,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index 2380bc6a0bd..fdd57a95127 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -47,8 +47,7 @@ using batch_jacobi_hip_compiled_max_block_sizes =
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
-    const IndexType* const block_pointers,
-    IndexType* const blocks_cumulative_offsets)
+    const IndexType* block_pointers, IndexType* blocks_cumulative_offsets)
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
@@ -68,8 +67,8 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename IndexType>
 void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                         const size_type num_blocks,
-                        const IndexType* const block_pointers,
-                        IndexType* const map_block_to_row)
+                        const IndexType* block_pointers,
+                        IndexType* map_block_to_row)
 {
     dim3 block(default_block_size);
     dim3 grid(ceildiv(num_blocks, default_block_size));
@@ -85,10 +84,10 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void extract_common_blocks_pattern(
     std::shared_ptr<const DefaultExecutor> exec,
-    const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const map_block_to_row, IndexType* const blocks_pattern)
+    const gko::matrix::Csr<ValueType, IndexType>* first_sys_csr,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType* map_block_to_row,
+    IndexType* blocks_pattern)
 {
     const auto nrows = first_sys_csr->get_size()[0];
     dim3 block(default_block_size);
@@ -146,11 +145,10 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_block_jacobi_helper,
 template <typename ValueType, typename IndexType>
 void compute_block_jacobi(
     std::shared_ptr<const DefaultExecutor> exec,
-    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
+    const batch::matrix::Csr<ValueType, IndexType>* sys_csr,
     const uint32 max_block_size, const size_type num_blocks,
-    const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const blocks_pattern, ValueType* const blocks)
+    const IndexType* cumulative_block_storage, const IndexType* block_pointers,
+    const IndexType* blocks_pattern, ValueType* blocks)
 {
     select_compute_block_jacobi_helper(
         batch_jacobi_hip_compiled_max_block_sizes(),
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index f3e770c609d..3e019fd3ad1 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -165,10 +165,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 457dfcdefcf..7f6f7ffe1db 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -147,10 +147,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index f740e3c32f0..5b57921ab8f 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -24,8 +24,8 @@ namespace batch_multi_vector {
 
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
-           const batch::MultiVector<ValueType>* const alpha,
-           batch::MultiVector<ValueType>* const x)
+           const batch::MultiVector<ValueType>* alpha,
+           batch::MultiVector<ValueType>* x)
 {
     const auto x_ub = host::get_batch_struct(x);
     const auto alpha_ub = host::get_batch_struct(alpha);
@@ -43,9 +43,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
-                const batch::MultiVector<ValueType>* const alpha,
-                const batch::MultiVector<ValueType>* const x,
-                batch::MultiVector<ValueType>* const y)
+                const batch::MultiVector<ValueType>* alpha,
+                const batch::MultiVector<ValueType>* x,
+                batch::MultiVector<ValueType>* y)
 {
     const auto x_ub = host::get_batch_struct(x);
     const auto y_ub = host::get_batch_struct(y);
@@ -65,9 +65,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
-                 const batch::MultiVector<ValueType>* const x,
-                 const batch::MultiVector<ValueType>* const y,
-                 batch::MultiVector<ValueType>* const result)
+                 const batch::MultiVector<ValueType>* x,
+                 const batch::MultiVector<ValueType>* y,
+                 batch::MultiVector<ValueType>* result)
 {
     const auto x_ub = host::get_batch_struct(x);
     const auto y_ub = host::get_batch_struct(y);
@@ -87,9 +87,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
-                      const batch::MultiVector<ValueType>* const x,
-                      const batch::MultiVector<ValueType>* const y,
-                      batch::MultiVector<ValueType>* const result)
+                      const batch::MultiVector<ValueType>* x,
+                      const batch::MultiVector<ValueType>* y,
+                      batch::MultiVector<ValueType>* result)
 {
     const auto x_ub = host::get_batch_struct(x);
     const auto y_ub = host::get_batch_struct(y);
@@ -109,8 +109,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
-                   const batch::MultiVector<ValueType>* const x,
-                   batch::MultiVector<remove_complex<ValueType>>* const result)
+                   const batch::MultiVector<ValueType>* x,
+                   batch::MultiVector<remove_complex<ValueType>>* result)
 {
     const auto x_ub = host::get_batch_struct(x);
     const auto res_ub = host::get_batch_struct(result);
diff --git a/omp/components/prefix_sum_kernels.cpp b/omp/components/prefix_sum_kernels.cpp
index 08d184b7616..edccdb0ca47 100644
--- a/omp/components/prefix_sum_kernels.cpp
+++ b/omp/components/prefix_sum_kernels.cpp
@@ -23,8 +23,7 @@ namespace components {
  */
 template <typename IndexType>
 void prefix_sum_nonnegative(std::shared_ptr<const OmpExecutor> exec,
-                            IndexType* const counts,
-                            const size_type num_entries)
+                            IndexType* counts, const size_type num_entries)
 {
     // the operation only makes sense for arrays of size at least 2
     if (num_entries < 2) {
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 8e47caef520..87b328b1093 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -1218,9 +1218,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const OmpExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+void check_diagonal_entries_exist(std::shared_ptr<const OmpExecutor> exec,
+                                  const matrix::Csr<ValueType, IndexType>* mtx,
+                                  bool& has_all_diags)
 {
     bool l_has_all_diags = true;
     const size_type minsize = std::min(mtx->get_size()[0], mtx->get_size()[1]);
@@ -1247,9 +1247,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void add_scaled_identity(std::shared_ptr<const OmpExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
+                         const matrix::Dense<ValueType>* alpha,
+                         const matrix::Dense<ValueType>* beta,
+                         matrix::Csr<ValueType, IndexType>* mtx)
 {
     const auto nrows = static_cast<IndexType>(mtx->get_size()[0]);
     const auto row_ptrs = mtx->get_const_row_ptrs();
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index a6342034a56..d17d47a7467 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -39,9 +39,8 @@ namespace fbcsr {
 
 template <typename ValueType, typename IndexType>
 void spmv(std::shared_ptr<const OmpExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
+          const matrix::Fbcsr<ValueType, IndexType>* a,
+          const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
 {
     const int bs = a->get_block_size();
     const auto nvecs = static_cast<IndexType>(b->get_size()[1]);
@@ -80,11 +79,11 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
+                   const matrix::Dense<ValueType>* alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* a,
+                   const matrix::Dense<ValueType>* b,
+                   const matrix::Dense<ValueType>* beta,
+                   matrix::Dense<ValueType>* c)
 {
     const int bs = a->get_block_size();
     const auto nvecs = static_cast<IndexType>(b->get_size()[1]);
@@ -183,8 +182,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void fill_in_dense(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Fbcsr<ValueType, IndexType>* const source,
-                   matrix::Dense<ValueType>* const result)
+                   const matrix::Fbcsr<ValueType, IndexType>* source,
+                   matrix::Dense<ValueType>* result)
 {
     const auto bs = source->get_block_size();
     const auto nbrows = source->get_num_block_rows();
@@ -216,8 +215,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const OmpExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* const source,
-                    matrix::Csr<ValueType, IndexType>* const result)
+                    const matrix::Fbcsr<ValueType, IndexType>* source,
+                    matrix::Csr<ValueType, IndexType>* result)
 {
     const auto nbrows = source->get_num_block_rows();
     const auto bs = source->get_block_size();
@@ -324,8 +323,8 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
+               const matrix::Fbcsr<ValueType, IndexType>* orig,
+               matrix::Fbcsr<ValueType, IndexType>* trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -337,8 +336,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* const orig,
-                    matrix::Fbcsr<ValueType, IndexType>* const trans)
+                    const matrix::Fbcsr<ValueType, IndexType>* orig,
+                    matrix::Fbcsr<ValueType, IndexType>* trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -351,8 +350,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const OmpExecutor> exec,
-    const matrix::Fbcsr<ValueType, IndexType>* const to_check,
-    bool* const is_sorted)
+    const matrix::Fbcsr<ValueType, IndexType>* to_check, bool* is_sorted)
 {
     const auto row_ptrs = to_check->get_const_row_ptrs();
     const auto col_idxs = to_check->get_const_col_idxs();
@@ -419,7 +417,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx,
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const OmpExecutor> exec,
-                          matrix::Fbcsr<ValueType, IndexType>* const to_sort)
+                          matrix::Fbcsr<ValueType, IndexType>* to_sort)
 {
     const int bs = to_sort->get_block_size();
     select_sort_col_idx(
@@ -434,8 +432,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void extract_diagonal(std::shared_ptr<const OmpExecutor> exec,
-                      const matrix::Fbcsr<ValueType, IndexType>* const orig,
-                      matrix::Diagonal<ValueType>* const diag)
+                      const matrix::Fbcsr<ValueType, IndexType>* orig,
+                      matrix::Diagonal<ValueType>* diag)
 {
     const auto row_ptrs = orig->get_const_row_ptrs();
     const auto col_idxs = orig->get_const_col_idxs();
diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp
index 90c8f0c1865..58fb2602075 100644
--- a/omp/preconditioner/batch_jacobi_kernels.cpp
+++ b/omp/preconditioner/batch_jacobi_kernels.cpp
@@ -23,8 +23,7 @@ namespace batch_jacobi {
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
-    const IndexType* const block_pointers,
-    IndexType* const blocks_cumulative_offsets)
+    const IndexType* block_pointers, IndexType* blocks_cumulative_offsets)
 {
 #pragma omp parallel for
     for (int i = 0; i < num_blocks; i++) {
@@ -43,8 +42,8 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename IndexType>
 void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                         const size_type num_blocks,
-                        const IndexType* const block_pointers,
-                        IndexType* const map_block_to_row)
+                        const IndexType* block_pointers,
+                        IndexType* map_block_to_row)
 {
 #pragma omp parallel for
     for (size_type block_idx = 0; block_idx < num_blocks; block_idx++) {
@@ -62,10 +61,10 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void extract_common_blocks_pattern(
     std::shared_ptr<const DefaultExecutor> exec,
-    const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers, const IndexType* const,
-    IndexType* const blocks_pattern)
+    const gko::matrix::Csr<ValueType, IndexType>* first_sys_csr,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType*,
+    IndexType* blocks_pattern)
 {
 #pragma omp parallel for
     for (size_type k = 0; k < num_blocks; k++) {
@@ -82,10 +81,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void compute_block_jacobi(
     std::shared_ptr<const DefaultExecutor> exec,
-    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr, const uint32,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const blocks_pattern, ValueType* const blocks)
+    const batch::matrix::Csr<ValueType, IndexType>* sys_csr, const uint32,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType* blocks_pattern,
+    ValueType* blocks)
 {
     const auto nbatch = sys_csr->get_num_batch_items();
     const auto A_batch = host::get_batch_struct(sys_csr);
diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp
index dd4eb020695..fbe5c8d42c4 100644
--- a/omp/reorder/rcm_kernels.cpp
+++ b/omp/reorder/rcm_kernels.cpp
@@ -742,9 +742,8 @@ IndexType handle_isolated_nodes(std::shared_ptr<const OmpExecutor> exec,
 template <typename IndexType>
 void compute_permutation(std::shared_ptr<const OmpExecutor> exec,
                          const IndexType num_vertices,
-                         const IndexType* const row_ptrs,
-                         const IndexType* const col_idxs, IndexType* const perm,
-                         IndexType* const inv_perm,
+                         const IndexType* row_ptrs, const IndexType* col_idxs,
+                         IndexType* perm, IndexType* inv_perm,
                          const gko::reorder::starting_strategy strategy)
 {
     // compute node degrees
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index ed880507116..5e069806f60 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -80,10 +80,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precond,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 89d4441db64..0664c0244b6 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -86,10 +86,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precond,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index be97da442a1..a0607110b79 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -1218,9 +1218,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const ReferenceExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+void check_diagonal_entries_exist(std::shared_ptr<const ReferenceExecutor> exec,
+                                  const matrix::Csr<ValueType, IndexType>* mtx,
+                                  bool& has_all_diags)
 {
     has_all_diags = true;
     const auto row_ptrs = mtx->get_const_row_ptrs();
@@ -1246,9 +1246,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
+                         const matrix::Dense<ValueType>* alpha,
+                         const matrix::Dense<ValueType>* beta,
+                         matrix::Csr<ValueType, IndexType>* mtx)
 {
     const auto nrows = static_cast<IndexType>(mtx->get_size()[0]);
     const auto row_ptrs = mtx->get_const_row_ptrs();
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 40c3c40a3ae..921a49998b7 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -1242,9 +1242,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
 
 template <typename ValueType, typename ScalarType>
 void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
-                         const matrix::Dense<ScalarType>* const alpha,
-                         const matrix::Dense<ScalarType>* const beta,
-                         matrix::Dense<ValueType>* const mtx)
+                         const matrix::Dense<ScalarType>* alpha,
+                         const matrix::Dense<ScalarType>* beta,
+                         matrix::Dense<ValueType>* mtx)
 {
     const auto dim = mtx->get_size();
     for (size_type row = 0; row < dim[0]; row++) {
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index cdedc36ddc0..4c170a973a7 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -40,9 +40,8 @@ namespace fbcsr {
 
 template <typename ValueType, typename IndexType>
 void spmv(const std::shared_ptr<const ReferenceExecutor>,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
+          const matrix::Fbcsr<ValueType, IndexType>* a,
+          const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
 {
     const int bs = a->get_block_size();
     const auto nvecs = static_cast<IndexType>(b->get_size()[1]);
@@ -80,11 +79,11 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
+                   const matrix::Dense<ValueType>* alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* a,
+                   const matrix::Dense<ValueType>* b,
+                   const matrix::Dense<ValueType>* beta,
+                   matrix::Dense<ValueType>* c)
 {
     const int bs = a->get_block_size();
     const auto nvecs = static_cast<IndexType>(b->get_size()[1]);
@@ -183,8 +182,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void fill_in_dense(const std::shared_ptr<const ReferenceExecutor>,
-                   const matrix::Fbcsr<ValueType, IndexType>* const source,
-                   matrix::Dense<ValueType>* const result)
+                   const matrix::Fbcsr<ValueType, IndexType>* source,
+                   matrix::Dense<ValueType>* result)
 {
     const int bs = source->get_block_size();
     const IndexType nbrows = source->get_num_block_rows();
@@ -219,8 +218,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const ReferenceExecutor>,
-                    const matrix::Fbcsr<ValueType, IndexType>* const source,
-                    matrix::Csr<ValueType, IndexType>* const result)
+                    const matrix::Fbcsr<ValueType, IndexType>* source,
+                    matrix::Csr<ValueType, IndexType>* result)
 {
     const int bs = source->get_block_size();
     const IndexType nbrows = source->get_num_block_rows();
@@ -347,8 +346,8 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
+               const matrix::Fbcsr<ValueType, IndexType>* orig,
+               matrix::Fbcsr<ValueType, IndexType>* trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -360,8 +359,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* const orig,
-                    matrix::Fbcsr<ValueType, IndexType>* const trans)
+                    const matrix::Fbcsr<ValueType, IndexType>* orig,
+                    matrix::Fbcsr<ValueType, IndexType>* trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -374,8 +373,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const ReferenceExecutor>,
-    const matrix::Fbcsr<ValueType, IndexType>* const to_check,
-    bool* const is_sorted)
+    const matrix::Fbcsr<ValueType, IndexType>* to_check, bool* is_sorted)
 {
     const auto row_ptrs = to_check->get_const_row_ptrs();
     const auto col_idxs = to_check->get_const_col_idxs();
@@ -441,7 +439,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx,
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const ReferenceExecutor> exec,
-                          matrix::Fbcsr<ValueType, IndexType>* const to_sort)
+                          matrix::Fbcsr<ValueType, IndexType>* to_sort)
 {
     const int bs = to_sort->get_block_size();
     select_sort_col_idx(
@@ -456,8 +454,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void extract_diagonal(std::shared_ptr<const ReferenceExecutor>,
-                      const matrix::Fbcsr<ValueType, IndexType>* const orig,
-                      matrix::Diagonal<ValueType>* const diag)
+                      const matrix::Fbcsr<ValueType, IndexType>* orig,
+                      matrix::Diagonal<ValueType>* diag)
 {
     const auto row_ptrs = orig->get_const_row_ptrs();
     const auto col_idxs = orig->get_const_col_idxs();
diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp
index a012e019b41..f994c8c448b 100644
--- a/reference/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/preconditioner/batch_jacobi_kernels.cpp
@@ -23,8 +23,7 @@ namespace batch_jacobi {
 template <typename IndexType>
 void compute_cumulative_block_storage(
     std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
-    const IndexType* const block_pointers,
-    IndexType* const blocks_cumulative_offsets)
+    const IndexType* block_pointers, IndexType* blocks_cumulative_offsets)
 {
     blocks_cumulative_offsets[0] = 0;
     for (int i = 0; i < num_blocks; i++) {
@@ -41,8 +40,8 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename IndexType>
 void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                         const size_type num_blocks,
-                        const IndexType* const block_pointers,
-                        IndexType* const map_block_to_row)
+                        const IndexType* block_pointers,
+                        IndexType* map_block_to_row)
 {
     for (size_type block_idx = 0; block_idx < num_blocks; block_idx++) {
         for (IndexType i = block_pointers[block_idx];
@@ -59,10 +58,10 @@ GKO_INSTANTIATE_FOR_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void extract_common_blocks_pattern(
     std::shared_ptr<const DefaultExecutor> exec,
-    const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers, const IndexType* const,
-    IndexType* const blocks_pattern)
+    const gko::matrix::Csr<ValueType, IndexType>* first_sys_csr,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType*,
+    IndexType* blocks_pattern)
 {
     for (size_type k = 0; k < num_blocks; k++) {
         batch_single_kernels::extract_block_pattern_impl(
@@ -78,10 +77,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
 template <typename ValueType, typename IndexType>
 void compute_block_jacobi(
     std::shared_ptr<const DefaultExecutor> exec,
-    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr, const uint32,
-    const size_type num_blocks, const IndexType* const cumulative_block_storage,
-    const IndexType* const block_pointers,
-    const IndexType* const blocks_pattern, ValueType* const blocks)
+    const batch::matrix::Csr<ValueType, IndexType>* sys_csr, const uint32,
+    const size_type num_blocks, const IndexType* cumulative_block_storage,
+    const IndexType* block_pointers, const IndexType* blocks_pattern,
+    ValueType* blocks)
 {
     const auto nbatch = sys_csr->get_num_batch_items();
     const auto A_batch = host::get_batch_struct(sys_csr);
diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp
index 5cbce5dc5e3..6564952e5b0 100644
--- a/reference/reorder/rcm_kernels.cpp
+++ b/reference/reorder/rcm_kernels.cpp
@@ -179,10 +179,8 @@ IndexType find_starting_node(std::shared_ptr<const ReferenceExecutor> exec,
 template <typename IndexType>
 void compute_permutation(std::shared_ptr<const ReferenceExecutor> exec,
                          const IndexType num_vertices,
-                         const IndexType* const row_ptrs,
-                         const IndexType* const col_idxs,
-                         IndexType* const permutation,
-                         IndexType* const inv_permutation,
+                         const IndexType* row_ptrs, const IndexType* col_idxs,
+                         IndexType* permutation, IndexType* inv_permutation,
                          const gko::reorder::starting_strategy strategy)
 {
     // compute node degrees
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 20883e24434..5bc75c5ebdb 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -76,10 +76,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& log_data)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index f2155f98719..ba54329c31a 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -76,10 +76,9 @@ class kernel_caller {
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector<ValueType>* const b,
-           batch::MultiVector<ValueType>* const x,
+           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const batch::MultiVector<ValueType>* b,
+           batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& log_data)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(

From 575a72ed7dcb9316faa6db7bdc7d709ccba51484 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 23:06:08 +0100
Subject: [PATCH 283/448] alias in template will lead to an issue

---
 core/test/matrix/csr_builder.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp
index a06437bed12..24cbe4718c5 100644
--- a/core/test/matrix/csr_builder.cpp
+++ b/core/test/matrix/csr_builder.cpp
@@ -59,6 +59,11 @@ TYPED_TEST(CsrBuilder, UpdatesSrowOnDestruction)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
     struct mock_strategy : public Mtx::strategy_type {
+#if defined(_MSC_VER) && defined(__clang__)
+        // Only clang-cl on Windows needs this workaround. Details:
+        // https://github.com/llvm/llvm-project/issues/64996
+        using Mtx = Mtx;
+#endif
         virtual void process(const gko::array<index_type>&,
                              gko::array<index_type>*) override
         {

From aecd77f8ff8f857f5e3bf974c211eac9cf87ffa5 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 00:45:13 +0100
Subject: [PATCH 284/448] Revert "try clang_cl"

---
 .github/workflows/msvc_clang.yml | 31 -------------------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 .github/workflows/msvc_clang.yml

diff --git a/.github/workflows/msvc_clang.yml b/.github/workflows/msvc_clang.yml
deleted file mode 100644
index 92b7f6fe518..00000000000
--- a/.github/workflows/msvc_clang.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: Windows-MSVC-CLANG
-
-on:
-  push:
-    branches:
-      - 'master'
-      - 'develop'
-      - 'release/**'
-    tags:
-      - '**'
-  pull_request:
-    types: [opened,synchronize]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  windows_ref:
-    name: msvc/clang
-    runs-on: [windows-latest]
-    steps:
-    - name: Checkout the latest code (shallow clone)
-      uses: actions/checkout@v3
-    - name: configure
-      run: |
-        mkdir build
-        cd build
-        cmake -T ClangCL -DBUILD_SHARED_LIBS=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
-        cmake --build . -j4 --config Release
-        ctest . -C Release --output-on-failure

From 646ee09e5fc82352ed0c05f83cb3916af6141f1d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 14:15:28 +0100
Subject: [PATCH 285/448] casting in core only

With clang 18 (at least), using dynamic_cast makes the compiler reference convert_to/move_to/... as undefined symbols. This is not an issue on Linux when building shared libraries, nor on macOS with `-undefined dynamic_lookup`. However, clang in MSYS2 (Windows) does not have `--allow-shlib-undefined`, which should be enabled by default when building shared libraries.
---
 core/base/batch_instantiation.hpp           | 48 +++++++++++
 core/device_hooks/common_kernels.inc.cpp    | 16 +++-
 core/solver/batch_bicgstab.cpp              | 26 +++++-
 core/solver/batch_bicgstab_kernels.hpp      | 16 ++--
 core/solver/batch_cg.cpp                    | 27 ++++++-
 core/solver/batch_cg_kernels.hpp            | 23 +++---
 core/solver/batch_dispatch.hpp              | 88 ++++++++++-----------
 cuda/solver/batch_bicgstab_kernels.cu       | 10 ++-
 cuda/solver/batch_cg_kernels.cu             | 10 ++-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp  |  9 ++-
 dpcpp/solver/batch_cg_kernels.dp.cpp        |  9 ++-
 hip/solver/batch_bicgstab_kernels.hip.cpp   | 10 ++-
 hip/solver/batch_cg_kernels.hip.cpp         | 10 ++-
 include/ginkgo/core/base/types.hpp          | 37 +++++++++
 omp/solver/batch_bicgstab_kernels.cpp       |  8 +-
 omp/solver/batch_cg_kernels.cpp             |  8 +-
 reference/solver/batch_bicgstab_kernels.cpp | 24 +++---
 reference/solver/batch_cg_kernels.cpp       | 25 +++---
 18 files changed, 279 insertions(+), 125 deletions(-)
 create mode 100644 core/base/batch_instantiation.hpp

diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
new file mode 100644
index 00000000000..a686e9838a7
--- /dev/null
+++ b/core/base/batch_instantiation.hpp
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_BASE_BATCH_INSTANTIATION_HPP_
+#define GKO_PUBLIC_CORE_BASE_BATCH_INSTANTIATION_HPP_
+
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
+
+namespace gko {
+namespace batch {
+
+/**
+ * Instantiates a template for each valid combination of value type, batch
+ * matrix type, and batch preconditioner type. The batch matrix type and the
+ * preconditioner type are always instantiated with that same value type.
+ *
+ * @param _macro  A macro which expands the template instantiation
+ *                (not including the leading `template` specifier).
+ *                Should take three arguments, where the first is replaced by
+ *                the value type, the second by the matrix, and the third by the
+ *                preconditioner.
+ *
+ * @note the second and third arguments only accept the base types.
+ */
+#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_macro)         \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, gko::batch::matrix::Csr, \
+                                             gko::batch::matrix::Identity);   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, gko::batch::matrix::Ell, \
+                                             gko::batch::matrix::Identity);   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
+        _macro, gko::batch::matrix::Dense, gko::batch::matrix::Identity);     \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
+        _macro, gko::batch::matrix::Csr, gko::batch::preconditioner::Jacobi); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
+        _macro, gko::batch::matrix::Ell, gko::batch::preconditioner::Jacobi); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
+        _macro, gko::batch::matrix::Dense, gko::batch::preconditioner::Jacobi)
+
+}  // namespace batch
+}  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_BASE_BATCH_INSTANTIATION_HPP_
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 26de8531741..290b5afd907 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -5,6 +5,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/index_set_kernels.hpp"
@@ -168,6 +169,13 @@
     _macro(ValueType, ValueTypeKrylovBases) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(_macro)
 
+#define GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(_declare, _wrapper)         \
+    template <typename ValueType, typename BatchMatrixType, typename PrecType> \
+    _declare(ValueType, BatchMatrixType, PrecType)                             \
+        GKO_NOT_COMPILED(GKO_HOOK_MODULE);                                     \
+    GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_wrapper)
+
+
 namespace gko {
 namespace kernels {
 namespace GKO_HOOK_MODULE {
@@ -421,7 +429,9 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 namespace batch_bicgstab {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL,
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
@@ -430,7 +440,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
 namespace batch_cg {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL,
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index c22c712b411..66b0bf9f704 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -7,8 +7,14 @@
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
 #include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 
@@ -45,14 +51,26 @@ void Bicgstab<ValueType>::solver_apply(
     const MultiVector<ValueType>* b, MultiVector<ValueType>* x,
     log::detail::log_data<remove_complex<ValueType>>* log_data) const
 {
-    using MVec = MultiVector<ValueType>;
     const kernels::batch_bicgstab::settings<remove_complex<ValueType>> settings{
         this->max_iterations_, static_cast<real_type>(this->residual_tol_),
         parameters_.tolerance_type};
     auto exec = this->get_executor();
-    exec->run(bicgstab::make_apply(settings, this->system_matrix_.get(),
-                                   this->preconditioner_.get(), b, x,
-                                   *log_data));
+
+    run<matrix::Dense<ValueType>, matrix::Csr<ValueType>,
+        matrix::Ell<ValueType>>(this->system_matrix_.get(), [&](auto matrix) {
+        if (this->preconditioner_ == nullptr) {
+            auto identity =
+                matrix::Identity<ValueType>::create(exec, matrix->get_size());
+            exec->run(bicgstab::make_apply(settings, matrix, identity.get(), b,
+                                           x, *log_data));
+        } else {
+            run<matrix::Identity<ValueType>, preconditioner::Jacobi<ValueType>>(
+                this->preconditioner_.get(), [&](auto preconditioner) {
+                    exec->run(bicgstab::make_apply(
+                        settings, matrix, preconditioner, b, x, *log_data));
+                });
+        }
+    });
 }
 
 
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 615ed472597..2485e7e454e 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -174,19 +174,25 @@ storage_config compute_shared_storage(const int available_shared_mem,
 }  // namespace batch_bicgstab
 
 
-#define GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(_type)                       \
+#define GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(_type, _matrix, _prec)       \
     void apply(                                                              \
         std::shared_ptr<const DefaultExecutor> exec,                         \
         const gko::kernels::batch_bicgstab::settings<remove_complex<_type>>& \
             options,                                                         \
-        const batch::BatchLinOp* a, const batch::BatchLinOp* preconditioner, \
+        const _matrix* a, const _prec* preconditioner,                       \
         const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,    \
         gko::batch::log::detail::log_data<remove_complex<_type>>& logdata)
 
+#define GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER(_vtype, _matrix, \
+                                                        _precond)        \
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(_vtype, _matrix<_vtype>,     \
+                                            _precond<_vtype>)
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES \
-    template <typename ValueType>    \
-    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(ValueType)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                           \
+    template <typename ValueType, typename BatchMatrixType, typename PrecType> \
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(ValueType, BatchMatrixType,        \
+                                            PrecType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_bicgstab,
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index 0ab1ca8564f..0ac9f111cea 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -7,11 +7,16 @@
 #include <ginkgo/core/base/batch_lin_op.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
 #include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
 
-
 namespace gko {
 namespace batch {
 namespace solver {
@@ -49,8 +54,24 @@ void Cg<ValueType>::solver_apply(
         this->max_iterations_, static_cast<real_type>(this->residual_tol_),
         parameters_.tolerance_type};
     auto exec = this->get_executor();
-    exec->run(cg::make_apply(settings, this->system_matrix_.get(),
-                             this->preconditioner_.get(), b, x, *log_data));
+
+    run<batch::matrix::Dense<ValueType>, batch::matrix::Csr<ValueType>,
+        batch::matrix::Ell<ValueType>>(
+        this->system_matrix_.get(), [&](auto matrix) {
+            if (this->preconditioner_ == nullptr) {
+                auto identity = matrix::Identity<ValueType>::create(
+                    exec, matrix->get_size());
+                exec->run(cg::make_apply(settings, matrix, identity.get(), b, x,
+                                         *log_data));
+            } else {
+                run<batch::matrix::Identity<ValueType>,
+                    batch::preconditioner::Jacobi<ValueType>>(
+                    this->preconditioner_.get(), [&](auto preconditioner) {
+                        exec->run(cg::make_apply(
+                            settings, matrix, preconditioner, b, x, *log_data));
+                    });
+            }
+        });
 }
 
 
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index b21a2c07d3e..79e5e6c397d 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -162,19 +162,22 @@ storage_config compute_shared_storage(const int available_shared_mem,
 }  // namespace batch_cg
 
 
-#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type)                               \
-    void apply(                                                                \
-        std::shared_ptr<const DefaultExecutor> exec,                           \
-        const gko::kernels::batch_cg::settings<remove_complex<_type>>&         \
-            options,                                                           \
-        const batch::BatchLinOp* mat, const batch::BatchLinOp* preconditioner, \
-        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,      \
+#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type, _matrix, _prec)          \
+    void apply(                                                           \
+        std::shared_ptr<const DefaultExecutor> exec,                      \
+        const gko::kernels::batch_cg::settings<remove_complex<_type>>&    \
+            options,                                                      \
+        const _matrix* mat, const _prec* preconditioner,                  \
+        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x, \
         gko::batch::log::detail::log_data<remove_complex<_type>>& logdata)
 
+#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER(_vtype, _matrix, _precond) \
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_vtype, _matrix<_vtype>, _precond<_vtype>)
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES \
-    template <typename ValueType>    \
-    GKO_DECLARE_BATCH_CG_APPLY_KERNEL(ValueType)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                           \
+    template <typename ValueType, typename BatchMatrixType, typename PrecType> \
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL(ValueType, BatchMatrixType, PrecType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_cg, GKO_DECLARE_ALL_AS_TEMPLATES);
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 5a37b12cf11..f798515f2e2 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -229,7 +229,8 @@ enum class log_type { simple_convergence_completion };
  * @tparam SettingsType  Structure type of options for the particular solver to
  * be used.
  */
-template <typename ValueType, typename KernelCaller, typename SettingsType>
+template <typename ValueType, typename KernelCaller, typename SettingsType,
+          typename BatchMatrixType, typename PrecType>
 class batch_solver_dispatch {
 public:
     using value_type = ValueType;
@@ -238,7 +239,8 @@ class batch_solver_dispatch {
 
     batch_solver_dispatch(
         const KernelCaller& kernel_caller, const SettingsType& settings,
-        const BatchLinOp* const matrix, const BatchLinOp* const preconditioner,
+        const BatchMatrixType* const matrix,
+        const PrecType* const preconditioner,
         const log::detail::log_type logger_type =
             log::detail::log_type::simple_convergence_completion)
         : caller_{kernel_caller},
@@ -248,21 +250,21 @@ class batch_solver_dispatch {
           logger_type_{logger_type}
     {}
 
-    template <typename PrecType, typename BatchMatrixType, typename LogType>
+    template <typename PrecEntry, typename BatchMatrixEntry, typename LogType>
     void dispatch_on_stop(
-        const LogType& logger, const BatchMatrixType& mat_item,
-        PrecType precond,
+        const LogType& logger, const BatchMatrixEntry& mat_item,
+        PrecEntry precond,
         const multi_vector::uniform_batch<const device_value_type>& b_item,
         const multi_vector::uniform_batch<device_value_type>& x_item)
     {
         if (settings_.tol_type == stop::tolerance_type::absolute) {
             caller_.template call_kernel<
-                BatchMatrixType, PrecType,
+                BatchMatrixEntry, PrecEntry,
                 device::batch_stop::SimpleAbsResidual<device_value_type>,
                 LogType>(logger, mat_item, precond, b_item, x_item);
         } else if (settings_.tol_type == stop::tolerance_type::relative) {
             caller_.template call_kernel<
-                BatchMatrixType, PrecType,
+                BatchMatrixEntry, PrecEntry,
                 device::batch_stop::SimpleRelResidual<device_value_type>,
                 LogType>(logger, mat_item, precond, b_item, x_item);
         } else {
@@ -270,37 +272,37 @@ class batch_solver_dispatch {
         }
     }
 
-    template <typename BatchMatrixType, typename LogType>
+    template <typename BatchMatrixEntry, typename LogType>
     void dispatch_on_preconditioner(
-        const LogType& logger, const BatchMatrixType& mat_item,
+        const LogType& logger, const BatchMatrixEntry& mat_item,
         const multi_vector::uniform_batch<const device_value_type>& b_item,
         const multi_vector::uniform_batch<device_value_type>& x_item)
     {
-        if (!precond_ ||
-            dynamic_cast<const matrix::Identity<value_type>*>(precond_)) {
+        if constexpr (std::is_same_v<PrecType, matrix::Identity<value_type>>) {
             dispatch_on_stop(
                 logger, mat_item,
                 device::batch_preconditioner::Identity<device_value_type>(),
                 b_item, x_item);
-        } else if (auto prec = dynamic_cast<
-                       const batch::preconditioner::Jacobi<value_type>*>(
-                       precond_)) {
-            const auto max_block_size = prec->get_max_block_size();
+        } else if constexpr (std::is_same_v<
+                                 PrecType,
+                                 batch::preconditioner::Jacobi<value_type>>) {
+            const auto max_block_size = precond_->get_max_block_size();
             if (max_block_size == 1) {
                 dispatch_on_stop(logger, mat_item,
                                  device::batch_preconditioner::ScalarJacobi<
                                      device_value_type>(),
                                  b_item, x_item);
             } else {
-                const auto num_blocks = prec->get_num_blocks();
-                const auto block_ptrs_arr = prec->get_const_block_pointers();
+                const auto num_blocks = precond_->get_num_blocks();
+                const auto block_ptrs_arr =
+                    precond_->get_const_block_pointers();
                 const auto row_block_map_arr =
-                    prec->get_const_map_block_to_row();
+                    precond_->get_const_map_block_to_row();
                 const auto blocks_arr =
                     reinterpret_cast<DeviceValueType<const ValueType*>>(
-                        prec->get_const_blocks());
+                        precond_->get_const_blocks());
                 const auto blocks_cumul_storage =
-                    prec->get_const_blocks_cumulative_offsets();
+                    precond_->get_const_blocks_cumulative_offsets();
 
                 dispatch_on_stop(
                     logger, mat_item,
@@ -315,9 +317,9 @@ class batch_solver_dispatch {
         }
     }
 
-    template <typename BatchMatrixType>
+    template <typename BatchMatrixEntry>
     void dispatch_on_logger(
-        const BatchMatrixType& amat,
+        const BatchMatrixEntry& amat,
         const multi_vector::uniform_batch<const device_value_type>& b_item,
         const multi_vector::uniform_batch<device_value_type>& x_item,
         batch::log::detail::log_data<real_type>& log_data)
@@ -337,23 +339,8 @@ class batch_solver_dispatch {
         const multi_vector::uniform_batch<device_value_type>& x_item,
         batch::log::detail::log_data<real_type>& log_data)
     {
-        if (auto batch_mat =
-                dynamic_cast<const batch::matrix::Ell<ValueType, int32>*>(
-                    mat_)) {
-            auto mat_item = device::get_batch_struct(batch_mat);
-            dispatch_on_logger(mat_item, b_item, x_item, log_data);
-        } else if (auto batch_mat =
-                       dynamic_cast<const batch::matrix::Dense<ValueType>*>(
-                           mat_)) {
-            auto mat_item = device::get_batch_struct(batch_mat);
-            dispatch_on_logger(mat_item, b_item, x_item, log_data);
-        } else if (auto batch_mat = dynamic_cast<
-                       const batch::matrix::Csr<ValueType, int32>*>(mat_)) {
-            auto mat_item = device::get_batch_struct(batch_mat);
-            dispatch_on_logger(mat_item, b_item, x_item, log_data);
-        } else {
-            GKO_NOT_SUPPORTED(mat_);
-        }
+        auto mat_item = device::get_batch_struct(mat_);
+        dispatch_on_logger(mat_item, b_item, x_item, log_data);
     }
 
     /**
@@ -375,8 +362,8 @@ class batch_solver_dispatch {
 private:
     const KernelCaller caller_;
     const SettingsType settings_;
-    const BatchLinOp* mat_;
-    const BatchLinOp* precond_;
+    const BatchMatrixType* mat_;
+    const PrecType* precond_;
     const log::detail::log_type logger_type_;
 };
 
@@ -384,14 +371,19 @@ class batch_solver_dispatch {
 /**
  * Convenient function to create a dispatcher. Infers most template arguments.
  */
-template <typename ValueType, typename KernelCaller, typename SettingsType>
-batch_solver_dispatch<ValueType, KernelCaller, SettingsType> create_dispatcher(
-    const KernelCaller& kernel_caller, const SettingsType& settings,
-    const BatchLinOp* const matrix, const BatchLinOp* const preconditioner,
-    const log::detail::log_type logger_type =
-        log::detail::log_type::simple_convergence_completion)
+template <typename ValueType, typename KernelCaller, typename SettingsType,
+          typename BatchMatrixType, typename PrecType>
+batch_solver_dispatch<ValueType, KernelCaller, SettingsType, BatchMatrixType,
+                      PrecType>
+create_dispatcher(const KernelCaller& kernel_caller,
+                  const SettingsType& settings,
+                  const BatchMatrixType* const matrix,
+                  const PrecType* const preconditioner,
+                  const log::detail::log_type logger_type =
+                      log::detail::log_type::simple_convergence_completion)
 {
-    return batch_solver_dispatch<ValueType, KernelCaller, SettingsType>(
+    return batch_solver_dispatch<ValueType, KernelCaller, SettingsType,
+                                 BatchMatrixType, PrecType>(
         kernel_caller, settings, matrix, preconditioner, logger_type);
 }
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index e8052637763..74d312c95ef 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -9,6 +9,7 @@
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/solver/batch_bicgstab_launch.cuh"
@@ -138,20 +139,21 @@ private:
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index e45e1baf03b..e1aec94852b 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -9,6 +9,7 @@
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/solver/batch_cg_launch.cuh"
@@ -119,20 +120,21 @@ private:
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index c02ca02e1d8..e86eec5f21b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -8,6 +8,7 @@
 
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -168,10 +169,10 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
@@ -181,8 +182,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index d94019125b1..5ded4a53978 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -8,6 +8,7 @@
 
 #include <ginkgo/core/solver/batch_cg.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -145,10 +146,10 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
@@ -158,8 +159,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 3e019fd3ad1..66d6130cfd0 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
 #include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 
@@ -162,20 +163,21 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 7f6f7ffe1db..f36974aae06 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "common/cuda_hip/solver/batch_cg_launch.hpp"
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 
@@ -144,20 +145,21 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 4b06b494707..32f80e9ac05 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -419,6 +419,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double)
 #endif
 
+/**
+ * Instantiates a template with additional arguments for each non-complex value
+ * type compiled by Ginkgo.
+ *
+ * @see GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE
+ */
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, ...) \
+    template _macro(float, __VA_ARGS__);                                  \
+    template <>                                                           \
+    _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+#else
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, ...) \
+    template _macro(float, __VA_ARGS__);                                  \
+    template _macro(double, __VA_ARGS__)
+#endif
+
 
 /**
  * Instantiates a template for each value type compiled by Ginkgo.
@@ -496,6 +513,26 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
+/**
+ * Instantiates a template with additional arguments for each value type
+ * compiled by Ginkgo.
+ *
+ * @see GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE
+ */
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, ...)                  \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, __VA_ARGS__); \
+    template _macro(std::complex<float>, __VA_ARGS__);                         \
+    template <>                                                                \
+    _macro(std::complex<double>, __VA_ARGS__) GKO_NOT_IMPLEMENTED
+#else
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, ...)                  \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, __VA_ARGS__); \
+    template _macro(std::complex<float>, __VA_ARGS__);                         \
+    template _macro(std::complex<double>, __VA_ARGS__)
+#endif
+
+
 /**
  * Instantiates a template for each value and scalar type compiled by Ginkgo.
  * This means all value and scalar type combinations for which
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index 5e069806f60..f8a4dbb8172 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -8,6 +8,7 @@
 
 #include <ginkgo/core/base/array.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/matrix/batch_csr_kernels.hpp"
@@ -77,10 +78,10 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
@@ -90,7 +91,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 0664c0244b6..26a7046a176 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -8,6 +8,7 @@
 
 #include <ginkgo/core/base/array.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/matrix/batch_csr_kernels.hpp"
@@ -83,10 +84,10 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precond,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
@@ -96,7 +97,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 5bc75c5ebdb..3f105f27c48 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -4,6 +4,7 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/matrix/batch_csr_kernels.hpp"
@@ -37,10 +38,10 @@ class kernel_caller {
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename BatchMatrixType, typename PrecType, typename StopType,
+    template <typename BatchMatrixEntry, typename PrecEntry, typename StopType,
               typename LogType>
     void call_kernel(
-        const LogType& logger, const BatchMatrixType& mat, PrecType prec,
+        const LogType& logger, const BatchMatrixEntry& mat, PrecEntry prec,
         const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
         const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
     {
@@ -55,13 +56,13 @@ class kernel_caller {
         const size_type local_size_bytes =
             gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
-            PrecType::dynamic_work_size(num_rows,
-                                        mat.get_single_item_num_nnz());
+            PrecEntry::dynamic_work_size(num_rows,
+                                         mat.get_single_item_num_nnz());
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
             batch_single_kernels::batch_entry_bicgstab_impl<
-                StopType, PrecType, LogType, BatchMatrixType, ValueType>(
+                StopType, PrecEntry, LogType, BatchMatrixEntry, ValueType>(
                 settings_, logger, prec, mat, b, x, batch_id,
                 local_space.get_data());
         }
@@ -73,20 +74,21 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
-           batch::log::detail::log_data<remove_complex<ValueType>>& log_data)
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
-    dispatcher.apply(b, x, log_data);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
+    dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_bicgstab
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index ba54329c31a..3acc49fc524 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -4,6 +4,7 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/matrix/batch_csr_kernels.hpp"
@@ -11,7 +12,6 @@
 #include "reference/matrix/batch_ell_kernels.hpp"
 #include "reference/solver/batch_cg_kernels.hpp"
 
-
 namespace gko {
 namespace kernels {
 namespace reference {
@@ -37,10 +37,10 @@ class kernel_caller {
         : exec_{std::move(exec)}, settings_{settings}
     {}
 
-    template <typename BatchMatrixType, typename PrecType, typename StopType,
+    template <typename BatchMatrixEntry, typename PrecEntry, typename StopType,
               typename LogType>
     void call_kernel(
-        const LogType& logger, const BatchMatrixType& mat, PrecType prec,
+        const LogType& logger, const BatchMatrixEntry& mat, PrecEntry prec,
         const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
         const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
     {
@@ -55,13 +55,13 @@ class kernel_caller {
         const size_type local_size_bytes =
             gko::kernels::batch_cg::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
-            PrecType::dynamic_work_size(num_rows,
-                                        mat.get_single_item_num_nnz());
+            PrecEntry::dynamic_work_size(num_rows,
+                                         mat.get_single_item_num_nnz());
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
             batch_single_kernels::batch_entry_cg_impl<
-                StopType, PrecType, LogType, BatchMatrixType, ValueType>(
+                StopType, PrecEntry, LogType, BatchMatrixEntry, ValueType>(
                 settings_, logger, prec, mat, b, x, batch_id,
                 local_space.get_data());
         }
@@ -73,20 +73,21 @@ class kernel_caller {
 };
 
 
-template <typename ValueType>
+template <typename ValueType, typename BatchMatrixType, typename PrecType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* mat, const batch::BatchLinOp* precon,
+           const BatchMatrixType* mat, const PrecType* precond,
            const batch::MultiVector<ValueType>* b,
            batch::MultiVector<ValueType>* x,
-           batch::log::detail::log_data<remove_complex<ValueType>>& log_data)
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
-    dispatcher.apply(b, x, log_data);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
+    dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+    GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
 }  // namespace batch_cg

From 739320b269848a687aadb249d5087b0c7a7c6cdc Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 19:48:54 +0100
Subject: [PATCH 286/448] fix test solve_lambda

---
 .../test/solver/batch_bicgstab_kernels.cpp    | 21 ++++++++++++++++---
 reference/test/solver/batch_cg_kernels.cpp    | 20 ++++++++++++++++--
 test/preconditioner/batch_jacobi_kernels.cpp  | 20 +++++++++++++++---
 test/solver/batch_bicgstab_kernels.cpp        | 21 ++++++++++++++++---
 test/solver/batch_cg_kernels.cpp              | 21 ++++++++++++++++---
 5 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index ddb6d09e12a..9644b4f2d26 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -14,9 +14,12 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
 #include "core/base/batch_utilities.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
@@ -49,9 +52,21 @@ class BatchBicgstab : public ::testing::Test {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::reference::batch_bicgstab::apply<
-                typename Mtx::value_type>(executor, opts, mtx, prec, b, x,
-                                          log_data);
+            if (prec == nullptr) {
+                auto identity =
+                    gko::batch::matrix::Identity<value_type>::create(
+                        executor, mtx->get_size());
+                gko::kernels::reference::batch_bicgstab::apply(
+                    executor, opts, mtx, identity.get(), b, x, log_data);
+            } else {
+                gko::run<gko::batch::matrix::Identity<value_type>,
+                         gko::batch::preconditioner::Jacobi<value_type>>(
+                    prec, [&](auto preconditioner) {
+                        gko::kernels::reference::batch_bicgstab::apply(
+                            executor, opts, mtx, preconditioner, b, x,
+                            log_data);
+                    });
+            }
         };
     }
 
diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
index 4ccabfb8849..924372cd5b0 100644
--- a/reference/test/solver/batch_cg_kernels.cpp
+++ b/reference/test/solver/batch_cg_kernels.cpp
@@ -14,9 +14,12 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/solver/batch_cg.hpp>
 
 #include "core/base/batch_utilities.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
@@ -49,8 +52,21 @@ class BatchCg : public ::testing::Test {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::reference::batch_cg::apply<typename Mtx::value_type>(
-                executor, opts, mtx, prec, b, x, log_data);
+            if (prec == nullptr) {
+                auto identity =
+                    gko::batch::matrix::Identity<value_type>::create(
+                        executor, mtx->get_size());
+                gko::kernels::reference::batch_cg::apply(
+                    executor, opts, mtx, identity.get(), b, x, log_data);
+            } else {
+                gko::run<gko::batch::matrix::Identity<value_type>,
+                         gko::batch::preconditioner::Jacobi<value_type>>(
+                    prec, [&](auto preconditioner) {
+                        gko::kernels::reference::batch_cg::apply(
+                            executor, opts, mtx, preconditioner, b, x,
+                            log_data);
+                    });
+            }
         };
     }
 
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index 62e309361c9..993f551aced 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -14,10 +14,12 @@
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
+#include "core/base/dispatch_helper.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
@@ -113,9 +115,21 @@ class BatchJacobi : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
-                typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
-                                          log_data);
+            if (prec == nullptr) {
+                auto identity =
+                    gko::batch::matrix::Identity<value_type>::create(
+                        executor, mtx->get_size());
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
+                    executor, settings, mtx, identity.get(), b, x, log_data);
+            } else {
+                gko::run<gko::batch::matrix::Identity<value_type>,
+                         gko::batch::preconditioner::Jacobi<value_type>>(
+                    prec, [&](auto preconditioner) {
+                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::
+                            apply(executor, settings, mtx, preconditioner, b, x,
+                                  log_data);
+                    });
+            }
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index 1a852eacfe9..b799c45bf33 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -14,9 +14,12 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
 #include "core/base/batch_utilities.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
@@ -48,9 +51,21 @@ class BatchBicgstab : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
-                typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
-                                          log_data);
+            if (prec == nullptr) {
+                auto identity =
+                    gko::batch::matrix::Identity<value_type>::create(
+                        executor, mtx->get_size());
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
+                    executor, settings, mtx, identity.get(), b, x, log_data);
+            } else {
+                gko::run<gko::batch::matrix::Identity<value_type>,
+                         gko::batch::preconditioner::Jacobi<value_type>>(
+                    prec, [&](auto preconditioner) {
+                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::
+                            apply(executor, settings, mtx, preconditioner, b, x,
+                                  log_data);
+                    });
+            }
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 4c6de9004c9..57685f44825 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -13,9 +13,12 @@
 #include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 #include <ginkgo/core/solver/batch_cg.hpp>
 
 #include "core/base/batch_utilities.hpp"
+#include "core/base/dispatch_helper.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/batch_helpers.hpp"
@@ -46,9 +49,21 @@ class BatchCg : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply<
-                typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
-                                          log_data);
+            if (prec == nullptr) {
+                auto identity =
+                    gko::batch::matrix::Identity<value_type>::create(
+                        executor, mtx->get_size());
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply(
+                    executor, settings, mtx, identity.get(), b, x, log_data);
+            } else {
+                gko::run<gko::batch::matrix::Identity<value_type>,
+                         gko::batch::preconditioner::Jacobi<value_type>>(
+                    prec, [&](auto preconditioner) {
+                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply(
+                            executor, settings, mtx, preconditioner, b, x,
+                            log_data);
+                    });
+            }
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};

From 33cb7664d3ee78c1fab36b767543827d653b24b2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 15:47:54 +0100
Subject: [PATCH 287/448] msys

---
 .github/workflows/msys.yml | 94 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 .github/workflows/msys.yml

diff --git a/.github/workflows/msys.yml b/.github/workflows/msys.yml
new file mode 100644
index 00000000000..c980b885e2b
--- /dev/null
+++ b/.github/workflows/msys.yml
@@ -0,0 +1,94 @@
+name: msys
+
+on:
+  push:
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { msystem: MINGW64, runner: windows-2022 },
+          { msystem: CLANG64, runner: windows-2022 },
+        ]
+    name: ${{ matrix.msystem }}
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - name: Get CPU Name
+        run : |
+          Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name
+      - name: Setup JIT minidump
+        if: ${{ matrix.msystem != 'CLANGARM64' }}
+        run: |
+          Set-Location '${{ runner.temp }}'
+          Invoke-WebRequest -Uri 'https://download.sysinternals.com/files/Procdump.zip' -OutFile Procdump.zip
+          Expand-Archive Procdump.zip -DestinationPath .
+          New-Item -Path '_dumps' -ItemType Directory
+          .\procdump64.exe -accepteula -ma -i "${{ runner.temp }}/_dumps"
+          .\procdump.exe -accepteula -ma -i "${{ runner.temp }}/_dumps"
+      - name: Configure Pagefile
+        if: ${{ matrix.msystem != 'CLANGARM64' }}
+        # https://github.com/al-cheb/configure-pagefile-action/issues/16
+        continue-on-error: true
+        uses: al-cheb/configure-pagefile-action@v1.4
+        with:
+          minimum-size: 4GB
+          maximum-size: 16GB
+          disk-root: "C:"
+
+      - name: Checkout the latest code (shallow clone)
+        uses: actions/checkout@v4
+        with: 
+          path: temp
+
+      # to match the autobuild environment
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          architecture: 'x64'
+
+      - uses: msys2/setup-msys2@v2
+        with:
+          msystem: ${{ matrix.msystem }}
+          install: git python base-devel
+          pacboy: >-
+            toolchain:p
+            cmake:p
+            ninja:p
+          update: true
+          release: ${{ runner.arch != 'ARM64' }}
+          location: 'D:\M'
+
+      - name: Add staging repo
+        shell: msys2 {0}
+        run: |
+          cp /etc/pacman.conf /etc/pacman.conf.bak
+          grep -qFx '[staging]' /etc/pacman.conf || sed -i '/^# \[staging\]/,/^$/ s|^# ||g' /etc/pacman.conf
+      - name: Update using staging
+        run: |
+          msys2 -c 'pacman --noconfirm -Suuy'
+          msys2 -c 'pacman --noconfirm -Suu'
+      - name: Move Checkout
+        run: |
+          If (Test-Path "C:\_") { rm -r -fo "C:\_" }
+          Copy-Item -Path ".\temp" -Destination "C:\_" -Recurse
+      - name: CI-Build
+        shell: msys2 {0}
+        id: build
+        run: |
+          cd /C/_
+          unset VCPKG_ROOT
+          mkdir build
+          cd build
+          cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF ..
+          ninja
+          ctest --output-on-failure
+      - name: "Clean up runner"
+        if: ${{ always() }}
+        continue-on-error: true
+        run: |
+          If (Test-Path "C:\_") { rm -r -fo "C:\_" }
+          msys2 -c 'mv -f /etc/pacman.conf.bak /etc/pacman.conf'
+          msys2 -c 'pacman --noconfirm -Suuy'
+          msys2 -c 'pacman --noconfirm -Suu'
\ No newline at end of file

From 8212d7abc0dbb60c629529e6b4566d7f35ff4f47 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 17:17:33 +0100
Subject: [PATCH 288/448] also separate config to another library in mingw
 (including msys2/clang)

---
 core/CMakeLists.txt | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index ddd8937c44f..801ba46d248 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -10,11 +10,13 @@ set(config_source
 if(GINKGO_BUILD_MPI)
     list(APPEND config_source config/schwarz_config.cpp)
 endif()
-# MSVC: To solve LNK1189, we separate the library as a workaround
+# MSVC: LNK1189 issue
+# CLANG in MSYS2 (MINGW): too many exported symbols
+# We separate the library as a workaround to solve this issue
 # To make ginkgo still be the major library, we make the original to ginkgo_core in MSVC/shared
 # TODO: should think another way to solve it like dllexport or def file
 set(ginkgo_core "ginkgo")
-if(MSVC AND BUILD_SHARED_LIBS)
+if((MSVC OR MINGW) AND BUILD_SHARED_LIBS)
     set(ginkgo_core "ginkgo_core")
 endif()
 
@@ -142,8 +144,8 @@ if(GINKGO_BUILD_MPI)
         distributed/preconditioner/schwarz.cpp)
 endif()
 
-# MSVC/shared: make ginkgo be the major library
-if(MSVC AND BUILD_SHARED_LIBS)
+# MSVC or CLANG/msys2 with shared: make ginkgo be the major library
+if((MSVC OR MINGW) AND BUILD_SHARED_LIBS)
     add_library(ginkgo "")
     target_sources(ginkgo PRIVATE ${config_source})
     ginkgo_compile_features(ginkgo)
@@ -161,7 +163,7 @@ ginkgo_compile_features(${ginkgo_core})
 # add a namespace alias so Ginkgo can always be included as Ginkgo::ginkgo
 # regardless of whether it is installed or added as a subdirectory
 add_library(Ginkgo::ginkgo ALIAS ginkgo)
-if(MSVC AND BUILD_SHARED_LIBS)
+if((MSVC OR MINGW) AND BUILD_SHARED_LIBS)
     target_link_libraries(ginkgo PUBLIC ${ginkgo_core})
 endif()
 target_link_libraries(${ginkgo_core}

From d466af57e1044afac0fa30791c20545fb38fad93 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 13 Nov 2024 14:09:24 +0100
Subject: [PATCH 289/448] use -fno-assume-unique-vtables to make dynamic_cast
 to the final class work

---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d0804b4eed..2dded88d122 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,7 +94,13 @@ if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
 endif()
 if(MINGW OR CYGWIN)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        # Otherwise, dynamic_cast to the class marked by final will be failed.
+        # https://reviews.llvm.org/D154658 should be relevant
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-assume-unique-vtables")
+    else()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
+    endif()
 endif()
 
 # For now, PGI/NVHPC nvc++ compiler doesn't seem to support

From 0308a24fbbe797d1af814b7d5485456a16f4f47c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 13 Nov 2024 19:22:47 +0100
Subject: [PATCH 290/448] reuse the macro from another pr

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/base/batch_instantiation.hpp  | 24 +++++++++----------
 include/ginkgo/core/base/types.hpp | 37 ------------------------------
 2 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
index a686e9838a7..e7f0153c849 100644
--- a/core/base/batch_instantiation.hpp
+++ b/core/base/batch_instantiation.hpp
@@ -28,18 +28,18 @@ namespace batch {
  *
  * @note the second and third arguments only accept the base type.s
  */
-#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_macro)         \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, gko::batch::matrix::Csr, \
-                                             gko::batch::matrix::Identity);   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, gko::batch::matrix::Ell, \
-                                             gko::batch::matrix::Identity);   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
-        _macro, gko::batch::matrix::Dense, gko::batch::matrix::Identity);     \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
-        _macro, gko::batch::matrix::Csr, gko::batch::preconditioner::Jacobi); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
-        _macro, gko::batch::matrix::Ell, gko::batch::preconditioner::Jacobi); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(                                 \
+#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, gko::batch::matrix::Csr, \
+                                              gko::batch::matrix::Identity);   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, gko::batch::matrix::Ell, \
+                                              gko::batch::matrix::Identity);   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
+        _macro, gko::batch::matrix::Dense, gko::batch::matrix::Identity);      \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
+        _macro, gko::batch::matrix::Csr, gko::batch::preconditioner::Jacobi);  \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
+        _macro, gko::batch::matrix::Ell, gko::batch::preconditioner::Jacobi);  \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
         _macro, gko::batch::matrix::Dense, gko::batch::preconditioner::Jacobi)
 
 }  // namespace batch
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 32f80e9ac05..4b06b494707 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -419,23 +419,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double)
 #endif
 
-/**
- * Instantiates a template with additional arguments for each non-complex value
- * type compiled by Ginkgo.
- *
- * @see GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE
- */
-#if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, ...) \
-    template _macro(float, __VA_ARGS__);                                  \
-    template <>                                                           \
-    _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED
-#else
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, ...) \
-    template _macro(float, __VA_ARGS__);                                  \
-    template _macro(double, __VA_ARGS__)
-#endif
-
 
 /**
  * Instantiates a template for each value type compiled by Ginkgo.
@@ -513,26 +496,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
-/**
- * Instantiates a template with additional arguments for each value type
- * compiled by Ginkgo.
- *
- * @see GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE
- */
-#if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, ...)                  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, __VA_ARGS__); \
-    template _macro(std::complex<float>, __VA_ARGS__);                         \
-    template <>                                                                \
-    _macro(std::complex<double>, __VA_ARGS__) GKO_NOT_IMPLEMENTED
-#else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_ARGS(_macro, ...)                  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_ARGS(_macro, __VA_ARGS__); \
-    template _macro(std::complex<float>, __VA_ARGS__);                         \
-    template _macro(std::complex<double>, __VA_ARGS__)
-#endif
-
-
 /**
  * Instantiates a template for each value and scalar type compiled by Ginkgo.
  * This means all value and scalar type combinations for which

From 98e6fbdbc94262822c4437f45964c7aa24a4650e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 15:12:19 +0100
Subject: [PATCH 291/448] make the macro stack clear

---
 .../cuda_hip/solver/batch_bicgstab_launch.hpp | 27 ++++----
 common/cuda_hip/solver/batch_cg_launch.hpp    | 19 +++---
 core/base/batch_instantiation.hpp             | 44 ++++++------
 core/solver/batch_dispatch.hpp                | 67 +++++++------------
 cuda/solver/batch_bicgstab_launch.cuh         | 18 ++---
 cuda/solver/batch_cg_launch.cuh               | 28 +++-----
 dpcpp/solver/batch_bicgstab_launch.hpp        | 28 ++++----
 dpcpp/solver/batch_cg_launch.hpp              | 21 +++---
 8 files changed, 104 insertions(+), 148 deletions(-)

diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
index 696e11b5899..3886c33bcd5 100644
--- a/common/cuda_hip/solver/batch_bicgstab_launch.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
@@ -11,6 +11,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
 
 
 namespace gko {
@@ -50,32 +51,28 @@ void launch_apply_kernel(
         device_type<_vtype>* const __restrict__ workspace_data,            \
         const int& block_size, const size_t& shared_size)
 
-#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(     \
-        GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 0, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 0, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 1, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 1, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 2, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 2, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 3, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 3, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 4, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 4, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 5, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 5, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 6, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 6, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 7, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 7, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 8, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 8, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 9, false)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, true)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 9, true)
 
 
 }  // namespace batch_bicgstab
diff --git a/common/cuda_hip/solver/batch_cg_launch.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp
index fe5d96c8a21..4306dc2bfab 100644
--- a/common/cuda_hip/solver/batch_cg_launch.hpp
+++ b/common/cuda_hip/solver/batch_cg_launch.hpp
@@ -11,6 +11,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
+#include "core/solver/batch_dispatch.hpp"
 
 
 namespace gko {
@@ -50,24 +51,20 @@ void launch_apply_kernel(
         device_type<_vtype>* const __restrict__ workspace_data,               \
         const int& block_size, const size_t& shared_size)
 
-#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...)                               \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \
-                                              __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 0, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 0, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 1, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 1, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 2, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 2, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 3, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 3, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 4, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 4, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, false)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 5, false)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, true)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 5, true)
 
 
 }  // namespace batch_cg
diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
index e7f0153c849..6ea3faa104e 100644
--- a/core/base/batch_instantiation.hpp
+++ b/core/base/batch_instantiation.hpp
@@ -12,37 +12,43 @@
 #include <ginkgo/core/matrix/batch_identity.hpp>
 #include <ginkgo/core/preconditioner/batch_jacobi.hpp>
 
+
 namespace gko {
 namespace batch {
 
+
+// just make the call list more consistent
+#define GKO_CALL(_macro, ...) _macro(__VA_ARGS__)
+
+#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(_next, ...) \
+    _next(__VA_ARGS__, gko::batch::matrix::Identity);    \
+    _next(__VA_ARGS__, gko::batch::preconditioner::Jacobi)
+
+#define GKO_BATCH_INSTANTIATE_MATRIX(_next, ...)   \
+    _next(__VA_ARGS__, gko::batch::matrix::Ell);   \
+    _next(__VA_ARGS__, gko::batch::matrix::Dense); \
+    _next(__VA_ARGS__, gko::batch::matrix::Csr)
+
 /**
  * Instantiates a template for each valid combination of value type, batch
  * matrix type, and batch preconditioner type. This only allows batch matrix
  * type and preconditioner type also uses the same value type.
  *
- * @param _macro  A macro which expands the template instantiation
- *                (not including the leading `template` specifier).
- *                Should take three arguments, where the first is replaced by
- *                the value type, the second by the matrix, and the third by the
- *                preconditioner.
+ * @param args   the first should be a macro which expands the template
+ *               instantiation (not including the leading `template` specifier).
+ *               Should take three arguments, where the first is replaced by the
+ *               value type, the second by the matrix, and the third by the
+ *               preconditioner.
  *
  * @note the second and third arguments only accept the base type.s
  */
-#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_macro)          \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, gko::batch::matrix::Csr, \
-                                              gko::batch::matrix::Identity);   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, gko::batch::matrix::Ell, \
-                                              gko::batch::matrix::Identity);   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
-        _macro, gko::batch::matrix::Dense, gko::batch::matrix::Identity);      \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
-        _macro, gko::batch::matrix::Csr, gko::batch::preconditioner::Jacobi);  \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
-        _macro, gko::batch::matrix::Ell, gko::batch::preconditioner::Jacobi);  \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                                 \
-        _macro, gko::batch::matrix::Dense, gko::batch::preconditioner::Jacobi)
+#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(...) \
+    GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX,                         \
+             GKO_BATCH_INSTANTIATE_PRECONDITIONER,                 \
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
+
 
 }  // namespace batch
 }  // namespace gko
 
-#endif  //
+#endif  // GKO_PUBLIC_CORE_BASE_BATCH_INSTANTIATION_HPP_
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index f798515f2e2..2a438f9aafd 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -17,6 +17,7 @@
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 #include <ginkgo/core/stop/batch_stop_enum.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 
@@ -164,35 +165,28 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
-#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                          \
-    macro(__VA_ARGS__,                                                  \
+#define GKO_BATCH_INSTANTIATE_STOP(_next, ...)                          \
+    _next(__VA_ARGS__,                                                  \
           ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
-    macro(__VA_ARGS__,                                                  \
+    _next(__VA_ARGS__,                                                  \
           ::gko::batch::solver::device::batch_stop::SimpleRelResidual)
 
-#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...)                   \
-    GKO_BATCH_INSTANTIATE_STOP(                                            \
-        macro, __VA_ARGS__,                                                \
-        ::gko::batch::solver::device::batch_preconditioner::Identity);     \
-    GKO_BATCH_INSTANTIATE_STOP(                                            \
-        macro, __VA_ARGS__,                                                \
-        ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
-    GKO_BATCH_INSTANTIATE_STOP(                                            \
-        macro, __VA_ARGS__,                                                \
-        ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)
-
-#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \
-    GKO_BATCH_INSTANTIATE_PRECONDITIONER(        \
-        macro, __VA_ARGS__,                      \
-        ::gko::batch::solver::device::batch_log::SimpleFinalLogger)
-
-#define GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, ...)                 \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
-                                 batch::matrix::ell::uniform_batch);   \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
-                                 batch::matrix::dense::uniform_batch); \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                   \
-                                 batch::matrix::csr::uniform_batch)
+#define GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER(_next, ...)              \
+    _next(__VA_ARGS__,                                                       \
+          ::gko::batch::solver::device::batch_preconditioner::Identity);     \
+    _next(__VA_ARGS__,                                                       \
+          ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
+    _next(__VA_ARGS__,                                                       \
+          ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)
+
+#define GKO_BATCH_INSTANTIATE_LOGGER(_next, ...) \
+    _next(__VA_ARGS__,                           \
+          ::gko::batch::solver::device::batch_log::SimpleFinalLogger)
+
+#define GKO_BATCH_INSTANTIATE_MATRIX_BATCH(_next, ...)       \
+    _next(__VA_ARGS__, batch::matrix::ell::uniform_batch);   \
+    _next(__VA_ARGS__, batch::matrix::dense::uniform_batch); \
+    _next(__VA_ARGS__, batch::matrix::csr::uniform_batch)
 
 /**
  * Passes each valid configuration of batch solver template parameter to a
@@ -201,22 +195,11 @@ enum class log_type { simple_convergence_completion };
  * GKO_BATCH_INSTANTIATE will be prepended to the batch solver template
  * parameters.
  */
-#define GKO_BATCH_INSTANTIATE_VARGS(macro, ...) \
-    GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, __VA_ARGS__)
-
-
-/**
- * Passes each valid configuration of batch solver template parameter to a
- * macro. The order of template parameters is: macro(<matrix>, <logger>,
- * <precond>, <stop>)
- */
-#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...)                              \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::ell::uniform_batch);   \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::dense::uniform_batch); \
-    GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::csr::uniform_batch)
-
-#define GKO_BATCH_INSTANTIATE(macro) GKO_BATCH_INSTANTIATE_MATRIX(macro)
-
+#define GKO_BATCH_INSTANTIATE(...)                                             \
+    GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX_BATCH, GKO_BATCH_INSTANTIATE_LOGGER, \
+             GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER,                      \
+             GKO_BATCH_INSTANTIATE_STOP,                                       \
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
 
 /**
  * Handles dispatching to the correct instantiation of a batched solver
diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
index 737f2a923b0..b4e8753ccca 100644
--- a/cuda/solver/batch_bicgstab_launch.cuh
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -31,7 +31,7 @@ template <typename StopType, typename PrecType, typename LogType,
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows);
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(              \
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(               \
     _vtype, mat_t, log_t, pre_t, stop_t)                                    \
     int get_num_threads_per_block<                                          \
         stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
@@ -39,34 +39,24 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec,     \
                            const int num_rows)
 
-#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                         \
-        GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK \
-    GKO_BATCH_INSTANTIATE(                                       \
-        GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK)
 
 
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(          \
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(           \
     _vtype, mat_t, log_t, pre_t, stop_t)                                    \
     int get_max_dynamic_shared_memory<                                      \
         stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
         log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
 
-#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                             \
-        GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_,         \
-        __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY \
     GKO_BATCH_INSTANTIATE(                                           \
-        GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_)
+        GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY)
 
 
 }  // namespace batch_bicgstab
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index e803e15fe80..94d948cf202 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -31,41 +31,31 @@ template <typename StopType, typename PrecType, typename LogType,
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows);
 
-#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \
-                                                        pre_t, stop_t)        \
-    int get_num_threads_per_block<                                            \
-        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                  \
-        log_t<gko::remove_complex<cuda_type<_vtype>>>,                        \
-        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(                   \
+#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype, mat_t, log_t, \
+                                                       pre_t, stop_t)        \
+    int get_num_threads_per_block<                                           \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                 \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>,                       \
+        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(                  \
         std::shared_ptr<const DefaultExecutor> exec, const int num_rows)
 
-#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                   \
-        GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK \
-    GKO_BATCH_INSTANTIATE(GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK)
 
 
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
 
-#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(                \
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(                 \
     _vtype, mat_t, log_t, pre_t, stop_t)                                    \
     int get_max_dynamic_shared_memory<                                      \
         stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
         log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
         cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
 
-
-#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(                       \
-        GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY \
-    GKO_BATCH_INSTANTIATE(                                     \
-        GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY)
 
 
 }  // namespace batch_cg
diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp
index 06ba8531b42..a9c78b9df45 100644
--- a/dpcpp/solver/batch_bicgstab_launch.hpp
+++ b/dpcpp/solver/batch_bicgstab_launch.hpp
@@ -53,34 +53,30 @@ void launch_apply_kernel(
         _vtype* const __restrict__ workspace_data, const int& block_size,    \
         const int& shared_size)
 
-#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(     \
-        GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 0)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 0)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 1)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 1)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 2)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 2)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 3)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 3)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 4)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 4)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 5)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 5)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 6)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 6)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 7)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 7)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 8)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 8)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 9)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 9)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 10)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 10)
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10_16 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 16, 10)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 16, 10)
 
 
 }  // namespace batch_bicgstab
diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp
index 3fe1e704963..c5f8e0d5dba 100644
--- a/dpcpp/solver/batch_cg_launch.hpp
+++ b/dpcpp/solver/batch_cg_launch.hpp
@@ -6,6 +6,7 @@
 
 #include <ginkgo/core/solver/batch_cg.hpp>
 
+#include "core/base/batch_instantiation.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_cg_kernels.hpp"
@@ -50,26 +51,22 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
         _vtype* const __restrict__ workspace_data, const int& block_size,     \
         const int& shared_size)
 
-#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...)                               \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \
-                                              __VA_ARGS__)
-
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 0)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 0)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 1)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 1)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 2)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 2)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 3)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 3)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 4)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 4)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 5)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 5)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 6)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 6)
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6_16 \
-    GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 16, 6)
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 16, 6)
 
 
 }  // namespace batch_cg

From 195ece1b4d4fd8edb32dfc5711c16675b0860c88 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 16:26:38 +0100
Subject: [PATCH 292/448] add GKO_INDIRECT to properly expand the __VA_ARGS__
 in msvc

---
 core/base/batch_instantiation.hpp  | 16 +++++------
 core/solver/batch_dispatch.hpp     | 45 +++++++++++++++++-------------
 include/ginkgo/core/base/types.hpp |  2 ++
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
index 6ea3faa104e..dbcccefb469 100644
--- a/core/base/batch_instantiation.hpp
+++ b/core/base/batch_instantiation.hpp
@@ -18,16 +18,16 @@ namespace batch {
 
 
 // just make the call list more consistent
-#define GKO_CALL(_macro, ...) _macro(__VA_ARGS__)
+#define GKO_CALL(_macro, ...) GKO_INDIRECT(_macro(__VA_ARGS__))
 
-#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(_next, ...) \
-    _next(__VA_ARGS__, gko::batch::matrix::Identity);    \
-    _next(__VA_ARGS__, gko::batch::preconditioner::Jacobi)
+#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(_next, ...)            \
+    GKO_INDIRECT(_next(__VA_ARGS__, gko::batch::matrix::Identity)); \
+    GKO_INDIRECT(_next(__VA_ARGS__, gko::batch::preconditioner::Jacobi))
 
-#define GKO_BATCH_INSTANTIATE_MATRIX(_next, ...)   \
-    _next(__VA_ARGS__, gko::batch::matrix::Ell);   \
-    _next(__VA_ARGS__, gko::batch::matrix::Dense); \
-    _next(__VA_ARGS__, gko::batch::matrix::Csr)
+#define GKO_BATCH_INSTANTIATE_MATRIX(_next, ...)                 \
+    GKO_INDIRECT(_next(__VA_ARGS__, gko::batch::matrix::Ell));   \
+    GKO_INDIRECT(_next(__VA_ARGS__, gko::batch::matrix::Dense)); \
+    GKO_INDIRECT(_next(__VA_ARGS__, gko::batch::matrix::Csr))
 
 /**
  * Instantiates a template for each valid combination of value type, batch
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 2a438f9aafd..d76bc72d489 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -165,28 +165,34 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
-#define GKO_BATCH_INSTANTIATE_STOP(_next, ...)                          \
-    _next(__VA_ARGS__,                                                  \
-          ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
-    _next(__VA_ARGS__,                                                  \
-          ::gko::batch::solver::device::batch_stop::SimpleRelResidual)
-
-#define GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER(_next, ...)              \
-    _next(__VA_ARGS__,                                                       \
-          ::gko::batch::solver::device::batch_preconditioner::Identity);     \
-    _next(__VA_ARGS__,                                                       \
-          ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
-    _next(__VA_ARGS__,                                                       \
-          ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)
+#define GKO_BATCH_INSTANTIATE_STOP(_next, ...)                               \
+    GKO_INDIRECT(                                                            \
+        _next(__VA_ARGS__,                                                   \
+              ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \
+    GKO_INDIRECT(                                                            \
+        _next(__VA_ARGS__,                                                   \
+              ::gko::batch::solver::device::batch_stop::SimpleRelResidual))
+
+#define GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER(_next, ...)               \
+    GKO_INDIRECT(                                                             \
+        _next(__VA_ARGS__,                                                    \
+              ::gko::batch::solver::device::batch_preconditioner::Identity)); \
+    GKO_INDIRECT(_next(                                                       \
+        __VA_ARGS__,                                                          \
+        ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi));   \
+    GKO_INDIRECT(_next(                                                       \
+        __VA_ARGS__,                                                          \
+        ::gko::batch::solver::device::batch_preconditioner::BlockJacobi))
 
 #define GKO_BATCH_INSTANTIATE_LOGGER(_next, ...) \
-    _next(__VA_ARGS__,                           \
-          ::gko::batch::solver::device::batch_log::SimpleFinalLogger)
+    GKO_INDIRECT(                                \
+        _next(__VA_ARGS__,                       \
+              ::gko::batch::solver::device::batch_log::SimpleFinalLogger))
 
-#define GKO_BATCH_INSTANTIATE_MATRIX_BATCH(_next, ...)       \
-    _next(__VA_ARGS__, batch::matrix::ell::uniform_batch);   \
-    _next(__VA_ARGS__, batch::matrix::dense::uniform_batch); \
-    _next(__VA_ARGS__, batch::matrix::csr::uniform_batch)
+#define GKO_BATCH_INSTANTIATE_MATRIX_BATCH(_next, ...)                     \
+    GKO_INDIRECT(_next(__VA_ARGS__, batch::matrix::ell::uniform_batch));   \
+    GKO_INDIRECT(_next(__VA_ARGS__, batch::matrix::dense::uniform_batch)); \
+    GKO_INDIRECT(_next(__VA_ARGS__, batch::matrix::csr::uniform_batch))
 
 /**
  * Passes each valid configuration of batch solver template parameter to a
@@ -201,6 +207,7 @@ enum class log_type { simple_convergence_completion };
              GKO_BATCH_INSTANTIATE_STOP,                                       \
              GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
 
+
 /**
  * Handles dispatching to the correct instantiation of a batched solver
  * depending on runtime parameters.
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 4b06b494707..72dd8a93584 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -443,6 +443,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 
 
 // Helper macro to make Windows builds work
+// In MSVC, __VA_ARGS__ behaves like a single argument by default.
+// Wrapping a macro call in GKO_INDIRECT lets __VA_ARGS__ expand properly.
 #define GKO_INDIRECT(...) __VA_ARGS__
 
 

From 49a8abf39f8776c261cd04144cc262fc12a62015 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 23:37:30 +0100
Subject: [PATCH 293/448] preconditioner can never be nullptr in batch apply

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/solver/batch_bicgstab.cpp                | 17 +++++---------
 core/solver/batch_cg.cpp                      | 19 +++++-----------
 core/test/utils/batch_helpers.hpp             |  4 +++-
 .../test/solver/batch_bicgstab_kernels.cpp    | 21 +++++-------------
 reference/test/solver/batch_cg_kernels.cpp    | 21 +++++-------------
 test/preconditioner/batch_jacobi_kernels.cpp  | 22 ++++++-------------
 test/solver/batch_bicgstab_kernels.cpp        | 22 ++++++-------------
 test/solver/batch_cg_kernels.cpp              | 22 ++++++-------------
 8 files changed, 47 insertions(+), 101 deletions(-)

diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index 66b0bf9f704..73fc0a2c852 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -58,18 +58,11 @@ void Bicgstab<ValueType>::solver_apply(
 
     run<matrix::Dense<ValueType>, matrix::Csr<ValueType>,
         matrix::Ell<ValueType>>(this->system_matrix_.get(), [&](auto matrix) {
-        if (this->preconditioner_ == nullptr) {
-            auto identity =
-                matrix::Identity<ValueType>::create(exec, matrix->get_size());
-            exec->run(bicgstab::make_apply(settings, matrix, identity.get(), b,
-                                           x, *log_data));
-        } else {
-            run<matrix::Identity<ValueType>, preconditioner::Jacobi<ValueType>>(
-                this->preconditioner_.get(), [&](auto preconditioner) {
-                    exec->run(bicgstab::make_apply(
-                        settings, matrix, preconditioner, b, x, *log_data));
-                });
-        }
+        run<matrix::Identity<ValueType>, preconditioner::Jacobi<ValueType>>(
+            this->preconditioner_.get(), [&](auto preconditioner) {
+                exec->run(bicgstab::make_apply(settings, matrix, preconditioner,
+                                               b, x, *log_data));
+            });
     });
 }
 
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index 0ac9f111cea..13a5afffcaa 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -58,19 +58,12 @@ void Cg<ValueType>::solver_apply(
     run<batch::matrix::Dense<ValueType>, batch::matrix::Csr<ValueType>,
         batch::matrix::Ell<ValueType>>(
         this->system_matrix_.get(), [&](auto matrix) {
-            if (this->preconditioner_ == nullptr) {
-                auto identity = matrix::Identity<ValueType>::create(
-                    exec, matrix->get_size());
-                exec->run(cg::make_apply(settings, matrix, identity.get(), b, x,
-                                         *log_data));
-            } else {
-                run<batch::matrix::Identity<ValueType>,
-                    batch::preconditioner::Jacobi<ValueType>>(
-                    this->preconditioner_.get(), [&](auto preconditioner) {
-                        exec->run(cg::make_apply(
-                            settings, matrix, preconditioner, b, x, *log_data));
-                    });
-            }
+            run<batch::matrix::Identity<ValueType>,
+                batch::preconditioner::Jacobi<ValueType>>(
+                this->preconditioner_.get(), [&](auto preconditioner) {
+                    exec->run(cg::make_apply(settings, matrix, preconditioner,
+                                             b, x, *log_data));
+                });
         });
 }
 
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index eff6626de31..15c4d7560d9 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -13,6 +13,7 @@
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "core/test/utils/assertions.hpp"
@@ -334,7 +335,8 @@ ResultWithLogData<typename MatrixType::value_type> solve_linear_system(
     if (precond_factory) {
         precond = precond_factory->generate(sys.matrix);
     } else {
-        precond = nullptr;
+        precond = gko::batch::matrix::Identity<value_type>::create(
+            exec, sys.matrix->get_size());
     }
 
     solve_lambda(settings, precond.get(), sys.matrix.get(), sys.rhs.get(),
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index 9644b4f2d26..c7b36ba875c 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -52,21 +52,12 @@ class BatchBicgstab : public ::testing::Test {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            if (prec == nullptr) {
-                auto identity =
-                    gko::batch::matrix::Identity<value_type>::create(
-                        executor, mtx->get_size());
-                gko::kernels::reference::batch_bicgstab::apply(
-                    executor, opts, mtx, identity.get(), b, x, log_data);
-            } else {
-                gko::run<gko::batch::matrix::Identity<value_type>,
-                         gko::batch::preconditioner::Jacobi<value_type>>(
-                    prec, [&](auto preconditioner) {
-                        gko::kernels::reference::batch_bicgstab::apply(
-                            executor, opts, mtx, preconditioner, b, x,
-                            log_data);
-                    });
-            }
+            gko::run<gko::batch::matrix::Identity<value_type>,
+                     gko::batch::preconditioner::Jacobi<value_type>>(
+                prec, [&](auto preconditioner) {
+                    gko::kernels::reference::batch_bicgstab::apply(
+                        executor, opts, mtx, preconditioner, b, x, log_data);
+                });
         };
     }
 
diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
index 924372cd5b0..86efa158fb5 100644
--- a/reference/test/solver/batch_cg_kernels.cpp
+++ b/reference/test/solver/batch_cg_kernels.cpp
@@ -52,21 +52,12 @@ class BatchCg : public ::testing::Test {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            if (prec == nullptr) {
-                auto identity =
-                    gko::batch::matrix::Identity<value_type>::create(
-                        executor, mtx->get_size());
-                gko::kernels::reference::batch_cg::apply(
-                    executor, opts, mtx, identity.get(), b, x, log_data);
-            } else {
-                gko::run<gko::batch::matrix::Identity<value_type>,
-                         gko::batch::preconditioner::Jacobi<value_type>>(
-                    prec, [&](auto preconditioner) {
-                        gko::kernels::reference::batch_cg::apply(
-                            executor, opts, mtx, preconditioner, b, x,
-                            log_data);
-                    });
-            }
+            gko::run<gko::batch::matrix::Identity<value_type>,
+                     gko::batch::preconditioner::Jacobi<value_type>>(
+                prec, [&](auto preconditioner) {
+                    gko::kernels::reference::batch_cg::apply(
+                        executor, opts, mtx, preconditioner, b, x, log_data);
+                });
         };
     }
 
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index 993f551aced..fe013cee9aa 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -115,21 +115,13 @@ class BatchJacobi : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            if (prec == nullptr) {
-                auto identity =
-                    gko::batch::matrix::Identity<value_type>::create(
-                        executor, mtx->get_size());
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
-                    executor, settings, mtx, identity.get(), b, x, log_data);
-            } else {
-                gko::run<gko::batch::matrix::Identity<value_type>,
-                         gko::batch::preconditioner::Jacobi<value_type>>(
-                    prec, [&](auto preconditioner) {
-                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::
-                            apply(executor, settings, mtx, preconditioner, b, x,
-                                  log_data);
-                    });
-            }
+            gko::run<gko::batch::matrix::Identity<value_type>,
+                     gko::batch::preconditioner::Jacobi<value_type>>(
+                prec, [&](auto preconditioner) {
+                    gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
+                        executor, settings, mtx, preconditioner, b, x,
+                        log_data);
+                });
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index b799c45bf33..c5eb3996926 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -51,21 +51,13 @@ class BatchBicgstab : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            if (prec == nullptr) {
-                auto identity =
-                    gko::batch::matrix::Identity<value_type>::create(
-                        executor, mtx->get_size());
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
-                    executor, settings, mtx, identity.get(), b, x, log_data);
-            } else {
-                gko::run<gko::batch::matrix::Identity<value_type>,
-                         gko::batch::preconditioner::Jacobi<value_type>>(
-                    prec, [&](auto preconditioner) {
-                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::
-                            apply(executor, settings, mtx, preconditioner, b, x,
-                                  log_data);
-                    });
-            }
+            gko::run<gko::batch::matrix::Identity<value_type>,
+                     gko::batch::preconditioner::Jacobi<value_type>>(
+                prec, [&](auto preconditioner) {
+                    gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply(
+                        executor, settings, mtx, preconditioner, b, x,
+                        log_data);
+                });
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 57685f44825..582f26ec497 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -49,21 +49,13 @@ class BatchCg : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            if (prec == nullptr) {
-                auto identity =
-                    gko::batch::matrix::Identity<value_type>::create(
-                        executor, mtx->get_size());
-                gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply(
-                    executor, settings, mtx, identity.get(), b, x, log_data);
-            } else {
-                gko::run<gko::batch::matrix::Identity<value_type>,
-                         gko::batch::preconditioner::Jacobi<value_type>>(
-                    prec, [&](auto preconditioner) {
-                        gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply(
-                            executor, settings, mtx, preconditioner, b, x,
-                            log_data);
-                    });
-            }
+            gko::run<gko::batch::matrix::Identity<value_type>,
+                     gko::batch::preconditioner::Jacobi<value_type>>(
+                prec, [&](auto preconditioner) {
+                    gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply(
+                        executor, settings, mtx, preconditioner, b, x,
+                        log_data);
+                });
         };
         solver_settings = Settings{max_iters, tol,
                                    gko::batch::stop::tolerance_type::relative};

From e342d3b34e30d04d3117e9e424c02f77479db09e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 22 Nov 2024 01:33:34 +0100
Subject: [PATCH 294/448] Revert "msys"

---
 .github/workflows/msys.yml | 94 --------------------------------------
 1 file changed, 94 deletions(-)
 delete mode 100644 .github/workflows/msys.yml

diff --git a/.github/workflows/msys.yml b/.github/workflows/msys.yml
deleted file mode 100644
index c980b885e2b..00000000000
--- a/.github/workflows/msys.yml
+++ /dev/null
@@ -1,94 +0,0 @@
-name: msys
-
-on:
-  push:
-
-jobs:
-  build:
-    strategy:
-      fail-fast: false
-      matrix:
-        include: [
-          { msystem: MINGW64, runner: windows-2022 },
-          { msystem: CLANG64, runner: windows-2022 },
-        ]
-    name: ${{ matrix.msystem }}
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - name: Get CPU Name
-        run : |
-          Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name
-      - name: Setup JIT minidump
-        if: ${{ matrix.msystem != 'CLANGARM64' }}
-        run: |
-          Set-Location '${{ runner.temp }}'
-          Invoke-WebRequest -Uri 'https://download.sysinternals.com/files/Procdump.zip' -OutFile Procdump.zip
-          Expand-Archive Procdump.zip -DestinationPath .
-          New-Item -Path '_dumps' -ItemType Directory
-          .\procdump64.exe -accepteula -ma -i "${{ runner.temp }}/_dumps"
-          .\procdump.exe -accepteula -ma -i "${{ runner.temp }}/_dumps"
-      - name: Configure Pagefile
-        if: ${{ matrix.msystem != 'CLANGARM64' }}
-        # https://github.com/al-cheb/configure-pagefile-action/issues/16
-        continue-on-error: true
-        uses: al-cheb/configure-pagefile-action@v1.4
-        with:
-          minimum-size: 4GB
-          maximum-size: 16GB
-          disk-root: "C:"
-
-      - name: Checkout the latest code (shallow clone)
-        uses: actions/checkout@v4
-        with: 
-          path: temp
-
-      # to match the autobuild environment
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          architecture: 'x64'
-
-      - uses: msys2/setup-msys2@v2
-        with:
-          msystem: ${{ matrix.msystem }}
-          install: git python base-devel
-          pacboy: >-
-            toolchain:p
-            cmake:p
-            ninja:p
-          update: true
-          release: ${{ runner.arch != 'ARM64' }}
-          location: 'D:\M'
-
-      - name: Add staging repo
-        shell: msys2 {0}
-        run: |
-          cp /etc/pacman.conf /etc/pacman.conf.bak
-          grep -qFx '[staging]' /etc/pacman.conf || sed -i '/^# \[staging\]/,/^$/ s|^# ||g' /etc/pacman.conf
-      - name: Update using staging
-        run: |
-          msys2 -c 'pacman --noconfirm -Suuy'
-          msys2 -c 'pacman --noconfirm -Suu'
-      - name: Move Checkout
-        run: |
-          If (Test-Path "C:\_") { rm -r -fo "C:\_" }
-          Copy-Item -Path ".\temp" -Destination "C:\_" -Recurse
-      - name: CI-Build
-        shell: msys2 {0}
-        id: build
-        run: |
-          cd /C/_
-          unset VCPKG_ROOT
-          mkdir build
-          cd build
-          cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF ..
-          ninja
-          ctest --output-on-failure
-      - name: "Clean up runner"
-        if: ${{ always() }}
-        continue-on-error: true
-        run: |
-          If (Test-Path "C:\_") { rm -r -fo "C:\_" }
-          msys2 -c 'mv -f /etc/pacman.conf.bak /etc/pacman.conf'
-          msys2 -c 'pacman --noconfirm -Suuy'
-          msys2 -c 'pacman --noconfirm -Suu'
\ No newline at end of file

From b0b1f9b3cbf66d7cb5fc3a5f0a802f9c41bb0cb7 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 15 Nov 2024 22:22:52 +0100
Subject: [PATCH 295/448] Enable file-config conditionally

+ It depends on nlohmann-json
---
 examples/CMakeLists.txt | 44 +++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 90c1f8e2632..1da769fdf3f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,6 @@ set(EXAMPLES_EXEC_LIST
 set(EXAMPLES_LIST
     ${EXAMPLES_EXEC_LIST}
     custom-stopping-criterion
-    file-config-solver
     ginkgo-overhead
     minimal-cuda-solver
     mixed-spmv
@@ -64,6 +63,15 @@ else()
     message(STATUS "No Kokkos found, disabling examples with Kokkos assembly.")
 endif()
 
+set(GKO_FILE_CONFIG_ENABLED 0)
+find_package(nlohmann_json 3.9.1 QUIET)
+if(nlohmann_json_FOUND)
+    set(GKO_FILE_CONFIG_ENABLED 1)
+    list(APPEND EXAMPLES_LIST file-config-solver)
+else()
+    message(STATUS "No nlohmann-json found, disabling file-config example")
+endif()
+
 foreach(example ${EXAMPLES_LIST})
     add_subdirectory(${example})
 endforeach()
@@ -106,20 +114,22 @@ if(GINKGO_BUILD_TESTS)
         endforeach()
     endforeach()
 
-    file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" file-config-solver/config/*.json)
-    foreach(config IN LISTS config_list)
-        get_filename_component(config_name "${config}" NAME_WE)
-        foreach(executor IN LISTS executors)
-            add_test(NAME example_file-config-solver_${config_name}_${executor} 
-                     COMMAND 
-                     "$<TARGET_FILE:file-config-solver>"
-                     "${executor}" "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/config/${config_name}.json"
-                     "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
-                     WORKING_DIRECTORY
-                     "$<TARGET_FILE_DIR:ginkgo>")
-            # Prevent performance issues with high core counts
-            set_property(TEST example_file-config-solver_${config_name}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
-        endforeach()    
-    endforeach()
-        
+    if(GKO_FILE_CONFIG_ENABLED)
+        file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" file-config-solver/config/*.json)
+        foreach(config IN LISTS config_list)
+            get_filename_component(config_name "${config}" NAME_WE)
+            foreach(executor IN LISTS executors)
+                add_test(NAME example_file-config-solver_${config_name}_${executor}
+                    COMMAND
+                    "$<TARGET_FILE:file-config-solver>"
+                    "${executor}" "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/config/${config_name}.json"
+                    "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
+                    WORKING_DIRECTORY
+                    "$<TARGET_FILE_DIR:ginkgo>")
+                # Prevent performance issues with high core counts
+                set_property(TEST example_file-config-solver_${config_name}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
+            endforeach()
+        endforeach()
+    endif()
+
 endif()

From e00b23c224578a658da106f86749e3b7d17df9e5 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 22 Nov 2024 15:12:51 +0100
Subject: [PATCH 296/448] move find_package to root CMakeLists.txt

---
 CMakeLists.txt          | 2 +-
 examples/CMakeLists.txt | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2dded88d122..b8275dab205 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,7 +246,7 @@ endif()
 if(GINKGO_BUILD_BENCHMARKS)
     find_package(gflags 2.2.2 QUIET)
 endif()
-if(GINKGO_BUILD_TESTS OR GINKGO_BUILD_BENCHMARKS)
+if(GINKGO_BUILD_TESTS OR GINKGO_BUILD_BENCHMARKS OR GINKGO_BUILD_EXAMPLES)
     find_package(nlohmann_json 3.9.1 QUIET)
 endif()
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 1da769fdf3f..d08f88a40cb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -64,7 +64,6 @@ else()
 endif()
 
 set(GKO_FILE_CONFIG_ENABLED 0)
-find_package(nlohmann_json 3.9.1 QUIET)
 if(nlohmann_json_FOUND)
     set(GKO_FILE_CONFIG_ENABLED 1)
     list(APPEND EXAMPLES_LIST file-config-solver)

From 47b427f2cdafef85bc36050d73d29789cf0f08b6 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 22 Nov 2024 15:27:32 +0100
Subject: [PATCH 297/448] always pull in json support with examples enabled

---
 examples/CMakeLists.txt    | 40 ++++++++++++++------------------------
 third_party/CMakeLists.txt |  2 +-
 2 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d08f88a40cb..5d8b9d9aa22 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ set(EXAMPLES_EXEC_LIST
 set(EXAMPLES_LIST
     ${EXAMPLES_EXEC_LIST}
     custom-stopping-criterion
+    file-config-solver
     ginkgo-overhead
     minimal-cuda-solver
     mixed-spmv
@@ -63,14 +64,6 @@ else()
     message(STATUS "No Kokkos found, disabling examples with Kokkos assembly.")
 endif()
 
-set(GKO_FILE_CONFIG_ENABLED 0)
-if(nlohmann_json_FOUND)
-    set(GKO_FILE_CONFIG_ENABLED 1)
-    list(APPEND EXAMPLES_LIST file-config-solver)
-else()
-    message(STATUS "No nlohmann-json found, disabling file-config example")
-endif()
-
 foreach(example ${EXAMPLES_LIST})
     add_subdirectory(${example})
 endforeach()
@@ -113,22 +106,19 @@ if(GINKGO_BUILD_TESTS)
         endforeach()
     endforeach()
 
-    if(GKO_FILE_CONFIG_ENABLED)
-        file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" file-config-solver/config/*.json)
-        foreach(config IN LISTS config_list)
-            get_filename_component(config_name "${config}" NAME_WE)
-            foreach(executor IN LISTS executors)
-                add_test(NAME example_file-config-solver_${config_name}_${executor}
-                    COMMAND
-                    "$<TARGET_FILE:file-config-solver>"
-                    "${executor}" "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/config/${config_name}.json"
-                    "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
-                    WORKING_DIRECTORY
-                    "$<TARGET_FILE_DIR:ginkgo>")
-                # Prevent performance issues with high core counts
-                set_property(TEST example_file-config-solver_${config_name}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
-            endforeach()
+    file(GLOB config_list RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" file-config-solver/config/*.json)
+    foreach(config IN LISTS config_list)
+        get_filename_component(config_name "${config}" NAME_WE)
+        foreach(executor IN LISTS executors)
+            add_test(NAME example_file-config-solver_${config_name}_${executor}
+                COMMAND
+                "$<TARGET_FILE:file-config-solver>"
+                "${executor}" "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/config/${config_name}.json"
+                "${CMAKE_CURRENT_SOURCE_DIR}/file-config-solver/data/A.mtx"
+                WORKING_DIRECTORY
+                "$<TARGET_FILE_DIR:ginkgo>")
+            # Prevent performance issues with high core counts
+            set_property(TEST example_file-config-solver_${config_name}_${executor} PROPERTY ENVIRONMENT OMP_NUM_THREADS=4)
         endforeach()
-    endif()
-
+    endforeach()
 endif()
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index cfba7759170..1977d8e7f68 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -13,7 +13,7 @@ if(GINKGO_BUILD_BENCHMARKS)
     endif()
 endif()
 
-if(GINKGO_BUILD_TESTS OR GINKGO_BUILD_BENCHMARKS)
+if(GINKGO_BUILD_TESTS OR GINKGO_BUILD_BENCHMARKS OR GINKGO_BUILD_EXAMPLES)
     if (NOT nlohmann_json_FOUND)
         add_subdirectory(nlohmann_json)
     endif()

From f91351907b987ec48e29942706576cbdf8ccad21 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 17:02:38 +0100
Subject: [PATCH 298/448] use compile flag internally for
 THRUST_CUB_WRAPPED_NAMESPACE

---
 cmake/create_test.cmake        | 4 ++--
 core/test/gtest/CMakeLists.txt | 4 ++--
 cuda/CMakeLists.txt            | 4 ++--
 hip/CMakeLists.txt             | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 9ab0c40de20..fb0eade6269 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -158,7 +158,7 @@ endfunction(ginkgo_create_cuda_test)
 ## Internal function allowing separate test name, filename and target name
 function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda THRUST_CUB_WRAPPED_NAMESPACE=gko)
     if(MSVC)
         target_compile_options(${test_target_name}
             PRIVATE
@@ -186,7 +186,7 @@ endfunction(ginkgo_create_hip_test)
 function(ginkgo_create_hip_test_internal test_name filename test_target_name)
     set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip THRUST_CUB_WRAPPED_NAMESPACE=gko)
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt
index f500ddb6ae5..468e4f2da81 100644
--- a/core/test/gtest/CMakeLists.txt
+++ b/core/test/gtest/CMakeLists.txt
@@ -28,10 +28,10 @@ if (GINKGO_BUILD_OMP)
     add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 if (GINKGO_BUILD_CUDA)
-    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
+    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda;THRUST_CUB_WRAPPED_NAMESPACE=gko")
 endif()
 if (GINKGO_BUILD_HIP)
-    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
+    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip;THRUST_CUB_WRAPPED_NAMESPACE=gko")
 endif()
 if (GINKGO_BUILD_SYCL)
     add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 5316c4c623c..883d3ff1efa 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -71,7 +71,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
 endif()
 
 ginkgo_compile_features(ginkgo_cuda)
-target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
+target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda THRUST_CUB_WRAPPED_NAMESPACE=gko)
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
@@ -84,7 +84,7 @@ ginkgo_default_includes(ginkgo_cuda)
 ginkgo_install_library(ginkgo_cuda)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
+    ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda;THRUST_CUB_WRAPPED_NAMESPACE=gko")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 68be287a722..3ebb465c53a 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -62,7 +62,7 @@ target_include_directories(ginkgo_hip
     PRIVATE
         ${CMAKE_CURRENT_BINARY_DIR} # for generated headers like jacobi_common.hip.hpp
         )
-target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
+target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip THRUST_CUB_WRAPPED_NAMESPACE=gko)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
 target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand roc::rocthrust)
@@ -81,7 +81,7 @@ ginkgo_default_includes(ginkgo_hip)
 ginkgo_install_library(ginkgo_hip)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
+    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip;THRUST_CUB_WRAPPED_NAMESPACE=gko")
 endif()
 
 if(GINKGO_BUILD_TESTS)

From d505024e4e058509d782f798617666c61a2773b8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 18:14:33 +0100
Subject: [PATCH 299/448] avoid exposing thrust to cpp files

---
 .../factorization/factorization_kernels.cpp   |  1 +
 common/cuda_hip/reorder/rcm_kernels.cpp       |  1 +
 common/cuda_hip/solver/multigrid_kernels.cpp  |  1 +
 common/unified/base/kernel_launch.hpp         |  2 +
 cuda/base/config.hpp                          |  3 +-
 cuda/base/cublas_bindings.hpp                 | 18 -------
 cuda/base/cublas_handle.hpp                   | 43 +++++++++++++++
 cuda/base/executor.cpp                        |  7 ++-
 hip/base/config.hip.hpp                       |  2 +-
 hip/base/executor.hip.cpp                     |  4 +-
 hip/base/hipblas_bindings.hip.hpp             | 18 -------
 hip/base/hipblas_handle.hpp                   | 48 +++++++++++++++++
 hip/base/hipsparse_bindings.hip.hpp           | 18 -------
 hip/base/hipsparse_handle.hpp                 | 53 +++++++++++++++++++
 14 files changed, 156 insertions(+), 63 deletions(-)
 create mode 100644 cuda/base/cublas_handle.hpp
 create mode 100644 hip/base/hipblas_handle.hpp
 create mode 100644 hip/base/hipsparse_handle.hpp

diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index 36f2f6eb4c5..f26ef668d34 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -7,6 +7,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp
index 4315a9ed702..75050d3e977 100644
--- a/common/cuda_hip/reorder/rcm_kernels.cpp
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -21,6 +21,7 @@
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp
index b9e411bd5f8..9b22e457203 100644
--- a/common/cuda_hip/solver/multigrid_kernels.cpp
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 455d3d67a6d..d4810e1aa95 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -17,6 +17,7 @@
 #if defined(GKO_COMPILING_CUDA)
 
 #define GKO_KERNEL __device__
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
 
@@ -43,6 +44,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 #elif defined(GKO_COMPILING_HIP)
 
 #define GKO_KERNEL __device__
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
 
diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp
index fe280c76dec..f89cb0702f6 100644
--- a/cuda/base/config.hpp
+++ b/cuda/base/config.hpp
@@ -6,10 +6,9 @@
 #define GKO_CUDA_BASE_CONFIG_HPP_
 
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-#include "common/cuda_hip/base/math.hpp"
-
 
 namespace gko {
 namespace kernels {
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index ae5e66b6448..9a8b4070b03 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -10,7 +10,6 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
 
@@ -229,23 +228,6 @@ GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);
 #undef GKO_BIND_CUBLAS_NORM2
 
 
-inline cublasHandle_t init(cudaStream_t stream)
-{
-    cublasHandle_t handle;
-    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasCreate(&handle));
-    GKO_ASSERT_NO_CUBLAS_ERRORS(
-        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
-    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasSetStream(handle, stream));
-    return handle;
-}
-
-
-inline void destroy(cublasHandle_t handle)
-{
-    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasDestroy(handle));
-}
-
-
 }  // namespace cublas
 
 
diff --git a/cuda/base/cublas_handle.hpp b/cuda/base/cublas_handle.hpp
new file mode 100644
index 00000000000..eb52a62975c
--- /dev/null
+++ b/cuda/base/cublas_handle.hpp
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CUDA_BASE_CUBLAS_HANDLE_HPP_
+#define GKO_CUDA_BASE_CUBLAS_HANDLE_HPP_
+
+
+#include <cublas_v2.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace cublas {
+
+
+inline cublasHandle_t init(cudaStream_t stream)
+{
+    cublasHandle_t handle;
+    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasCreate(&handle));
+    GKO_ASSERT_NO_CUBLAS_ERRORS(
+        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
+    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasSetStream(handle, stream));
+    return handle;
+}
+
+
+inline void destroy(cublasHandle_t handle)
+{
+    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasDestroy(handle));
+}
+
+
+}  // namespace cublas
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_BASE_CUBLAS_HANDLE_HPP_
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index caf5269fa3d..8380eddcf1b 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -9,6 +9,7 @@
 #include <thread>
 
 #include <cuda_runtime.h>
+#include <cublas_v2.h>
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
@@ -17,7 +18,8 @@
 #include <ginkgo/core/base/memory.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
+#include "common/cuda_hip/base/executor.hpp.inc"
+#include "cuda/base/cublas_handle.hpp"
 #include "cuda/base/cusparse_handle.hpp"
 #include "cuda/base/device.hpp"
 #include "cuda/base/scoped_device_id.hpp"
@@ -26,9 +28,6 @@
 namespace gko {
 
 
-#include "common/cuda_hip/base/executor.hpp.inc"
-
-
 std::unique_ptr<CudaAllocatorBase> cuda_allocator_from_mode(
     int device_id, allocation_mode mode)
 {
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index 114eb2f0f0a..832b750f0fd 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -6,9 +6,9 @@
 #define GKO_HIP_BASE_CONFIG_HIP_HPP_
 
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
-#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index d4b1d614681..769d650d984 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -13,8 +13,8 @@
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/device.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/hipblas_handle.hpp"
+#include "hip/base/hipsparse_handle.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index 4641b64277d..04c1610c0cc 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -240,24 +240,6 @@ GKO_BIND_HIPBLAS_NORM2(ValueType, detail::not_implemented);
 #undef GKO_BIND_HIPBLAS_NORM2
 
 
-inline hipblasContext* init(hipStream_t stream)
-{
-    hipblasHandle_t handle;
-    GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasCreate(&handle));
-    GKO_ASSERT_NO_HIPBLAS_ERRORS(
-        hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE));
-    GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasSetStream(handle, stream));
-    return reinterpret_cast<hipblasContext*>(handle);
-}
-
-
-inline void destroy_hipblas_handle(hipblasContext* handle)
-{
-    GKO_ASSERT_NO_HIPBLAS_ERRORS(
-        hipblasDestroy(reinterpret_cast<hipblasHandle_t>(handle)));
-}
-
-
 }  // namespace hipblas
 
 
diff --git a/hip/base/hipblas_handle.hpp b/hip/base/hipblas_handle.hpp
new file mode 100644
index 00000000000..86441124903
--- /dev/null
+++ b/hip/base/hipblas_handle.hpp
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_BASE_HIPBLAS_HANDLE_HPP_
+#define GKO_HIP_BASE_HIPBLAS_HANDLE_HPP_
+
+
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#else
+#include <hipblas.h>
+#endif
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace hipblas {
+
+
+inline hipblasContext* init(hipStream_t stream)
+{
+    hipblasHandle_t handle;
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasCreate(&handle));
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(
+        hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE));
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasSetStream(handle, stream));
+    return reinterpret_cast<hipblasContext*>(handle);
+}
+
+
+inline void destroy_hipblas_handle(hipblasContext* handle)
+{
+    GKO_ASSERT_NO_HIPBLAS_ERRORS(
+        hipblasDestroy(reinterpret_cast<hipblasHandle_t>(handle)));
+}
+
+
+}  // namespace hipblas
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_HIPBLAS_HANDLE_HPP_
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index af01f9dc94a..5fbabc006ff 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -577,24 +577,6 @@ GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(ValueType, detail::not_implemented);
 #undef GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE
 
 
-inline hipsparseContext* init(hipStream_t stream)
-{
-    hipsparseHandle_t handle{};
-    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreate(&handle));
-    GKO_ASSERT_NO_HIPSPARSE_ERRORS(
-        hipsparseSetPointerMode(handle, HIPSPARSE_POINTER_MODE_DEVICE));
-    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetStream(handle, stream));
-    return reinterpret_cast<hipsparseContext*>(handle);
-}
-
-
-inline void destroy_hipsparse_handle(hipsparseContext* handle)
-{
-    GKO_ASSERT_NO_HIPSPARSE_ERRORS(
-        hipsparseDestroy(reinterpret_cast<hipsparseHandle_t>(handle)));
-}
-
-
 inline hipsparseMatDescr_t create_mat_descr()
 {
     hipsparseMatDescr_t descr{};
diff --git a/hip/base/hipsparse_handle.hpp b/hip/base/hipsparse_handle.hpp
new file mode 100644
index 00000000000..eabe1931a6d
--- /dev/null
+++ b/hip/base/hipsparse_handle.hpp
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_BASE_HIPSPARSE_HANDLE_HPP_
+#define GKO_HIP_BASE_HIPSPARSE_HANDLE_HPP_
+
+
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
+#include <hipsparse.h>
+#endif
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The HIPSPARSE namespace.
+ *
+ * @ingroup hipsparse
+ */
+namespace hipsparse {
+
+
+inline hipsparseContext* init(hipStream_t stream)
+{
+    hipsparseHandle_t handle{};
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreate(&handle));
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+        hipsparseSetPointerMode(handle, HIPSPARSE_POINTER_MODE_DEVICE));
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetStream(handle, stream));
+    return reinterpret_cast<hipsparseContext*>(handle);
+}
+
+
+inline void destroy_hipsparse_handle(hipsparseContext* handle)
+{
+    GKO_ASSERT_NO_HIPSPARSE_ERRORS(
+        hipsparseDestroy(reinterpret_cast<hipsparseHandle_t>(handle)));
+}
+
+
+}  // namespace hipsparse
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_HIPSPARSE_HANDLE_HPP_

From 4c46df1e9f89ce33fcb119ff0f5b0f118365f461 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 11:05:52 +0100
Subject: [PATCH 300/448] adapt the test with custom thrust/cub namespace

---
 cuda/test/base/math.cu                         |  5 +++--
 cuda/test/components/cooperative_groups.cu     |  3 ++-
 hip/test/base/math.hip.cpp                     |  5 +++--
 hip/test/components/cooperative_groups.hip.cpp | 10 ++++++----
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index 71532b45e80..d1d9373b0ef 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -18,7 +18,8 @@
 #include "cuda/test/utils.hpp"
 
 
-namespace {
+// put the test in gko namespace to easily adapt the thrust/cub in gko or not
+namespace gko {
 namespace kernel {
 
 
@@ -120,4 +121,4 @@ TEST_F(IsFinite, DoubleComplex)
 }
 
 
-}  // namespace
+}  // namespace gko
diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index 0b384cd704e..b12d8bb7e4a 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -73,7 +73,8 @@ __global__ void cg_shuffle(bool* s)
         group::tiled_partition<config::warp_size>(group::this_thread_block());
     auto i = int(group.thread_rank());
     test_assert(s, group.shfl_up(i, 1) == max(0, i - 1));
-    test_assert(s, group.shfl_down(i, 1) == min(i + 1, config::warp_size - 1));
+    test_assert(s, group.shfl_down(i, 1) ==
+                       min(i + 1, static_cast<int>(config::warp_size) - 1));
     test_assert(s, group.shfl(i, 0) == 0);
 }
 
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 01fb96afa7c..f01b56739d9 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -24,7 +24,8 @@
 #include "hip/test/utils.hip.hpp"
 
 
-namespace {
+// put the test in gko namespace to easily adapt the thrust/cub in gko or not
+namespace gko {
 namespace kernel {
 
 
@@ -126,4 +127,4 @@ TEST_F(IsFinite, DoubleComplex)
 }
 
 
-}  // namespace
+}  // namespace gko
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index bd8c79b9849..fd3480d322c 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -22,10 +22,11 @@
 #include "hip/test/utils.hip.hpp"
 
 
-namespace {
+// put the test in gko namespace to easily adapt the thrust/cub in gko or not
+namespace gko {
 
 
-using namespace gko::kernels::hip;
+using namespace kernels::hip;
 
 
 class CooperativeGroups : public HipTestFixture {
@@ -80,7 +81,8 @@ __global__ void cg_shuffle(bool* s)
         group::tiled_partition<config::warp_size>(group::this_thread_block());
     auto i = int(group.thread_rank());
     test_assert(s, group.shfl_up(i, 1) == max(0, i - 1));
-    test_assert(s, group.shfl_down(i, 1) == min(i + 1, config::warp_size - 1));
+    test_assert(s, group.shfl_down(i, 1) ==
+                       min(i + 1, static_cast<int>(config::warp_size) - 1));
     test_assert(s, group.shfl(i, 0) == 0);
 }
 
@@ -337,4 +339,4 @@ TEST_F(CooperativeGroups, ShuffleSumComplexDouble)
 }
 
 
-}  // namespace
+}  // namespace gko

From 14447d713537c1b9eb1ea25dfe598c5c958ec9c9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 11:49:46 +0100
Subject: [PATCH 301/448] add GINKGO_CUSTOM_THRUST_NAMESPACE option

---
 .gitlab/scripts.yml            |  2 ++
 .gitlab/variables.yml          |  1 +
 CMakeLists.txt                 |  1 +
 INSTALL.md                     |  3 +++
 cmake/create_test.cmake        | 10 ++++++++--
 cmake/get_info.cmake           |  2 +-
 core/test/gtest/CMakeLists.txt |  4 ++--
 cuda/CMakeLists.txt            | 13 ++++++++++---
 hip/CMakeLists.txt             | 13 ++++++++++---
 9 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index dda127ff535..3ed1a9d75a4 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -44,6 +44,7 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
+        -DGINKGO_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
@@ -87,6 +88,7 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
+        -DGINKGO_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml
index 4fd7fc338eb..b8f20de50e8 100644
--- a/.gitlab/variables.yml
+++ b/.gitlab/variables.yml
@@ -11,6 +11,7 @@
     BUILD_CUDA: "OFF"
     BUILD_HIP: "OFF"
     BUILD_SYCL: "OFF"
+    CUSTOM_THRUST_NAMESPACE: "ON"
     BUILD_HWLOC: "ON"
     BUILD_PAPI_SDE: "OFF"
     BUILD_MPI: "OFF"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b8275dab205..9035eae8f0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
+option(GINKGO_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub to avoid potential break when the other libraries also uses thrust" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
diff --git a/INSTALL.md b/INSTALL.md
index 9719bdfb920..f436b6eef4a 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -53,6 +53,9 @@ Ginkgo adds the following additional switches to control what is being built:
     `OFF` otherwise.
 *   `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler.
     If empty, compiler chooses based on the available GPUs.
+*   `-DGINKGO_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in internal Ginkgo.
+    Default is `OFF`. If encountering some weird issues only when using Ginkgo with other libraries using thrust or cub,
+    enabling this option may help.
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index fb0eade6269..3b4874c306f 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -158,7 +158,10 @@ endfunction(ginkgo_create_cuda_test)
 ## Internal function allowing separate test name, filename and target name
 function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
+    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+        target_compile_definitions(${test_target_name} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    endif()
     if(MSVC)
         target_compile_options(${test_target_name}
             PRIVATE
@@ -186,7 +189,10 @@ endfunction(ginkgo_create_hip_test)
 function(ginkgo_create_hip_test_internal test_name filename test_target_name)
     set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
+    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+        target_compile_definitions(${test_target_name} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    endif()
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 63f43c645f0..e9ef299c1e5 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -130,7 +130,7 @@ foreach(log_type ${log_types})
         "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
     ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
+        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_CUSTOM_THRUST_NAMESPACE")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt
index 468e4f2da81..f500ddb6ae5 100644
--- a/core/test/gtest/CMakeLists.txt
+++ b/core/test/gtest/CMakeLists.txt
@@ -28,10 +28,10 @@ if (GINKGO_BUILD_OMP)
     add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 if (GINKGO_BUILD_CUDA)
-    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda;THRUST_CUB_WRAPPED_NAMESPACE=gko")
+    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
 endif()
 if (GINKGO_BUILD_HIP)
-    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip;THRUST_CUB_WRAPPED_NAMESPACE=gko")
+    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 if (GINKGO_BUILD_SYCL)
     add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 883d3ff1efa..3766a1cce03 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -71,7 +71,10 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
 endif()
 
 ginkgo_compile_features(ginkgo_cuda)
-target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda THRUST_CUB_WRAPPED_NAMESPACE=gko)
+target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
+if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    target_compile_definitions(ginkgo_cuda PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+endif()
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
@@ -83,8 +86,12 @@ target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
 ginkgo_default_includes(ginkgo_cuda)
 ginkgo_install_library(ginkgo_cuda)
 
-if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda;THRUST_CUB_WRAPPED_NAMESPACE=gko")
+if(GINKGO_CHECK_CIRCULAR_DEPS)
+    set(check_header_def "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
+    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+        set(check_header_def "${check_header_def};THRUST_CUB_WRAPPED_NAMESPACE=gko")
+    endif()
+    ginkgo_check_headers(ginkgo_cuda "${check_header_def}")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 3ebb465c53a..224e25dacde 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -62,7 +62,10 @@ target_include_directories(ginkgo_hip
     PRIVATE
         ${CMAKE_CURRENT_BINARY_DIR} # for generated headers like jacobi_common.hip.hpp
         )
-target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip THRUST_CUB_WRAPPED_NAMESPACE=gko)
+target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
+if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    target_compile_definitions(ginkgo_hip PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+endif()
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
 target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand roc::rocthrust)
@@ -80,8 +83,12 @@ ginkgo_compile_features(ginkgo_hip)
 ginkgo_default_includes(ginkgo_hip)
 ginkgo_install_library(ginkgo_hip)
 
-if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip;THRUST_CUB_WRAPPED_NAMESPACE=gko")
+if(GINKGO_CHECK_CIRCULAR_DEPS)
+    set(check_header_def "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
+    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+        set(check_header_def "${check_header_def};THRUST_CUB_WRAPPED_NAMESPACE=gko")
+    endif()
+    ginkgo_check_headers(ginkgo_hip "${check_header_def}")
 endif()
 
 if(GINKGO_BUILD_TESTS)

From 0170c0bab4a0227fa917705929938b3186821c4d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 18:41:59 +0100
Subject: [PATCH 302/448] split custom_thrust_namespace to cuda/hip and disable
 them if no-effect or not-full-support

---
 .gitlab/scripts.yml     |  6 ++++--
 CMakeLists.txt          | 13 ++++++++++++-
 INSTALL.md              |  5 ++++-
 cmake/create_test.cmake |  4 ++--
 cmake/get_info.cmake    |  2 +-
 cuda/CMakeLists.txt     |  4 ++--
 cuda/get_info.cmake     |  1 +
 hip/CMakeLists.txt      |  4 ++--
 hip/get_info.cmake      |  1 +
 9 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 3ed1a9d75a4..2f9705aacda 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -44,7 +44,8 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
-        -DGINKGO_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
+        -DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
+        -DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
@@ -88,7 +89,8 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
-        -DGINKGO_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
+        -DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
+        -DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9035eae8f0e..77185b39670 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,8 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
-option(GINKGO_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub to avoid potential break when the other libraries also uses thrust" OFF)
+option(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub in cuda to avoid potential break when the other libraries also uses thrust" OFF)
+option(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust in hip to avoid potential break when the other libraries also uses thrust" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
@@ -73,9 +74,19 @@ gko_rename_cache(GINKGO_CUDA_COMPILER_FLAGS CMAKE_CUDA_FLAGS BOOL "Flags used by
 # load executor-specific configuration
 if(GINKGO_BUILD_CUDA)
     include(cmake/cuda.cmake)
+    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE AND CUDAToolkit_VERSION VERSION_LESS 11.6)
+        message(STATUS "We disable custom thrust namespace for cuda before 11.6 because it has no effect in the thrust shipped by cuda before 11.6")
+        set(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE OFF CACHE BOOL "Add custom namespace to thrust and cub in cuda to avoid potential break when the other libraries also uses thrust" FORCE)
+    endif()
 endif()
 if(GINKGO_BUILD_HIP)
     include(cmake/hip.cmake)
+    if(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE AND GINKGO_HIP_PLATFORM_AMD AND GINKGO_HIP_VERSION VERSION_LESS 5.7)
+        # Hip allow custom namespace but does not fully make everything in the custom namespace before rocm-5.7
+        # more specific pr: https://github.com/ROCm/rocThrust/pull/286
+        message(STATUS "We disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7")
+        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF CACHE BOOL "Add custom namespace to thrust in hip to avoid potential break when the other libraries also uses thrust" FORCE)
+    endif()
 endif()
 if(GINKGO_BUILD_SYCL)
     include(cmake/sycl.cmake)
diff --git a/INSTALL.md b/INSTALL.md
index f436b6eef4a..4b47f0f9d26 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -53,7 +53,10 @@ Ginkgo adds the following additional switches to control what is being built:
     `OFF` otherwise.
 *   `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler.
     If empty, compiler chooses based on the available GPUs.
-*   `-DGINKGO_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in internal Ginkgo.
+*   `-DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in Ginkgo cuda.
+    Default is `OFF`. If encountering some weird issues only when using Ginkgo with other libraries using thrust or cub,
+    enabling this option may help.
+*   `-DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust in Ginkgo hip.
     Default is `OFF`. If encountering some weird issues only when using Ginkgo with other libraries using thrust or cub,
     enabling this option may help.
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 3b4874c306f..20f074778a1 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -159,7 +159,7 @@ endfunction(ginkgo_create_cuda_test)
 function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     add_executable(${test_target_name} ${filename})
     target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
-    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
         target_compile_definitions(${test_target_name} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
     endif()
     if(MSVC)
@@ -190,7 +190,7 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name)
     set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
     add_executable(${test_target_name} ${filename})
     target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
-    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    if(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE)
         target_compile_definitions(${test_target_name} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
     endif()
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index e9ef299c1e5..63f43c645f0 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -130,7 +130,7 @@ foreach(log_type ${log_types})
         "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
     ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_CUSTOM_THRUST_NAMESPACE")
+        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 3766a1cce03..1cdc582ded6 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -72,7 +72,7 @@ endif()
 
 ginkgo_compile_features(ginkgo_cuda)
 target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
-if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
     target_compile_definitions(ginkgo_cuda PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
 endif()
 
@@ -88,7 +88,7 @@ ginkgo_install_library(ginkgo_cuda)
 
 if(GINKGO_CHECK_CIRCULAR_DEPS)
     set(check_header_def "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
-    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
         set(check_header_def "${check_header_def};THRUST_CUB_WRAPPED_NAMESPACE=gko")
     endif()
     ginkgo_check_headers(ginkgo_cuda "${check_header_def}")
diff --git a/cuda/get_info.cmake b/cuda/get_info.cmake
index 3c00b05e0b0..3347630ee14 100644
--- a/cuda/get_info.cmake
+++ b/cuda/get_info.cmake
@@ -1,5 +1,6 @@
 ginkgo_print_module_header(${detailed_log} "CUDA")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_ARCHITECTURES")
+ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE")
 ginkgo_print_module_footer(${detailed_log} "CUDA variables:")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER_VERSION")
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 224e25dacde..e3e61e5a81c 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -63,7 +63,7 @@ target_include_directories(ginkgo_hip
         ${CMAKE_CURRENT_BINARY_DIR} # for generated headers like jacobi_common.hip.hpp
         )
 target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
-if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+if(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE)
     target_compile_definitions(ginkgo_hip PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
 endif()
 
@@ -85,7 +85,7 @@ ginkgo_install_library(ginkgo_hip)
 
 if(GINKGO_CHECK_CIRCULAR_DEPS)
     set(check_header_def "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
-    if(GINKGO_CUSTOM_THRUST_NAMESPACE)
+    if(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE)
         set(check_header_def "${check_header_def};THRUST_CUB_WRAPPED_NAMESPACE=gko")
     endif()
     ginkgo_check_headers(ginkgo_hip "${check_header_def}")
diff --git a/hip/get_info.cmake b/hip/get_info.cmake
index 14a770234fa..2b94e89cbf7 100644
--- a/hip/get_info.cmake
+++ b/hip/get_info.cmake
@@ -1,4 +1,5 @@
 ginkgo_print_module_header(${detailed_log} "HIP")
+ginkgo_print_variable(${detailed_log} "GINKGO_HIP_CUSTOM_THRUST_NAMESPACE")
 ginkgo_print_module_footer(${detailed_log} "HIP variables:")
 ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS")
 ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER")

From caaed3cb8d74cb3780fe8dc75fc713c48af1e5df Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 22 Nov 2024 14:57:16 +0100
Subject: [PATCH 303/448] enable the option by default and update documentation

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
Co-authored-by: Natalie Beams <246972+nbeams@users.noreply.github.com>
---
 CMakeLists.txt |  4 ++--
 INSTALL.md     | 10 ++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77185b39670..ea7fae42169 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,8 +32,8 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
-option(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub in cuda to avoid potential break when the other libraries also uses thrust" OFF)
-option(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust in hip to avoid potential break when the other libraries also uses thrust" OFF)
+option(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub in cuda to avoid potential conflicts when other libraries also use thrust" ON)
+option(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust in hip to avoid potential conflicts when other libraries also use thrust" ON)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
diff --git a/INSTALL.md b/INSTALL.md
index 4b47f0f9d26..6dbac186131 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -53,12 +53,10 @@ Ginkgo adds the following additional switches to control what is being built:
     `OFF` otherwise.
 *   `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler.
     If empty, compiler chooses based on the available GPUs.
-*   `-DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in Ginkgo cuda.
-    Default is `OFF`. If encountering some weird issues only when using Ginkgo with other libraries using thrust or cub,
-    enabling this option may help.
-*   `-DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust in Ginkgo hip.
-    Default is `OFF`. If encountering some weird issues only when using Ginkgo with other libraries using thrust or cub,
-    enabling this option may help.
+*   `-DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in Ginkgo CUDA.
+    Default is `ON`. It avoids the potential conflicts from thrust when other libraries also use thrust.
+*   `-DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust in Ginkgo HIP.
+    Default is `ON`. It avoids the potential conflicts from thrust when other libraries also use thrust.
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.

From 59391094106408fa3e0ebc0a9b2d5b5010322732 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 26 Nov 2024 11:35:44 +0100
Subject: [PATCH 304/448] do not expose the custom namespace option to user

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 .gitlab/scripts.yml   |  4 ----
 .gitlab/variables.yml |  1 -
 CMakeLists.txt        | 18 +++++++++++-------
 INSTALL.md            |  4 ----
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 2f9705aacda..dda127ff535 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -44,8 +44,6 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
-        -DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
-        -DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
@@ -89,8 +87,6 @@
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
         -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
-        -DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
-        -DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={CUSTOM_THRUST_NAMESPACE}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml
index b8f20de50e8..4fd7fc338eb 100644
--- a/.gitlab/variables.yml
+++ b/.gitlab/variables.yml
@@ -11,7 +11,6 @@
     BUILD_CUDA: "OFF"
     BUILD_HIP: "OFF"
     BUILD_SYCL: "OFF"
-    CUSTOM_THRUST_NAMESPACE: "ON"
     BUILD_HWLOC: "ON"
     BUILD_PAPI_SDE: "OFF"
     BUILD_MPI: "OFF"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea7fae42169..092c0ee9f3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,8 +32,6 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
-option(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust and cub in cuda to avoid potential conflicts when other libraries also use thrust" ON)
-option(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE "Add custom namespace to thrust in hip to avoid potential conflicts when other libraries also use thrust" ON)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
@@ -75,17 +73,23 @@ gko_rename_cache(GINKGO_CUDA_COMPILER_FLAGS CMAKE_CUDA_FLAGS BOOL "Flags used by
 if(GINKGO_BUILD_CUDA)
     include(cmake/cuda.cmake)
     if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE AND CUDAToolkit_VERSION VERSION_LESS 11.6)
-        message(STATUS "We disable custom thrust namespace for cuda before 11.6 because it has no effect in the thrust shipped by cuda before 11.6")
-        set(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE OFF CACHE BOOL "Add custom namespace to thrust and cub in cuda to avoid potential break when the other libraries also uses thrust" FORCE)
+        message(STATUS "Disable custom thrust namespace for cuda before 11.6 because it has no effect in the thrust shipped by cuda before 11.6")
+        set(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE OFF)
+    else()
+        message(STATUS "Enable custom thrust namespace for cuda")
+        set(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE ON)
     endif()
 endif()
 if(GINKGO_BUILD_HIP)
     include(cmake/hip.cmake)
-    if(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE AND GINKGO_HIP_PLATFORM_AMD AND GINKGO_HIP_VERSION VERSION_LESS 5.7)
+    if(GINKGO_HIP_PLATFORM_AMD AND GINKGO_HIP_VERSION VERSION_LESS 5.7)
         # Hip allow custom namespace but does not fully make everything in the custom namespace before rocm-5.7
         # more specific pr: https://github.com/ROCm/rocThrust/pull/286
-        message(STATUS "We disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7")
-        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF CACHE BOOL "Add custom namespace to thrust in hip to avoid potential break when the other libraries also uses thrust" FORCE)
+        message(STATUS "Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7")
+        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
+    else()
+        message(STATUS "Enable custom thrust namespace for hip")
+        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)
     endif()
 endif()
 if(GINKGO_BUILD_SYCL)
diff --git a/INSTALL.md b/INSTALL.md
index 6dbac186131..9719bdfb920 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -53,10 +53,6 @@ Ginkgo adds the following additional switches to control what is being built:
     `OFF` otherwise.
 *   `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler.
     If empty, compiler chooses based on the available GPUs.
-*   `-DGINKGO_CUDA_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust and the underlying cub in Ginkgo CUDA.
-    Default is `ON`. It avoids the potential conflicts from thrust when other libraries also use thrust.
-*   `-DGINKGO_HIP_CUSTOM_THRUST_NAMESPACE={ON, OFF}` adds custom namespace to thrust in Ginkgo HIP.
-    Default is `ON`. It avoids the potential conflicts from thrust when other libraries also use thrust.
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.

From a5fdfa905a7dda54fb9f293453e726cb21b415ab Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 16:36:23 +0100
Subject: [PATCH 305/448] update cuda arch for list and selector

---
 cmake/Modules/CudaArchitectureSelector.cmake | 12 +++++++-----
 common/cuda_hip/base/executor.hpp.inc        |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake
index f6295d11e06..dd9b5e64e4a 100644
--- a/cmake/Modules/CudaArchitectureSelector.cmake
+++ b/cmake/Modules/CudaArchitectureSelector.cmake
@@ -68,10 +68,10 @@
 #
 # GPU generation name:
 #   Has to be one of the strings ``Tesla``, ``Fermi``, ``Kepler``, ``Maxwell``,
-#   ``Pascal``, ``Volta``, ``Turing``, ``Ampere``. Specifying one of the strings
-#   will add flags for the generation of CUBIN code for all architectures
-#   belonging to that GPU generation (except the ones listed in the
-#   ``UNSUPPORTED`` list).
+#   ``Pascal``, ``Volta``, ``Turing``, ``Ampere``, ``Ada``, ``Hopper``.
+#   Specifying one of the strings will add flags for the generation of CUBIN
+#   code for all architectures belonging to that GPU generation (except the
+#   ones listed in the ``UNSUPPORTED`` list).
 #
 # Virtual and physical architecture specification:
 #   A string of the form ``XX(YY)``, where ``XX`` is the identifier of the
@@ -240,7 +240,9 @@ function(cas_get_architectures_by_name name output)
     set( pascal_version 6)
     set(  volta_version "7(0|2)")
     set( turing_version 75)
-    set( ampere_version 8)
+    set( ampere_version "8(0|6|7)")
+    set(    ada_version 89)
+    set( hopper_version 9)
     string(TOLOWER ${name} lower_name)
     if(NOT DEFINED ${lower_name}_version)
         message(FATAL_ERROR "${name} is not a valid GPU generation name")
diff --git a/common/cuda_hip/base/executor.hpp.inc b/common/cuda_hip/base/executor.hpp.inc
index 3c6a5275b0d..a478f74c840 100644
--- a/common/cuda_hip/base/executor.hpp.inc
+++ b/common/cuda_hip/base/executor.hpp.inc
@@ -34,6 +34,7 @@ inline int convert_sm_ver_to_cores(int major, int minor)
         {0x80, 64},   // Ampere Generation (SM 8.0) GA100 class
         {0x86, 128},  // Ampere Generation (SM 8.6)
         {0x87, 128},  // Ampere Generation (SM 8.7)
+        {0x89, 128},  // Ada Generation (SM 8.9)
         {0x90, 128},  // Hopper Generation (SM 9.0)
         {-1, -1}};
 

From e986f76d721a94b74fe3db1e7b112aab56e604d8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Nov 2024 21:29:01 +0100
Subject: [PATCH 306/448] missing change from custom thrust namespace

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 092c0ee9f3e..1f351038c92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,7 +72,7 @@ gko_rename_cache(GINKGO_CUDA_COMPILER_FLAGS CMAKE_CUDA_FLAGS BOOL "Flags used by
 # load executor-specific configuration
 if(GINKGO_BUILD_CUDA)
     include(cmake/cuda.cmake)
-    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE AND CUDAToolkit_VERSION VERSION_LESS 11.6)
+    if(CUDAToolkit_VERSION VERSION_LESS 11.6)
         message(STATUS "Disable custom thrust namespace for cuda before 11.6 because it has no effect in the thrust shipped by cuda before 11.6")
         set(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE OFF)
     else()

From 61d44bdafc91838fff4c96b0f6c2ee476374362e Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 27 Nov 2024 15:43:36 +0100
Subject: [PATCH 307/448] [ci] disable threadsanitizer temporarily

In its current state the threadsanitizer CI job has the following issues:
- It is slow: a run can take more than 10 hours. Running it locally is much faster, so some configuration might be wrong.
- It reports false positives: many tests fail with either broken stack traces or output that cannot be reproduced locally.
---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b0209e67dc5..74c50322865 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -599,6 +599,7 @@ threadsanitizer:
     - .deploy_condition
     - .before_script_template
     - .use_gko-rocm514-nompi-gnu11-llvm11
+    - .disable_job_condition
   script:
     - LD_PRELOAD=/usr/local/lib/libomp.so
       CC=clang CXX=clang++

From 74fbd02ffd5e8d3e73395a60fa382a3dc91f6010 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Thu, 11 Jul 2024 16:41:43 +0000
Subject: [PATCH 308/448] Add overlap communication to read_distributed

---
 .../cuda_hip/distributed/matrix_kernels.cpp   |  28 ++++
 core/device_hooks/common_kernels.inc.cpp      |   3 +
 core/distributed/matrix.cpp                   | 123 ++++++++++++++++--
 core/distributed/matrix_kernels.hpp           |  31 +++++
 dpcpp/distributed/matrix_kernels.dp.cpp       |  28 ++++
 include/ginkgo/core/distributed/matrix.hpp    |  24 +++-
 omp/distributed/matrix_kernels.cpp            |  28 ++++
 reference/distributed/matrix_kernels.cpp      |  62 +++++++++
 test/mpi/matrix.cpp                           |  53 +++++++-
 9 files changed, 363 insertions(+), 17 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 88988febbb0..70824bf03d9 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -49,6 +49,34 @@ struct input_type {
 };
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 290b5afd907..6870633c900 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -283,6 +283,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 namespace distributed_matrix {
 
 
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 63f359cc40a..a64a07619b3 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,6 +4,9 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
+#include <numeric>
+#include <vector>
+
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -11,6 +14,7 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/distributed/matrix_kernels.hpp"
+#include "ginkgo/core/base/mtx_io.hpp"
 
 
 namespace gko {
@@ -20,6 +24,10 @@ namespace matrix {
 namespace {
 
 
+GKO_REGISTER_OPERATION(count_overlap_entries,
+                       distributed_matrix::count_overlap_entries);
+GKO_REGISTER_OPERATION(fill_overlap_send_buffers,
+                       distributed_matrix::fill_overlap_send_buffers);
 GKO_REGISTER_OPERATION(separate_local_nonlocal,
                        distributed_matrix::separate_local_nonlocal);
 
@@ -243,7 +251,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
+        col_partition,
+    assembly assembly_type)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -252,14 +261,105 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts());
     auto exec = this->get_executor();
     auto local_part = comm.rank();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
 
     // set up LinOp sizes
-    auto num_parts = static_cast<size_type>(row_partition->get_num_parts());
     auto global_num_rows = row_partition->get_size();
     auto global_num_cols = col_partition->get_size();
     dim<2> global_dim{global_num_rows, global_num_cols};
     this->set_size(global_dim);
 
+    device_matrix_data<value_type, global_index_type> all_data{exec};
+    if (assembly_type == assembly::communicate) {
+        array<comm_index_type> overlap_count{exec, comm.size()};
+        overlap_count.fill(0);
+        auto tmp_part = make_temporary_clone(exec, row_partition);
+        exec->run(matrix::make_count_overlap_entries(
+            data, tmp_part.get(), local_part, overlap_count));
+
+        overlap_count.set_executor(exec->get_master());
+        std::vector<comm_index_type> overlap_send_sizes(
+            overlap_count.get_data(), overlap_count.get_data() + comm.size());
+        std::vector<comm_index_type> overlap_send_offsets(comm.size() + 1);
+        std::vector<comm_index_type> overlap_recv_sizes(comm.size());
+        std::vector<comm_index_type> overlap_recv_offsets(comm.size() + 1);
+
+        std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
+                         overlap_send_offsets.begin() + 1);
+        comm.all_to_all(exec, overlap_send_sizes.data(), 1,
+                        overlap_recv_sizes.data(), 1);
+        std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(),
+                         overlap_recv_offsets.begin() + 1);
+        overlap_send_offsets[0] = 0;
+        overlap_recv_offsets[0] = 0;
+
+        size_type n_send = overlap_send_offsets.back();
+        size_type n_recv = overlap_recv_offsets.back();
+        array<global_index_type> overlap_send_row_idxs{exec, n_send};
+        array<global_index_type> overlap_send_col_idxs{exec, n_send};
+        array<value_type> overlap_send_values{exec, n_send};
+        array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
+        array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
+        array<value_type> overlap_recv_values{exec, n_recv};
+        auto offset_array =
+            make_const_array_view(exec->get_master(), comm.size() + 1,
+                                  overlap_send_offsets.data())
+                .copy_to_array();
+        offset_array.set_executor(exec);
+        exec->run(matrix::make_fill_overlap_send_buffers(
+            data, tmp_part.get(), local_part, offset_array,
+            overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values));
+
+        if (use_host_buffer) {
+            overlap_send_row_idxs.set_executor(exec->get_master());
+            overlap_send_col_idxs.set_executor(exec->get_master());
+            overlap_send_values.set_executor(exec->get_master());
+            overlap_recv_row_idxs.set_executor(exec->get_master());
+            overlap_recv_col_idxs.set_executor(exec->get_master());
+            overlap_recv_values.set_executor(exec->get_master());
+        }
+        comm.all_to_all_v(
+            use_host_buffer ? exec->get_master() : exec,  // buffer location
+            overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(
+            use_host_buffer ? exec->get_master() : exec,
+            overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(
+            use_host_buffer ? exec->get_master() : exec,
+            overlap_send_values.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_values.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        if (use_host_buffer) {
+            overlap_recv_row_idxs.set_executor(exec);
+            overlap_recv_col_idxs.set_executor(exec);
+            overlap_recv_values.set_executor(exec);
+        }
+
+        size_type n_nnz = data.get_num_stored_elements();
+        array<global_index_type> all_row_idxs{exec, n_nnz + n_recv};
+        array<global_index_type> all_col_idxs{exec, n_nnz + n_recv};
+        array<value_type> all_values{exec, n_nnz + n_recv};
+        exec->copy_from(exec, n_nnz, data.get_const_row_idxs(),
+                        all_row_idxs.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
+                        all_row_idxs.get_data() + n_nnz);
+        exec->copy_from(exec, n_nnz, data.get_const_col_idxs(),
+                        all_col_idxs.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
+                        all_col_idxs.get_data() + n_nnz);
+        exec->copy_from(exec, n_nnz, data.get_const_values(),
+                        all_values.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
+                        all_values.get_data() + n_nnz);
+        all_data = device_matrix_data<value_type, global_index_type>{
+            exec, global_dim, all_row_idxs, all_col_idxs, all_values};
+        all_data.sum_duplicates();
+    }
+
     // temporary storage for the output
     array<local_index_type> local_row_idxs{exec};
     array<local_index_type> local_col_idxs{exec};
@@ -273,7 +373,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        data, make_temporary_clone(exec, row_partition).get(),
+        assembly_type == assembly::communicate ? all_data : data,
+        make_temporary_clone(exec, row_partition).get(),
         make_temporary_clone(exec, col_partition).get(), local_part,
         local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
         global_non_local_col_idxs, non_local_values));
@@ -335,7 +436,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
             imap.get_executor(), imap.get_non_local_size(),
             imap.get_remote_local_idxs().get_const_flat_data())
             .copy_to_array();
-    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
     if (use_host_buffer) {
         recv_gather_idxs.set_executor(exec->get_master());
         gather_idxs_.clear();
@@ -358,12 +458,13 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
+        col_partition,
+    assembly assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        row_partition, col_partition);
+        row_partition, col_partition, assembly_type);
 }
 
 
@@ -371,12 +472,13 @@ template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
+        partition,
+    assembly assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        partition, partition);
+        partition, partition, assembly_type);
 }
 
 
@@ -384,9 +486,10 @@ template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const device_matrix_data<ValueType, GlobalIndexType>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
+        partition,
+    assembly assembly_type)
 {
-    return this->read_distributed(data, partition, partition);
+    return this->read_distributed(data, partition, partition, assembly_type);
 }
 
 
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index f24e8c9945e..fa7e891eb4e 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,6 +19,29 @@ namespace gko {
 namespace kernels {
 
 
+#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \
+                                          GlobalIndexType)           \
+    void count_overlap_entries(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                 \
+        const device_matrix_data<ValueType, GlobalIndexType>& input, \
+        const experimental::distributed::Partition<                  \
+            LocalIndexType, GlobalIndexType>* row_partition,         \
+        comm_index_type local_part, array<comm_index_type>& overlap_count)
+
+
+#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
+                                              GlobalIndexType)           \
+    void fill_overlap_send_buffers(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                     \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,     \
+        const experimental::distributed::Partition<                      \
+            LocalIndexType, GlobalIndexType>* row_partition,             \
+        comm_index_type local_part, array<comm_index_type>& offsets,     \
+        array<GlobalIndexType>& overlap_row_idxs,                        \
+        array<GlobalIndexType>& overlap_col_idxs,                        \
+        array<ValueType>& overlap_values)
+
+
 #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,         \
                                             GlobalIndexType)                   \
     void separate_local_nonlocal(                                              \
@@ -37,6 +60,14 @@ namespace kernels {
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,        \
+                                      GlobalIndexType);                 \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType,    \
+                                          GlobalIndexType);             \
     template <typename ValueType, typename LocalIndexType,              \
               typename GlobalIndexType>                                 \
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,      \
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 47adaaeca59..9225e58ad14 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -13,6 +13,34 @@ namespace dpcpp {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 493b9176205..7f1ec56a77b 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -133,6 +133,18 @@ namespace experimental {
 namespace distributed {
 
 
+/**
+ * assembly defines how the read_distributed function of the distributed
+ * matrix treats entries of the (device_)matrix_data in non-local rows:
+ * - communicate sends each such entry to the rank that owns its row and
+ *   sums up all contributions to the same entry. Indices smaller than 0
+ *   or larger than the global size of the matrix are ignored.
+ * - local_only does not communicate anything and ignores all non-local
+ *   indices. This is the default of read_distributed.
+ */
+enum class assembly { communicate, local_only };
+
+
 template <typename LocalIndexType, typename GlobalIndexType>
 class Partition;
 template <typename ValueType>
@@ -297,7 +309,8 @@ class Matrix
     void read_distributed(
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
+            partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -311,7 +324,8 @@ class Matrix
     void read_distributed(
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
+            partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -335,7 +349,8 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
+            col_partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -351,7 +366,8 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
+            col_partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 2f36ec4a778..a3e8cb60868 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -20,6 +20,34 @@ namespace omp {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 95176b34656..d8b0f9e1d4f 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -7,6 +7,7 @@
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "ginkgo/core/distributed/partition.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -16,6 +17,67 @@ namespace reference {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& overlap_count)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+
+    size_type row_range_id = 0;
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        // range containing global_row; its part id identifies the owner
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        if (row_part_id != local_part) {
+            overlap_count.get_data()[row_part_id]++;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+    auto row_part_ids = row_partition->get_part_ids();
+
+    size_type row_range_id = 0;
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        if (row_part_id != local_part) {
+            // offsets serves as per-part write cursor and is advanced here
+            auto idx = offsets.get_data()[row_part_id]++;
+            overlap_row_idxs.get_data()[idx] = global_row;
+            overlap_col_idxs.get_data()[idx] = input_col_idxs[i];
+            overlap_values.get_data()[idx] = input_vals[i];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index f4b8af2fb19..1b25ad7eb6d 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -60,9 +60,16 @@ class MatrixCreation : public CommonMpiTestFixture {
                      {3, 4, 7},
                      {4, 0, 9},
                      {4, 4, 10}}},
-          dist_input{{{size, {{0, 1, 1}, {0, 3, 2}, {1, 1, 3}, {1, 2, 4}}},
-                      {size, {{2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}},
-                      {size, {{4, 0, 9}, {4, 4, 10}}}}},
+          dist_input{
+              {{size,
+                {{0, 1, 1},
+                 {0, 3, 2},
+                 {1, 1, 3},
+                 {1, 2, 4},
+                 {2, 0, 1},
+                 {2, 3, 1}}},
+               {size, {{0, 0, 1}, {2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}},
+               {size, {{2, 2, 1}, {3, 3, -1}, {4, 0, 9}, {4, 4, 10}}}}},
           engine(42)
     {
         row_part = Partition::build_from_contiguous(
@@ -134,6 +141,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalData)
 }
 
 
+TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr = typename TestFixture::local_matrix_type;
+    I<I<value_type>> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}};
+    I<I<value_type>> res_non_local[] = {
+        {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}};
+    auto rank = this->dist_mat->get_communicator().rank();
+
+    this->dist_mat->read_distributed(
+        this->dist_input[rank], this->row_part,
+        gko::experimental::distributed::assembly::communicate);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_non_local[rank], 0);
+}
+
+
 TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 {
     using value_type = typename TestFixture::value_type;
@@ -153,6 +180,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 }
 
 
+TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr = typename TestFixture::local_matrix_type;
+    I<I<value_type>> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_non_local[] = {
+        {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}};
+    auto rank = this->dist_mat->get_communicator().rank();
+
+    this->dist_mat->read_distributed(
+        this->dist_input[rank], this->row_part, this->col_part,
+        gko::experimental::distributed::assembly::communicate);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_non_local[rank], 0);
+}
+
+
 TYPED_TEST(MatrixCreation, BuildOnlyLocal)
 {
     using value_type = typename TestFixture::value_type;

From 9d32b1d36a5a540986d81ea5a35864f007fb8f52 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Wed, 17 Jul 2024 12:28:11 +0000
Subject: [PATCH 309/448] Add reference kernel tests

---
 reference/test/distributed/matrix_kernels.cpp | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index a34844cbde9..00d3fcd8895 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -17,6 +17,8 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
+#include "ginkgo/core/base/array.hpp"
+#include "ginkgo/core/base/types.hpp"
 
 
 namespace {
@@ -186,6 +188,85 @@ TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
 
+TYPED_TEST(Matrix, CountOverlapEntries)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using vt = typename TestFixture::value_type;
+    using ca = gko::array<comm_index_type>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ca> overlap_count_ref{
+        ca{this->ref, I<comm_index_type>{0, 5, 3}},
+        ca{this->ref, I<comm_index_type>{4, 0, 3}},
+        ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input_full_rank();
+
+    gko::array<comm_index_type> overlap_count{
+        this->ref, static_cast<gko::size_type>(num_parts)};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        overlap_count.fill(0);
+        gko::kernels::reference::distributed_matrix::count_overlap_entries(
+            this->ref, input, partition.get(), i, overlap_count);
+        GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
+    }
+}
+
+
+TYPED_TEST(Matrix, FillOverlapSendBuffers)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using vt = typename TestFixture::value_type;
+    using ca = gko::array<comm_index_type>;
+    using ga = gko::array<git>;
+    using va = gko::array<vt>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ca> overlap_offsets{
+        ca{this->ref, I<comm_index_type>{0, 0, 5, 8}},
+        ca{this->ref, I<comm_index_type>{0, 4, 4, 7}},
+        ca{this->ref, I<comm_index_type>{0, 4, 9, 9}}};
+    std::vector<ga> overlap_row_idxs_ref{
+        ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
+    std::vector<ga> overlap_col_idxs_ref{
+        ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
+    std::vector<va> overlap_values_ref{
+        va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input_full_rank();
+
+    gko::array<git> overlap_row_idxs{this->ref};
+    gko::array<git> overlap_col_idxs{this->ref};
+    gko::array<vt> overlap_values{this->ref};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        overlap_row_idxs.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        overlap_col_idxs.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        overlap_values.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
+            this->ref, input, partition.get(), i, overlap_offsets[i],
+            overlap_row_idxs, overlap_col_idxs, overlap_values);
+        GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
+    }
+}
+
+
 TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty)
 {
     using lit = typename TestFixture::local_index_type;

From c29a076314cebbb96037e835c9c6549e679f18a0 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Fri, 19 Jul 2024 12:20:14 +0000
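Subject: [PATCH 310/448] Add device kernels and tests

Implement count_overlap_entries and fill_overlap_send_buffers for the
CUDA/HIP and OpenMP backends; the DPC++ versions remain
GKO_NOT_IMPLEMENTED.

This changes the kernel interface: count_overlap_entries now also fills
two per-entry arrays, overlap_positions and original_positions, and
fill_overlap_send_buffers consumes these arrays instead of a per-part
offsets array. The reference kernels, Matrix::read_distributed and the
reference tests are updated to the new signatures, and
test/distributed/matrix_kernels.cpp is extended accordingly.
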
---
 .../cuda_hip/distributed/matrix_kernels.cpp   | 94 ++++++++++++++++++-
 core/distributed/matrix.cpp                   | 51 +++++-----
 core/distributed/matrix_kernels.hpp           | 22 +++--
 dpcpp/distributed/matrix_kernels.dp.cpp       |  8 +-
 omp/distributed/matrix_kernels.cpp            | 70 +++++++++++++-
 reference/distributed/matrix_kernels.cpp      | 54 ++++++++---
 reference/test/distributed/matrix_kernels.cpp | 48 ++++++----
 test/distributed/matrix_kernels.cpp           | 60 +++++++++++-
 8 files changed, 324 insertions(+), 83 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 70824bf03d9..ab3ec9da8b1 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -20,6 +20,9 @@
 
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 
 
 namespace gko {
@@ -55,8 +58,67 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto row_part_ids = row_partition->get_part_ids();
+    const auto* row_range_bounds = row_partition->get_range_bounds();
+    const auto* row_range_starting_indices =
+        row_partition->get_range_starting_indices();
+    const auto num_row_ranges = row_partition->get_num_ranges();
+    const auto num_input_elements = input.get_num_stored_elements();
+
+    auto policy = thrust_policy(exec);
+
+    // precompute the row and column range id of each input element
+    auto input_row_idxs = input.get_const_row_idxs();
+    array<size_type> row_range_ids{exec, num_input_elements};
+    thrust::upper_bound(policy, row_range_bounds + 1,
+                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
+                        input_row_idxs + num_input_elements,
+                        row_range_ids.get_data());
+
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
+                      auto part_ids_per_entry, auto orig_positions) {
+            part_ids_per_entry[i] = part_ids[range_ids[i]];
+            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
+        },
+        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
+        row_part_ids_per_entry.get_data(), original_positions.get_data());
+
+    thrust::stable_sort_by_key(
+        policy, row_part_ids_per_entry.get_data(),
+        row_part_ids_per_entry.get_data() + num_input_elements,
+        original_positions.get_data());
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) {
+            overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
+        },
+        num_input_elements, original_positions.get_const_data(),
+        overlap_positions.get_data());
+
+    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+                                       num_input_elements);
+    size_type num_parts = row_partition->get_num_parts();
+    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
+    row_part_ptrs.fill(0);
+    components::convert_idxs_to_ptrs(
+        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
+        num_parts, row_part_ptrs.get_data());
+
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
+            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
+        },
+        num_parts, local_part, row_part_ptrs.get_data(),
+        overlap_count.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -68,10 +130,32 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto num_entries = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_values = input.get_const_values();
+
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
+                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
+                      auto out_vals) {
+            if (in_pos[i] >= 0) {
+                out_rows[out_pos[i]] = in_rows[in_pos[i]];
+                out_cols[out_pos[i]] = in_cols[in_pos[i]];
+                out_vals[out_pos[i]] = in_vals[in_pos[i]];
+            }
+        },
+        num_entries, input_row_idxs, input_col_idxs, input_values,
+        original_positions.get_const_data(), overlap_positions.get_const_data(),
+        overlap_row_idxs.get_data(), overlap_col_idxs.get_data(),
+        overlap_values.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index a64a07619b3..3799895abf3 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
-#include <numeric>
-#include <vector>
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
+#include "core/components/prefix_sum_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
-#include "ginkgo/core/base/mtx_io.hpp"
 
 
 namespace gko {
@@ -271,18 +268,23 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
     device_matrix_data<value_type, global_index_type> all_data{exec};
     if (assembly_type == assembly::communicate) {
-        array<comm_index_type> overlap_count{exec, comm.size()};
+        size_type num_entries = data.get_num_stored_elements();
+        size_type num_parts = comm.size();
+        array<comm_index_type> overlap_count{exec, num_parts};
+        array<global_index_type> overlap_positions{exec, num_entries};
+        array<global_index_type> original_positions{exec, num_entries};
         overlap_count.fill(0);
         auto tmp_part = make_temporary_clone(exec, row_partition);
         exec->run(matrix::make_count_overlap_entries(
-            data, tmp_part.get(), local_part, overlap_count));
+            data, tmp_part.get(), local_part, overlap_count, overlap_positions,
+            original_positions));
 
         overlap_count.set_executor(exec->get_master());
         std::vector<comm_index_type> overlap_send_sizes(
-            overlap_count.get_data(), overlap_count.get_data() + comm.size());
-        std::vector<comm_index_type> overlap_send_offsets(comm.size() + 1);
-        std::vector<comm_index_type> overlap_recv_sizes(comm.size());
-        std::vector<comm_index_type> overlap_recv_offsets(comm.size() + 1);
+            overlap_count.get_data(), overlap_count.get_data() + num_parts);
+        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
+        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
+        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
 
         std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
                          overlap_send_offsets.begin() + 1);
@@ -301,14 +303,10 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
         array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
         array<value_type> overlap_recv_values{exec, n_recv};
-        auto offset_array =
-            make_const_array_view(exec->get_master(), comm.size() + 1,
-                                  overlap_send_offsets.data())
-                .copy_to_array();
-        offset_array.set_executor(exec);
         exec->run(matrix::make_fill_overlap_send_buffers(
-            data, tmp_part.get(), local_part, offset_array,
-            overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values));
+            data, tmp_part.get(), local_part, overlap_positions,
+            original_positions, overlap_send_row_idxs, overlap_send_col_idxs,
+            overlap_send_values));
 
         if (use_host_buffer) {
             overlap_send_row_idxs.set_executor(exec->get_master());
@@ -339,22 +337,21 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
             overlap_recv_values.set_executor(exec);
         }
 
-        size_type n_nnz = data.get_num_stored_elements();
-        array<global_index_type> all_row_idxs{exec, n_nnz + n_recv};
-        array<global_index_type> all_col_idxs{exec, n_nnz + n_recv};
-        array<value_type> all_values{exec, n_nnz + n_recv};
-        exec->copy_from(exec, n_nnz, data.get_const_row_idxs(),
+        array<global_index_type> all_row_idxs{exec, num_entries + n_recv};
+        array<global_index_type> all_col_idxs{exec, num_entries + n_recv};
+        array<value_type> all_values{exec, num_entries + n_recv};
+        exec->copy_from(exec, num_entries, data.get_const_row_idxs(),
                         all_row_idxs.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
-                        all_row_idxs.get_data() + n_nnz);
-        exec->copy_from(exec, n_nnz, data.get_const_col_idxs(),
+                        all_row_idxs.get_data() + num_entries);
+        exec->copy_from(exec, num_entries, data.get_const_col_idxs(),
                         all_col_idxs.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
-                        all_col_idxs.get_data() + n_nnz);
-        exec->copy_from(exec, n_nnz, data.get_const_values(),
+                        all_col_idxs.get_data() + num_entries);
+        exec->copy_from(exec, num_entries, data.get_const_values(),
                         all_values.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
-                        all_values.get_data() + n_nnz);
+                        all_values.get_data() + num_entries);
         all_data = device_matrix_data<value_type, global_index_type>{
             exec, global_dim, all_row_idxs, all_col_idxs, all_values};
         all_data.sum_duplicates();
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index fa7e891eb4e..4cdaf3e17fe 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,14 +19,16 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \
-                                          GlobalIndexType)           \
-    void count_overlap_entries(                                      \
-        std::shared_ptr<const DefaultExecutor> exec,                 \
-        const device_matrix_data<ValueType, GlobalIndexType>& input, \
-        const experimental::distributed::Partition<                  \
-            LocalIndexType, GlobalIndexType>* row_partition,         \
-        comm_index_type local_part, array<comm_index_type>& overlap_count)
+#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,       \
+                                          GlobalIndexType)                 \
+    void count_overlap_entries(                                            \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,       \
+        const experimental::distributed::Partition<                        \
+            LocalIndexType, GlobalIndexType>* row_partition,               \
+        comm_index_type local_part, array<comm_index_type>& overlap_count, \
+        array<GlobalIndexType>& overlap_positions,                         \
+        array<GlobalIndexType>& original_positions)
 
 
 #define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
@@ -36,7 +38,9 @@ namespace kernels {
         const device_matrix_data<ValueType, GlobalIndexType>& input,     \
         const experimental::distributed::Partition<                      \
             LocalIndexType, GlobalIndexType>* row_partition,             \
-        comm_index_type local_part, array<comm_index_type>& offsets,     \
+        comm_index_type local_part,                                      \
+        const array<GlobalIndexType>& overlap_positions,                 \
+        const array<GlobalIndexType>& original_positions,                \
         array<GlobalIndexType>& overlap_row_idxs,                        \
         array<GlobalIndexType>& overlap_col_idxs,                        \
         array<ValueType>& overlap_values)
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 9225e58ad14..60fc0686473 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -19,8 +19,9 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -32,7 +33,8 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
     array<GlobalIndexType>& overlap_col_idxs,
     array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index a3e8cb60868..55ee5524116 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -4,6 +4,8 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
+#include <algorithm>
+
 #include <omp.h>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -26,8 +28,50 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto num_input_elements = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+
+    size_type row_range_id = 0;
+#pragma omp parallel for firstprivate(row_range_id)
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
+        if (row_part_id != local_part) {
+#pragma omp atomic
+            overlap_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
+        }
+    }
+
+    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+
+#pragma omp parallel for
+    for (size_type i = 0; i < num_input_elements; i++) {
+        overlap_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+                                       num_input_elements);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -39,10 +83,26 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+
+#pragma omp parallel for
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = overlap_positions.get_const_data()[i];
+            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index d8b0f9e1d4f..6a57a64e075 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -4,10 +4,12 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
+#include <algorithm>
+#include <numeric>
+
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
-#include "ginkgo/core/distributed/partition.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -23,21 +25,47 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count)
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
 {
+    auto num_input_elements = input.get_num_stored_elements();
     auto input_row_idxs = input.get_const_row_idxs();
     auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
 
     size_type row_range_id = 0;
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto global_row = input_row_idxs[i];
         row_range_id = find_range(global_row, row_partition, row_range_id);
-        row_range_id = find_range(global_row, row_partition, row_range_id);
         auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
             overlap_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
         }
     }
+
+    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+    for (size_type i = 0; i < num_input_elements; i++) {
+        overlap_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    std::exclusive_scan(overlap_positions.get_data(),
+                        overlap_positions.get_data() + num_input_elements,
+                        overlap_positions.get_data(), 0);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
@@ -50,26 +78,22 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
     array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
     auto input_vals = input.get_const_values();
-    auto row_part_ids = row_partition->get_part_ids();
 
-    size_type row_range_id = 0;
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto global_row = input_row_idxs[i];
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        auto row_part_id = row_part_ids[row_range_id];
-        if (row_part_id != local_part) {
-            auto idx = offsets.get_data()[row_part_id]++;
-            overlap_row_idxs.get_data()[idx] = global_row;
-            overlap_col_idxs.get_data()[idx] = input_col_idxs[i];
-            overlap_values.get_data()[idx] = input_vals[i];
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = overlap_positions.get_const_data()[i];
+            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            overlap_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 00d3fcd8895..80fc8eb3330 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -17,8 +17,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
-#include "ginkgo/core/base/array.hpp"
-#include "ginkgo/core/base/types.hpp"
 
 
 namespace {
@@ -194,24 +192,37 @@ TYPED_TEST(Matrix, CountOverlapEntries)
     using git = typename TestFixture::global_index_type;
     using vt = typename TestFixture::value_type;
     using ca = gko::array<comm_index_type>;
+    using ga = gko::array<git>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
     std::vector<ca> overlap_count_ref{
         ca{this->ref, I<comm_index_type>{0, 5, 3}},
         ca{this->ref, I<comm_index_type>{4, 0, 3}},
         ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    std::vector<ga> overlap_pos_ref{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_pos_ref{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
     comm_index_type num_parts = 3;
     auto partition =
         gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    gko::array<comm_index_type> overlap_count{
-        this->ref, static_cast<gko::size_type>(num_parts)};
+    ca overlap_count{this->ref, static_cast<gko::size_type>(num_parts)};
+    ga overlap_positions{this->ref, input.get_num_stored_elements()};
+    ga original_positions{this->ref, input.get_num_stored_elements()};
     for (gko::size_type i = 0; i < num_parts; i++) {
         overlap_count.fill(0);
         gko::kernels::reference::distributed_matrix::count_overlap_entries(
-            this->ref, input, partition.get(), i, overlap_count);
+            this->ref, input, partition.get(), i, overlap_count,
+            overlap_positions, original_positions);
         GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
     }
 }
 
@@ -225,10 +236,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     using ga = gko::array<git>;
     using va = gko::array<vt>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ca> overlap_offsets{
-        ca{this->ref, I<comm_index_type>{0, 0, 5, 8}},
-        ca{this->ref, I<comm_index_type>{0, 4, 4, 7}},
-        ca{this->ref, I<comm_index_type>{0, 4, 9, 9}}};
+    std::vector<ga> overlap_positions{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_positions{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
     std::vector<ga> overlap_row_idxs_ref{
         ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
@@ -251,15 +266,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     gko::array<git> overlap_col_idxs{this->ref};
     gko::array<vt> overlap_values{this->ref};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        overlap_row_idxs.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
-        overlap_col_idxs.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
-        overlap_values.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
+        auto num_entries = overlap_row_idxs_ref[i].get_size();
+        overlap_row_idxs.resize_and_reset(num_entries);
+        overlap_col_idxs.resize_and_reset(num_entries);
+        overlap_values.resize_and_reset(num_entries);
         gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
-            this->ref, input, partition.get(), i, overlap_offsets[i],
-            overlap_row_idxs, overlap_col_idxs, overlap_values);
+            this->ref, input, partition.get(), i, overlap_positions[i],
+            original_positions[i], overlap_row_idxs, overlap_col_idxs,
+            overlap_values);
         GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
         GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
         GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index ad91d699496..6de772e8006 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -48,8 +48,9 @@ class Matrix : public CommonTestFixture {
     {
         gko::device_matrix_data<value_type, global_index_type> d_input{exec,
                                                                        input};
-        for (comm_index_type part = 0; part < row_partition->get_num_parts();
-             ++part) {
+        gko::size_type num_parts = row_partition->get_num_parts();
+        gko::size_type num_entries = input.get_num_stored_elements();
+        for (comm_index_type part = 0; part < num_parts; ++part) {
             gko::array<local_index_type> local_row_idxs{ref};
             gko::array<local_index_type> local_col_idxs{ref};
             gko::array<value_type> local_values{ref};
@@ -62,6 +63,55 @@ class Matrix : public CommonTestFixture {
             gko::array<local_index_type> d_non_local_row_idxs{exec};
             gko::array<global_index_type> d_non_local_col_idxs{exec};
             gko::array<value_type> d_non_local_values{exec};
+            gko::array<comm_index_type> overlap_count{ref, num_parts};
+            overlap_count.fill(0);
+            gko::array<comm_index_type> d_overlap_count{exec, num_parts};
+            d_overlap_count.fill(0);
+            gko::array<global_index_type> overlap_positions{ref, num_entries};
+            gko::array<global_index_type> d_overlap_positions{exec,
+                                                              num_entries};
+            gko::array<global_index_type> original_positions{ref, num_entries};
+            gko::array<global_index_type> d_original_positions{exec,
+                                                               num_entries};
+
+            gko::kernels::reference::distributed_matrix::count_overlap_entries(
+                ref, input, row_partition.get(), part, overlap_count,
+                overlap_positions, original_positions);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                count_overlap_entries(
+                    exec, d_input, d_row_partition.get(), part, d_overlap_count,
+                    d_overlap_positions, d_original_positions);
+
+            gko::array<global_index_type> overlap_offsets{ref, num_parts + 1};
+            std::partial_sum(overlap_count.get_data(),
+                             overlap_count.get_data() + num_parts,
+                             overlap_offsets.get_data() + 1);
+            overlap_offsets.get_data()[0] = 0;
+            gko::array<global_index_type> d_overlap_offsets{exec,
+                                                            overlap_offsets};
+            gko::size_type num_overlap_entries =
+                overlap_offsets.get_data()[num_parts];
+            gko::array<global_index_type> overlap_row_idxs{ref,
+                                                           num_overlap_entries};
+            gko::array<global_index_type> overlap_col_idxs{ref,
+                                                           num_overlap_entries};
+            gko::array<value_type> overlap_values{ref, num_overlap_entries};
+            gko::array<global_index_type> d_overlap_row_idxs{
+                exec, num_overlap_entries};
+            gko::array<global_index_type> d_overlap_col_idxs{
+                exec, num_overlap_entries};
+            gko::array<value_type> d_overlap_values{exec, num_overlap_entries};
+
+            gko::kernels::reference::distributed_matrix::
+                fill_overlap_send_buffers(ref, input, row_partition.get(), part,
+                                          overlap_positions, original_positions,
+                                          overlap_row_idxs, overlap_col_idxs,
+                                          overlap_values);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                fill_overlap_send_buffers(
+                    exec, d_input, d_row_partition.get(), part,
+                    d_overlap_positions, d_original_positions,
+                    d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values);
 
             gko::kernels::reference::distributed_matrix::
                 separate_local_nonlocal(
@@ -75,6 +125,12 @@ class Matrix : public CommonTestFixture {
                     d_non_local_row_idxs, d_non_local_col_idxs,
                     d_non_local_values);
 
+            GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions);
+            GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
+            GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count);
+            GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs);
+            GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs);
+            GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values);
             GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs);
             GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs);
             GKO_ASSERT_ARRAY_EQ(local_values, d_local_values);

From c697513a5a8f1bd25a993c9011ef4c7144112254 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Fri, 18 Oct 2024 17:45:52 +0200
Subject: [PATCH 311/448] Address Review comments

---
 .../cuda_hip/distributed/matrix_kernels.cpp   |  35 ++--
 core/device_hooks/common_kernels.inc.cpp      |   4 +-
 core/distributed/matrix.cpp                   | 149 +++++++++---------
 core/distributed/matrix_kernels.hpp           |  68 ++++----
 dpcpp/distributed/matrix_kernels.dp.cpp       |  20 +--
 include/ginkgo/core/distributed/matrix.hpp    |  12 +-
 omp/distributed/matrix_kernels.cpp            |  32 ++--
 reference/distributed/matrix_kernels.cpp      |  36 ++---
 reference/test/distributed/matrix_kernels.cpp |  66 ++++----
 test/distributed/matrix_kernels.cpp           |  91 +++++------
 test/mpi/matrix.cpp                           |   4 +-
 11 files changed, 253 insertions(+), 264 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index ab3ec9da8b1..159ddbce296 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -53,13 +53,13 @@ struct input_type {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto row_part_ids = row_partition->get_part_ids();
@@ -96,13 +96,13 @@ void count_overlap_entries(
         original_positions.get_data());
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) {
-            overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
+        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
+            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
         },
         num_input_elements, original_positions.get_const_data(),
-        overlap_positions.get_data());
+        send_positions.get_data());
 
-    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                        num_input_elements);
     size_type num_parts = row_partition->get_num_parts();
     array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
@@ -116,24 +116,23 @@ void count_overlap_entries(
         [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
             count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
         },
-        num_parts, local_part, row_part_ptrs.get_data(),
-        overlap_count.get_data());
+        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto num_entries = input.get_num_stored_elements();
     auto input_row_idxs = input.get_const_row_idxs();
@@ -152,13 +151,13 @@ void fill_overlap_send_buffers(
             }
         },
         num_entries, input_row_idxs, input_col_idxs, input_values,
-        original_positions.get_const_data(), overlap_positions.get_const_data(),
-        overlap_row_idxs.get_data(), overlap_col_idxs.get_data(),
-        overlap_values.get_data());
+        original_positions.get_const_data(), send_positions.get_const_data(),
+        send_row_idxs.get_data(), send_col_idxs.get_data(),
+        send_values.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 6870633c900..6f5874b81ec 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -283,9 +283,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 namespace distributed_matrix {
 
 
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 3799895abf3..2c174f26806 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -21,10 +21,10 @@ namespace matrix {
 namespace {
 
 
-GKO_REGISTER_OPERATION(count_overlap_entries,
-                       distributed_matrix::count_overlap_entries);
-GKO_REGISTER_OPERATION(fill_overlap_send_buffers,
-                       distributed_matrix::fill_overlap_send_buffers);
+GKO_REGISTER_OPERATION(count_non_owning_entries,
+                       distributed_matrix::count_non_owning_entries);
+GKO_REGISTER_OPERATION(fill_send_buffers,
+                       distributed_matrix::fill_send_buffers);
 GKO_REGISTER_OPERATION(separate_local_nonlocal,
                        distributed_matrix::separate_local_nonlocal);
 
@@ -249,7 +249,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         col_partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -259,6 +259,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     auto exec = this->get_executor();
     auto local_part = comm.rank();
     auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto tmp_row_partition = make_temporary_clone(exec, row_partition);
+    auto tmp_col_partition = make_temporary_clone(exec, col_partition);
 
     // set up LinOp sizes
     auto global_num_rows = row_partition->get_size();
@@ -267,74 +269,69 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     this->set_size(global_dim);
 
     device_matrix_data<value_type, global_index_type> all_data{exec};
-    if (assembly_type == assembly::communicate) {
+    if (assembly_type == assembly_mode::communicate) {
         size_type num_entries = data.get_num_stored_elements();
         size_type num_parts = comm.size();
-        array<comm_index_type> overlap_count{exec, num_parts};
-        array<global_index_type> overlap_positions{exec, num_entries};
+        array<comm_index_type> send_sizes{exec, num_parts};
+        array<global_index_type> send_positions{exec, num_entries};
         array<global_index_type> original_positions{exec, num_entries};
-        overlap_count.fill(0);
-        auto tmp_part = make_temporary_clone(exec, row_partition);
-        exec->run(matrix::make_count_overlap_entries(
-            data, tmp_part.get(), local_part, overlap_count, overlap_positions,
-            original_positions));
-
-        overlap_count.set_executor(exec->get_master());
-        std::vector<comm_index_type> overlap_send_sizes(
-            overlap_count.get_data(), overlap_count.get_data() + num_parts);
-        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
-        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
-        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
-
-        std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
-                         overlap_send_offsets.begin() + 1);
-        comm.all_to_all(exec, overlap_send_sizes.data(), 1,
-                        overlap_recv_sizes.data(), 1);
-        std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(),
-                         overlap_recv_offsets.begin() + 1);
-        overlap_send_offsets[0] = 0;
-        overlap_recv_offsets[0] = 0;
-
-        size_type n_send = overlap_send_offsets.back();
-        size_type n_recv = overlap_recv_offsets.back();
-        array<global_index_type> overlap_send_row_idxs{exec, n_send};
-        array<global_index_type> overlap_send_col_idxs{exec, n_send};
-        array<value_type> overlap_send_values{exec, n_send};
-        array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
-        array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
-        array<value_type> overlap_recv_values{exec, n_recv};
-        exec->run(matrix::make_fill_overlap_send_buffers(
-            data, tmp_part.get(), local_part, overlap_positions,
-            original_positions, overlap_send_row_idxs, overlap_send_col_idxs,
-            overlap_send_values));
+        send_sizes.fill(0);
+        exec->run(matrix::make_count_non_owning_entries(
+            data, tmp_row_partition.get(), local_part, send_sizes,
+            send_positions, original_positions));
+
+        send_sizes.set_executor(exec->get_master());
+        array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
+        array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
+        array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
+
+        std::partial_sum(send_sizes.get_data(),
+                         send_sizes.get_data() + num_parts,
+                         send_offsets.get_data() + 1);
+        comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(),
+                        1);
+        std::partial_sum(recv_sizes.get_data(),
+                         recv_sizes.get_data() + num_parts,
+                         recv_offsets.get_data() + 1);
+        send_offsets.get_data()[0] = 0;
+        recv_offsets.get_data()[0] = 0;
+
+        size_type n_send = send_offsets.get_data()[num_parts];
+        size_type n_recv = recv_offsets.get_data()[num_parts];
+        array<global_index_type> send_row_idxs{exec, n_send};
+        array<global_index_type> send_col_idxs{exec, n_send};
+        array<value_type> send_values{exec, n_send};
+        array<global_index_type> recv_row_idxs{exec, n_recv};
+        array<global_index_type> recv_col_idxs{exec, n_recv};
+        array<value_type> recv_values{exec, n_recv};
+        exec->run(matrix::make_fill_send_buffers(
+            data, tmp_row_partition.get(), local_part, send_positions,
+            original_positions, send_row_idxs, send_col_idxs, send_values));
 
         if (use_host_buffer) {
-            overlap_send_row_idxs.set_executor(exec->get_master());
-            overlap_send_col_idxs.set_executor(exec->get_master());
-            overlap_send_values.set_executor(exec->get_master());
-            overlap_recv_row_idxs.set_executor(exec->get_master());
-            overlap_recv_col_idxs.set_executor(exec->get_master());
-            overlap_recv_values.set_executor(exec->get_master());
+            send_row_idxs.set_executor(exec->get_master());
+            send_col_idxs.set_executor(exec->get_master());
+            send_values.set_executor(exec->get_master());
+            recv_row_idxs.set_executor(exec->get_master());
+            recv_col_idxs.set_executor(exec->get_master());
+            recv_values.set_executor(exec->get_master());
         }
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_values.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_values.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec,
+                          send_row_idxs.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_row_idxs.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
+        comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec,
+                          send_col_idxs.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_col_idxs.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
+        comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec,
+                          send_values.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_values.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
         if (use_host_buffer) {
-            overlap_recv_row_idxs.set_executor(exec);
-            overlap_recv_col_idxs.set_executor(exec);
-            overlap_recv_values.set_executor(exec);
+            recv_row_idxs.set_executor(exec);
+            recv_col_idxs.set_executor(exec);
+            recv_values.set_executor(exec);
         }
 
         array<global_index_type> all_row_idxs{exec, num_entries + n_recv};
@@ -342,18 +339,19 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<value_type> all_values{exec, num_entries + n_recv};
         exec->copy_from(exec, num_entries, data.get_const_row_idxs(),
                         all_row_idxs.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
+        exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
                         all_row_idxs.get_data() + num_entries);
         exec->copy_from(exec, num_entries, data.get_const_col_idxs(),
                         all_col_idxs.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
+        exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
                         all_col_idxs.get_data() + num_entries);
         exec->copy_from(exec, num_entries, data.get_const_values(),
                         all_values.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
+        exec->copy_from(exec, n_recv, recv_values.get_data(),
                         all_values.get_data() + num_entries);
         all_data = device_matrix_data<value_type, global_index_type>{
-            exec, global_dim, all_row_idxs, all_col_idxs, all_values};
+            exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
+            std::move(all_values)};
         all_data.sum_duplicates();
     }
 
@@ -370,9 +368,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        assembly_type == assembly::communicate ? all_data : data,
-        make_temporary_clone(exec, row_partition).get(),
-        make_temporary_clone(exec, col_partition).get(), local_part,
+        assembly_type == assembly_mode::communicate ? all_data : data,
+        tmp_row_partition.get(), tmp_col_partition.get(), local_part,
         local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
         global_non_local_col_idxs, non_local_values));
 
@@ -456,7 +453,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         col_partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
@@ -470,7 +467,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
@@ -484,7 +481,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const device_matrix_data<ValueType, GlobalIndexType>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(data, partition, partition, assembly_type);
 }
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index 4cdaf3e17fe..3ba02c27718 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,31 +19,30 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,       \
-                                          GlobalIndexType)                 \
-    void count_overlap_entries(                                            \
-        std::shared_ptr<const DefaultExecutor> exec,                       \
-        const device_matrix_data<ValueType, GlobalIndexType>& input,       \
-        const experimental::distributed::Partition<                        \
-            LocalIndexType, GlobalIndexType>* row_partition,               \
-        comm_index_type local_part, array<comm_index_type>& overlap_count, \
-        array<GlobalIndexType>& overlap_positions,                         \
+#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \
+                                             GlobalIndexType)           \
+    void count_non_owning_entries(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                    \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* row_partition,            \
+        comm_index_type local_part, array<comm_index_type>& send_count, \
+        array<GlobalIndexType>& send_positions,                         \
         array<GlobalIndexType>& original_positions)
 
 
-#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
-                                              GlobalIndexType)           \
-    void fill_overlap_send_buffers(                                      \
-        std::shared_ptr<const DefaultExecutor> exec,                     \
-        const device_matrix_data<ValueType, GlobalIndexType>& input,     \
-        const experimental::distributed::Partition<                      \
-            LocalIndexType, GlobalIndexType>* row_partition,             \
-        comm_index_type local_part,                                      \
-        const array<GlobalIndexType>& overlap_positions,                 \
-        const array<GlobalIndexType>& original_positions,                \
-        array<GlobalIndexType>& overlap_row_idxs,                        \
-        array<GlobalIndexType>& overlap_col_idxs,                        \
-        array<ValueType>& overlap_values)
+#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType,     \
+                                      GlobalIndexType)               \
+    void fill_send_buffers(                                          \
+        std::shared_ptr<const DefaultExecutor> exec,                 \
+        const device_matrix_data<ValueType, GlobalIndexType>& input, \
+        const experimental::distributed::Partition<                  \
+            LocalIndexType, GlobalIndexType>* row_partition,         \
+        comm_index_type local_part,                                  \
+        const array<GlobalIndexType>& send_positions,                \
+        const array<GlobalIndexType>& original_positions,            \
+        array<GlobalIndexType>& send_row_idxs,                       \
+        array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 
 
 #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,         \
@@ -62,19 +61,18 @@ namespace kernels {
         array<ValueType>& non_local_values)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
-    using comm_index_type = experimental::distributed::comm_index_type; \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,        \
-                                      GlobalIndexType);                 \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType,    \
-                                          GlobalIndexType);             \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,      \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                           \
+    using comm_index_type = experimental::distributed::comm_index_type;        \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType,            \
+                                         GlobalIndexType);                     \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType); \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,             \
                                         GlobalIndexType)
 
 
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 60fc0686473..e7aad7689a8 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -14,33 +14,33 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs,
+    array<ValueType>& send_values) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 7f1ec56a77b..0e8814e9cec 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -134,7 +134,7 @@ namespace distributed {
 
 
 /**
- * assembly defines how the read_distributed function of the distributed
+ * assembly_mode defines how the read_distributed function of the distributed
  * matrix treats non-local indices in the (device_)matrix_data:
  * - communicate communicates the overlap between ranks and adds up all local
  *   contributions. Indices smaller than 0 or larger than the global size
@@ -142,7 +142,7 @@ namespace distributed {
  * - local_only does not communicate any overlap but ignores all non-local
  *   indices.
  */
-enum class assembly { communicate, local_only };
+enum class assembly_mode { communicate, local_only };
 
 
 template <typename LocalIndexType, typename GlobalIndexType>
@@ -310,7 +310,7 @@ class Matrix
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -325,7 +325,7 @@ class Matrix
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -350,7 +350,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -367,7 +367,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 55ee5524116..a42b1f230fe 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -23,13 +23,13 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto num_input_elements = input.get_num_stored_elements();
@@ -46,7 +46,7 @@ void count_overlap_entries(
         row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
 #pragma omp atomic
-            overlap_count.get_data()[row_part_id]++;
+            send_count.get_data()[row_part_id]++;
             original_positions.get_data()[i] = i;
         } else {
             original_positions.get_data()[i] = -1;
@@ -65,28 +65,28 @@ void count_overlap_entries(
 
 #pragma omp parallel for
     for (size_type i = 0; i < num_input_elements; i++) {
-        overlap_positions.get_data()[i] =
+        send_positions.get_data()[i] =
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                        num_input_elements);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
@@ -96,16 +96,16 @@ void fill_overlap_send_buffers(
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto in_pos = original_positions.get_const_data()[i];
         if (in_pos >= 0) {
-            auto out_pos = overlap_positions.get_const_data()[i];
-            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 6a57a64e075..1be2660721b 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -20,13 +20,13 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto num_input_elements = input.get_num_stored_elements();
@@ -41,7 +41,7 @@ void count_overlap_entries(
         auto row_part_id = row_part_ids[row_range_id];
         row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
-            overlap_count.get_data()[row_part_id]++;
+            send_count.get_data()[row_part_id]++;
             original_positions.get_data()[i] = i;
         } else {
             original_positions.get_data()[i] = -1;
@@ -59,29 +59,29 @@ void count_overlap_entries(
     std::stable_sort(original_positions.get_data(),
                      original_positions.get_data() + num_input_elements, comp);
     for (size_type i = 0; i < num_input_elements; i++) {
-        overlap_positions.get_data()[i] =
+        send_positions.get_data()[i] =
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    std::exclusive_scan(overlap_positions.get_data(),
-                        overlap_positions.get_data() + num_input_elements,
-                        overlap_positions.get_data(), 0);
+    std::exclusive_scan(send_positions.get_data(),
+                        send_positions.get_data() + num_input_elements,
+                        send_positions.get_data(), 0);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
@@ -90,16 +90,16 @@ void fill_overlap_send_buffers(
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto in_pos = original_positions.get_const_data()[i];
         if (in_pos >= 0) {
-            auto out_pos = overlap_positions.get_const_data()[i];
-            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 80fc8eb3330..80063f7e582 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -190,15 +190,13 @@ TYPED_TEST(Matrix, CountOverlapEntries)
 {
     using lit = typename TestFixture::local_index_type;
     using git = typename TestFixture::global_index_type;
-    using vt = typename TestFixture::value_type;
     using ca = gko::array<comm_index_type>;
     using ga = gko::array<git>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ca> overlap_count_ref{
-        ca{this->ref, I<comm_index_type>{0, 5, 3}},
-        ca{this->ref, I<comm_index_type>{4, 0, 3}},
-        ca{this->ref, I<comm_index_type>{4, 5, 0}}};
-    std::vector<ga> overlap_pos_ref{
+    std::vector<ca> send_count_ref{ca{this->ref, I<comm_index_type>{0, 5, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 0, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    std::vector<ga> send_pos_ref{
         ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
@@ -212,16 +210,18 @@ TYPED_TEST(Matrix, CountOverlapEntries)
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    ca overlap_count{this->ref, static_cast<gko::size_type>(num_parts)};
-    ga overlap_positions{this->ref, input.get_num_stored_elements()};
+    ca send_count{this->ref, static_cast<gko::size_type>(num_parts)};
+    ga send_positions{this->ref, input.get_num_stored_elements()};
     ga original_positions{this->ref, input.get_num_stored_elements()};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        overlap_count.fill(0);
-        gko::kernels::reference::distributed_matrix::count_overlap_entries(
-            this->ref, input, partition.get(), i, overlap_count,
-            overlap_positions, original_positions);
-        GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]);
+        send_count.fill(0);
+
+        gko::kernels::reference::distributed_matrix::count_non_owning_entries(
+            this->ref, input, partition.get(), i, send_count, send_positions,
+            original_positions);
+
+        GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]);
         GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
     }
 }
@@ -232,11 +232,10 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     using lit = typename TestFixture::local_index_type;
     using git = typename TestFixture::global_index_type;
     using vt = typename TestFixture::value_type;
-    using ca = gko::array<comm_index_type>;
     using ga = gko::array<git>;
     using va = gko::array<vt>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ga> overlap_positions{
+    std::vector<ga> send_positions{
         ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
@@ -244,15 +243,15 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
         ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
         ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
         ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
-    std::vector<ga> overlap_row_idxs_ref{
+    std::vector<ga> send_row_idxs_ref{
         ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
-    std::vector<ga> overlap_col_idxs_ref{
+    std::vector<ga> send_col_idxs_ref{
         ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
         ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
         ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
-    std::vector<va> overlap_values_ref{
+    std::vector<va> send_values_ref{
         va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
         va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
         va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
@@ -262,21 +261,22 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    gko::array<git> overlap_row_idxs{this->ref};
-    gko::array<git> overlap_col_idxs{this->ref};
-    gko::array<vt> overlap_values{this->ref};
+    gko::array<git> send_row_idxs{this->ref};
+    gko::array<git> send_col_idxs{this->ref};
+    gko::array<vt> send_values{this->ref};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        auto num_entries = overlap_row_idxs_ref[i].get_size();
-        overlap_row_idxs.resize_and_reset(num_entries);
-        overlap_col_idxs.resize_and_reset(num_entries);
-        overlap_values.resize_and_reset(num_entries);
-        gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
-            this->ref, input, partition.get(), i, overlap_positions[i],
-            original_positions[i], overlap_row_idxs, overlap_col_idxs,
-            overlap_values);
-        GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
+        auto num_entries = send_row_idxs_ref[i].get_size();
+        send_row_idxs.resize_and_reset(num_entries);
+        send_col_idxs.resize_and_reset(num_entries);
+        send_values.resize_and_reset(num_entries);
+
+        gko::kernels::reference::distributed_matrix::fill_send_buffers(
+            this->ref, input, partition.get(), i, send_positions[i],
+            original_positions[i], send_row_idxs, send_col_idxs, send_values);
+
+        GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]);
     }
 }
 
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 6de772e8006..2cef5a49f92 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -63,55 +63,50 @@ class Matrix : public CommonTestFixture {
             gko::array<local_index_type> d_non_local_row_idxs{exec};
             gko::array<global_index_type> d_non_local_col_idxs{exec};
             gko::array<value_type> d_non_local_values{exec};
-            gko::array<comm_index_type> overlap_count{ref, num_parts};
-            overlap_count.fill(0);
-            gko::array<comm_index_type> d_overlap_count{exec, num_parts};
-            d_overlap_count.fill(0);
-            gko::array<global_index_type> overlap_positions{ref, num_entries};
-            gko::array<global_index_type> d_overlap_positions{exec,
-                                                              num_entries};
+            gko::array<comm_index_type> send_count{ref, num_parts};
+            send_count.fill(0);
+            gko::array<comm_index_type> d_send_count{exec, num_parts};
+            d_send_count.fill(0);
+            gko::array<global_index_type> send_positions{ref, num_entries};
+            gko::array<global_index_type> d_send_positions{exec, num_entries};
             gko::array<global_index_type> original_positions{ref, num_entries};
             gko::array<global_index_type> d_original_positions{exec,
                                                                num_entries};
 
-            gko::kernels::reference::distributed_matrix::count_overlap_entries(
-                ref, input, row_partition.get(), part, overlap_count,
-                overlap_positions, original_positions);
-            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                count_overlap_entries(
-                    exec, d_input, d_row_partition.get(), part, d_overlap_count,
-                    d_overlap_positions, d_original_positions);
-
-            gko::array<global_index_type> overlap_offsets{ref, num_parts + 1};
-            std::partial_sum(overlap_count.get_data(),
-                             overlap_count.get_data() + num_parts,
-                             overlap_offsets.get_data() + 1);
-            overlap_offsets.get_data()[0] = 0;
-            gko::array<global_index_type> d_overlap_offsets{exec,
-                                                            overlap_offsets};
-            gko::size_type num_overlap_entries =
-                overlap_offsets.get_data()[num_parts];
-            gko::array<global_index_type> overlap_row_idxs{ref,
-                                                           num_overlap_entries};
-            gko::array<global_index_type> overlap_col_idxs{ref,
-                                                           num_overlap_entries};
-            gko::array<value_type> overlap_values{ref, num_overlap_entries};
-            gko::array<global_index_type> d_overlap_row_idxs{
-                exec, num_overlap_entries};
-            gko::array<global_index_type> d_overlap_col_idxs{
-                exec, num_overlap_entries};
-            gko::array<value_type> d_overlap_values{exec, num_overlap_entries};
-
             gko::kernels::reference::distributed_matrix::
-                fill_overlap_send_buffers(ref, input, row_partition.get(), part,
-                                          overlap_positions, original_positions,
-                                          overlap_row_idxs, overlap_col_idxs,
-                                          overlap_values);
+                count_non_owning_entries(ref, input, row_partition.get(), part,
+                                         send_count, send_positions,
+                                         original_positions);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                count_non_owning_entries(exec, d_input, d_row_partition.get(),
+                                         part, d_send_count, d_send_positions,
+                                         d_original_positions);
+
+            gko::array<global_index_type> send_offsets{ref, num_parts + 1};
+            std::partial_sum(send_count.get_data(),
+                             send_count.get_data() + num_parts,
+                             send_offsets.get_data() + 1);
+            send_offsets.get_data()[0] = 0;
+            gko::array<global_index_type> d_send_offsets{exec, send_offsets};
+            gko::size_type num_send_entries =
+                send_offsets.get_data()[num_parts];
+            gko::array<global_index_type> send_row_idxs{ref, num_send_entries};
+            gko::array<global_index_type> send_col_idxs{ref, num_send_entries};
+            gko::array<value_type> send_values{ref, num_send_entries};
+            gko::array<global_index_type> d_send_row_idxs{exec,
+                                                          num_send_entries};
+            gko::array<global_index_type> d_send_col_idxs{exec,
+                                                          num_send_entries};
+            gko::array<value_type> d_send_values{exec, num_send_entries};
+
+            gko::kernels::reference::distributed_matrix::fill_send_buffers(
+                ref, input, row_partition.get(), part, send_positions,
+                original_positions, send_row_idxs, send_col_idxs, send_values);
             gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                fill_overlap_send_buffers(
-                    exec, d_input, d_row_partition.get(), part,
-                    d_overlap_positions, d_original_positions,
-                    d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values);
+                fill_send_buffers(exec, d_input, d_row_partition.get(), part,
+                                  d_send_positions, d_original_positions,
+                                  d_send_row_idxs, d_send_col_idxs,
+                                  d_send_values);
 
             gko::kernels::reference::distributed_matrix::
                 separate_local_nonlocal(
@@ -125,12 +120,12 @@ class Matrix : public CommonTestFixture {
                     d_non_local_row_idxs, d_non_local_col_idxs,
                     d_non_local_values);
 
-            GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions);
+            GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions);
             GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
-            GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count);
-            GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs);
-            GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs);
-            GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values);
+            GKO_ASSERT_ARRAY_EQ(send_count, d_send_count);
+            GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_values, d_send_values);
             GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs);
             GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs);
             GKO_ASSERT_ARRAY_EQ(local_values, d_local_values);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 1b25ad7eb6d..0cfb3aca477 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -152,7 +152,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate)
 
     this->dist_mat->read_distributed(
         this->dist_input[rank], this->row_part,
-        gko::experimental::distributed::assembly::communicate);
+        gko::experimental::distributed::assembly_mode::communicate);
 
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_local[rank], 0);
@@ -191,7 +191,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
 
     this->dist_mat->read_distributed(
         this->dist_input[rank], this->row_part, this->col_part,
-        gko::experimental::distributed::assembly::communicate);
+        gko::experimental::distributed::assembly_mode::communicate);
 
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_local[rank], 0);

From 75b590e03276fb8510cbdd1c00c1f0eda32bc639 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Mon, 21 Oct 2024 12:43:30 +0200
Subject: [PATCH 312/448] Small fixes

---
 core/distributed/matrix.cpp              | 2 +-
 dpcpp/distributed/matrix_kernels.dp.cpp  | 2 +-
 reference/distributed/matrix_kernels.cpp | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 2c174f26806..acd1b415ef5 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -275,7 +275,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<comm_index_type> send_sizes{exec, num_parts};
         array<global_index_type> send_positions{exec, num_entries};
         array<global_index_type> original_positions{exec, num_entries};
-        send_sizes.fill(0);
+        send_sizes.fill(zero<comm_index_type>());
         exec->run(matrix::make_count_non_owning_entries(
             data, tmp_row_partition.get(), local_part, send_sizes,
             send_positions, original_positions));
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index e7aad7689a8..8582fab9378 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -14,7 +14,7 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void non_owningverlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 1be2660721b..915ec109657 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -5,11 +5,11 @@
 #include "core/distributed/matrix_kernels.hpp"
 
 #include <algorithm>
-#include <numeric>
 
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -63,9 +63,8 @@ void count_non_owning_entries(
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    std::exclusive_scan(send_positions.get_data(),
-                        send_positions.get_data() + num_input_elements,
-                        send_positions.get_data(), 0);
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
+                                       num_input_elements);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(

From 4335e90fe2c4c7d88a27ac864ea99158535219c7 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Mon, 21 Oct 2024 14:00:26 +0200
Subject: [PATCH 313/448] Add missing include

---
 core/distributed/matrix.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index acd1b415ef5..dc04e9a9545 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,6 +4,7 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>

From d3c8a20d928934a842333087097332b455153c8c Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Tue, 22 Oct 2024 10:49:55 +0200
Subject: [PATCH 314/448] Fix circular dependency with array.fill

---
 common/cuda_hip/distributed/matrix_kernels.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 159ddbce296..a1a75a5af63 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -21,6 +21,7 @@
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
@@ -106,7 +107,9 @@ void count_non_owning_entries(
                                        num_input_elements);
     size_type num_parts = row_partition->get_num_parts();
     array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
-    row_part_ptrs.fill(0);
+    components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1,
+                           zero<comm_index_type>());
+
     components::convert_idxs_to_ptrs(
         exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
         num_parts, row_part_ptrs.get_data());

From ef2113a351eb4b44b12ae6ed89a3d3bf3c396653 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Mon, 4 Nov 2024 10:51:40 +0100
Subject: [PATCH 315/448] Address Review comments

---
 .../cuda_hip/distributed/matrix_kernels.cpp   |   2 -
 core/distributed/matrix.cpp                   | 190 ++++++++++--------
 include/ginkgo/core/distributed/matrix.hpp    |   2 +
 omp/distributed/matrix_kernels.cpp            |   2 +-
 reference/distributed/matrix_kernels.cpp      |   2 +-
 5 files changed, 111 insertions(+), 87 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index a1a75a5af63..dc736627ae2 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -107,8 +107,6 @@ void count_non_owning_entries(
                                        num_input_elements);
     size_type num_parts = row_partition->get_num_parts();
     array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
-    components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1,
-                           zero<comm_index_type>());
 
     components::convert_idxs_to_ptrs(
         exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index dc04e9a9545..04f5bbf1ed4 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -31,6 +31,111 @@ GKO_REGISTER_OPERATION(separate_local_nonlocal,
 
 
 }  // namespace
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void communicate_non_owning(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm,
+    const Partition<LocalIndexType, GlobalIndexType>* row_partition,
+    const Partition<LocalIndexType, GlobalIndexType>* col_partition,
+    const device_matrix_data<ValueType, GlobalIndexType>& input_data,
+    device_matrix_data<ValueType, GlobalIndexType>& all_data)
+{
+    size_type num_entries = input_data.get_num_stored_elements();
+    size_type num_parts = comm.size();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto global_num_rows = row_partition->get_size();
+    auto global_num_cols = col_partition->get_size();
+    auto local_part = comm.rank();
+    dim<2> global_dim{global_num_rows, global_num_cols};
+    array<comm_index_type> send_sizes{exec, num_parts};
+    array<GlobalIndexType> send_positions{exec, num_entries};
+    array<GlobalIndexType> original_positions{exec, num_entries};
+    send_sizes.fill(zero<comm_index_type>());
+    exec->run(matrix::make_count_non_owning_entries(
+        input_data, row_partition, local_part, send_sizes, send_positions,
+        original_positions));
+
+    send_sizes.set_executor(exec->get_master());
+    array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
+    array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
+    array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
+
+    std::partial_sum(send_sizes.get_data(), send_sizes.get_data() + num_parts,
+                     send_offsets.get_data() + 1);
+    comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(), 1);
+    std::partial_sum(recv_sizes.get_data(), recv_sizes.get_data() + num_parts,
+                     recv_offsets.get_data() + 1);
+    send_offsets.get_data()[0] = 0;
+    recv_offsets.get_data()[0] = 0;
+
+    size_type n_send = send_offsets.get_data()[num_parts];
+    size_type n_recv = recv_offsets.get_data()[num_parts];
+    array<GlobalIndexType> send_row_idxs{exec, n_send};
+    array<GlobalIndexType> send_col_idxs{exec, n_send};
+    array<ValueType> send_values{exec, n_send};
+    array<GlobalIndexType> recv_row_idxs{exec, n_recv};
+    array<GlobalIndexType> recv_col_idxs{exec, n_recv};
+    array<ValueType> recv_values{exec, n_recv};
+    exec->run(matrix::make_fill_send_buffers(
+        input_data, row_partition, local_part, send_positions,
+        original_positions, send_row_idxs, send_col_idxs, send_values));
+
+    if (use_host_buffer) {
+        send_row_idxs.set_executor(exec->get_master());
+        send_col_idxs.set_executor(exec->get_master());
+        send_values.set_executor(exec->get_master());
+        recv_row_idxs.set_executor(exec->get_master());
+        recv_col_idxs.set_executor(exec->get_master());
+        recv_values.set_executor(exec->get_master());
+    }
+    auto row_req = comm.i_all_to_all_v(
+        use_host_buffer ? exec : exec->get_master(),
+        send_row_idxs.get_const_data(), send_sizes.get_data(),
+        send_offsets.get_data(), recv_row_idxs.get_data(),
+        recv_sizes.get_data(), recv_offsets.get_data());
+    auto col_req = comm.i_all_to_all_v(
+        use_host_buffer ? exec : exec->get_master(),
+        send_col_idxs.get_const_data(), send_sizes.get_data(),
+        send_offsets.get_data(), recv_col_idxs.get_data(),
+        recv_sizes.get_data(), recv_offsets.get_data());
+    auto val_req =
+        comm.i_all_to_all_v(use_host_buffer ? exec : exec->get_master(),
+                            send_values.get_const_data(), send_sizes.get_data(),
+                            send_offsets.get_data(), recv_values.get_data(),
+                            recv_sizes.get_data(), recv_offsets.get_data());
+
+    array<GlobalIndexType> all_row_idxs{exec, num_entries + n_recv};
+    array<GlobalIndexType> all_col_idxs{exec, num_entries + n_recv};
+    array<ValueType> all_values{exec, num_entries + n_recv};
+    exec->copy_from(exec, num_entries, input_data.get_const_row_idxs(),
+                    all_row_idxs.get_data());
+    exec->copy_from(exec, num_entries, input_data.get_const_values(),
+                    all_values.get_data());
+    exec->copy_from(exec, num_entries, input_data.get_const_col_idxs(),
+                    all_col_idxs.get_data());
+
+    row_req.wait();
+    col_req.wait();
+    val_req.wait();
+    if (use_host_buffer) {
+        recv_row_idxs.set_executor(exec);
+        recv_col_idxs.set_executor(exec);
+        recv_values.set_executor(exec);
+    }
+    exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
+                    all_row_idxs.get_data() + num_entries);
+    exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
+                    all_col_idxs.get_data() + num_entries);
+    exec->copy_from(exec, n_recv, recv_values.get_data(),
+                    all_values.get_data() + num_entries);
+    all_data = device_matrix_data<ValueType, GlobalIndexType>{
+        exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
+        std::move(all_values)};
+    all_data.sum_duplicates();
+}
+
+
 }  // namespace matrix
 
 
@@ -271,89 +376,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
     device_matrix_data<value_type, global_index_type> all_data{exec};
     if (assembly_type == assembly_mode::communicate) {
-        size_type num_entries = data.get_num_stored_elements();
-        size_type num_parts = comm.size();
-        array<comm_index_type> send_sizes{exec, num_parts};
-        array<global_index_type> send_positions{exec, num_entries};
-        array<global_index_type> original_positions{exec, num_entries};
-        send_sizes.fill(zero<comm_index_type>());
-        exec->run(matrix::make_count_non_owning_entries(
-            data, tmp_row_partition.get(), local_part, send_sizes,
-            send_positions, original_positions));
-
-        send_sizes.set_executor(exec->get_master());
-        array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
-        array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
-        array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
-
-        std::partial_sum(send_sizes.get_data(),
-                         send_sizes.get_data() + num_parts,
-                         send_offsets.get_data() + 1);
-        comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(),
-                        1);
-        std::partial_sum(recv_sizes.get_data(),
-                         recv_sizes.get_data() + num_parts,
-                         recv_offsets.get_data() + 1);
-        send_offsets.get_data()[0] = 0;
-        recv_offsets.get_data()[0] = 0;
-
-        size_type n_send = send_offsets.get_data()[num_parts];
-        size_type n_recv = recv_offsets.get_data()[num_parts];
-        array<global_index_type> send_row_idxs{exec, n_send};
-        array<global_index_type> send_col_idxs{exec, n_send};
-        array<value_type> send_values{exec, n_send};
-        array<global_index_type> recv_row_idxs{exec, n_recv};
-        array<global_index_type> recv_col_idxs{exec, n_recv};
-        array<value_type> recv_values{exec, n_recv};
-        exec->run(matrix::make_fill_send_buffers(
-            data, tmp_row_partition.get(), local_part, send_positions,
-            original_positions, send_row_idxs, send_col_idxs, send_values));
-
-        if (use_host_buffer) {
-            send_row_idxs.set_executor(exec->get_master());
-            send_col_idxs.set_executor(exec->get_master());
-            send_values.set_executor(exec->get_master());
-            recv_row_idxs.set_executor(exec->get_master());
-            recv_col_idxs.set_executor(exec->get_master());
-            recv_values.set_executor(exec->get_master());
-        }
-        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
-                          send_row_idxs.get_const_data(), send_sizes.get_data(),
-                          send_offsets.get_data(), recv_row_idxs.get_data(),
-                          recv_sizes.get_data(), recv_offsets.get_data());
-        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
-                          send_col_idxs.get_const_data(), send_sizes.get_data(),
-                          send_offsets.get_data(), recv_col_idxs.get_data(),
-                          recv_sizes.get_data(), recv_offsets.get_data());
-        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
-                          send_values.get_const_data(), send_sizes.get_data(),
-                          send_offsets.get_data(), recv_values.get_data(),
-                          recv_sizes.get_data(), recv_offsets.get_data());
-        if (use_host_buffer) {
-            recv_row_idxs.set_executor(exec);
-            recv_col_idxs.set_executor(exec);
-            recv_values.set_executor(exec);
-        }
-
-        array<global_index_type> all_row_idxs{exec, num_entries + n_recv};
-        array<global_index_type> all_col_idxs{exec, num_entries + n_recv};
-        array<value_type> all_values{exec, num_entries + n_recv};
-        exec->copy_from(exec, num_entries, data.get_const_row_idxs(),
-                        all_row_idxs.get_data());
-        exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
-                        all_row_idxs.get_data() + num_entries);
-        exec->copy_from(exec, num_entries, data.get_const_col_idxs(),
-                        all_col_idxs.get_data());
-        exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
-                        all_col_idxs.get_data() + num_entries);
-        exec->copy_from(exec, num_entries, data.get_const_values(),
-                        all_values.get_data());
-        exec->copy_from(exec, n_recv, recv_values.get_data(),
-                        all_values.get_data() + num_entries);
-        all_data = device_matrix_data<value_type, global_index_type>{
-            exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
-            std::move(all_values)};
-        all_data.sum_duplicates();
+        matrix::communicate_non_owning(exec, comm, tmp_row_partition.get(),
+                                       tmp_col_partition.get(), data, all_data);
     }
 
     // temporary storage for the output
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 0e8814e9cec..4470069aba7 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -303,6 +303,7 @@ class Matrix
      *
      * @param data  The device_matrix_data structure.
      * @param partition  The global row and column partition.
+     * @param assembly_type  The mode of assembly.
      *
      * @return the index_map induced by the partitions and the matrix structure
      */
@@ -341,6 +342,7 @@ class Matrix
      * @param data  The device_matrix_data structure.
      * @param row_partition  The global row partition.
      * @param col_partition  The global col partition.
+     * @param assembly_type  The mode of assembly.
      *
      * @return the index_map induced by the partitions and the matrix structure
      */
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index a42b1f230fe..8276519f0ff 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -53,7 +53,7 @@ void count_non_owning_entries(
         }
     }
 
-    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
         comm_index_type a =
             i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
         comm_index_type b =
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 915ec109657..410be379eff 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -48,7 +48,7 @@ void count_non_owning_entries(
         }
     }
 
-    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
         comm_index_type a =
             i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
         comm_index_type b =

From 16dc9a88ca33546e7f106e0a1db498c0e0e2d94a Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Wed, 20 Nov 2024 18:20:48 +0100
Subject: [PATCH 316/448] Move additive read distributed to free function

---
 common/cuda_hip/CMakeLists.txt                |   1 +
 .../distributed/assembly_helpers_kernels.cpp  | 137 ++++++++++++
 .../cuda_hip/distributed/matrix_kernels.cpp   | 112 ----------
 core/CMakeLists.txt                           |   1 +
 core/device_hooks/common_kernels.inc.cpp      |  12 +-
 core/distributed/assembly_helpers.cpp         | 142 +++++++++++++
 core/distributed/assembly_helpers_kernels.hpp |  69 ++++++
 core/distributed/matrix.cpp                   | 169 ++++-----------
 core/distributed/matrix_kernels.hpp           |  43 +---
 dpcpp/CMakeLists.txt                          |   1 +
 .../assembly_helpers_kernels.dp.cpp           |  49 +++++
 dpcpp/distributed/matrix_kernels.dp.cpp       |  30 ---
 .../core/distributed/assembly_helpers.hpp     |  41 ++++
 include/ginkgo/core/distributed/matrix.hpp    |  45 +++-
 include/ginkgo/ginkgo.hpp                     |   1 +
 omp/CMakeLists.txt                            |   1 +
 omp/distributed/assembly_helpers_kernels.cpp  | 114 ++++++++++
 omp/distributed/matrix_kernels.cpp            |  88 --------
 reference/CMakeLists.txt                      |   1 +
 .../distributed/assembly_helpers_kernels.cpp  | 107 ++++++++++
 reference/distributed/matrix_kernels.cpp      |  85 --------
 reference/test/distributed/CMakeLists.txt     |   1 +
 .../distributed/assembly_helpers_kernels.cpp  | 153 ++++++++++++++
 reference/test/distributed/matrix_kernels.cpp |  95 ---------
 test/distributed/CMakeLists.txt               |   1 +
 test/distributed/assembly_helpers_kernels.cpp | 197 ++++++++++++++++++
 test/distributed/matrix_kernels.cpp           |  55 +----
 test/mpi/matrix.cpp                           |  20 --
 28 files changed, 1118 insertions(+), 653 deletions(-)
 create mode 100644 common/cuda_hip/distributed/assembly_helpers_kernels.cpp
 create mode 100644 core/distributed/assembly_helpers.cpp
 create mode 100644 core/distributed/assembly_helpers_kernels.hpp
 create mode 100644 dpcpp/distributed/assembly_helpers_kernels.dp.cpp
 create mode 100644 include/ginkgo/core/distributed/assembly_helpers.hpp
 create mode 100644 omp/distributed/assembly_helpers_kernels.cpp
 create mode 100644 reference/distributed/assembly_helpers_kernels.cpp
 create mode 100644 reference/test/distributed/assembly_helpers_kernels.cpp
 create mode 100644 test/distributed/assembly_helpers_kernels.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 267444d2144..dd38ca4f7b2 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -4,6 +4,7 @@ set(CUDA_HIP_SOURCES
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
+    distributed/assembly_helpers_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/common/cuda_hip/distributed/assembly_helpers_kernels.cpp b/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..85b65c8f56f
--- /dev/null
+++ b/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,137 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace assembly_helpers {
+
+
+// Determines for each part how many entries of `input` have a row belonging to
+// it (0 for local_part) and prepares the positions used to fill send buffers.
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_non_owning_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto row_part_ids = row_partition->get_part_ids();
+    const auto* row_range_bounds = row_partition->get_range_bounds();
+    const auto num_row_ranges = row_partition->get_num_ranges();
+    const auto num_input_elements = input.get_num_stored_elements();
+
+    auto policy = thrust_policy(exec);
+
+    // precompute the row range id of each input element
+    auto input_row_idxs = input.get_const_row_idxs();
+    array<size_type> row_range_ids{exec, num_input_elements};
+    thrust::upper_bound(policy, row_range_bounds + 1,
+                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
+                        input_row_idxs + num_input_elements,
+                        row_range_ids.get_data());
+    // look up the part id of each entry; entries of local_part get position -1
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
+                      auto part_ids_per_entry, auto orig_positions) {
+            part_ids_per_entry[i] = part_ids[range_ids[i]];
+            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
+        },
+        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
+        row_part_ids_per_entry.get_data(), original_positions.get_data());
+    // group the positions by part id, keeping the input order within a part
+    thrust::stable_sort_by_key(
+        policy, row_part_ids_per_entry.get_data(),
+        row_part_ids_per_entry.get_data() + num_input_elements,
+        original_positions.get_data());
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
+            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
+        },
+        num_input_elements, original_positions.get_const_data(),
+        send_positions.get_data());
+    // turn the 0/1 send flags into running offsets (TODO confirm: exclusive)
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
+                                       num_input_elements);
+    size_type num_parts = row_partition->get_num_parts();
+    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
+
+    components::convert_idxs_to_ptrs(
+        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
+        num_parts, row_part_ptrs.get_data());
+    // entries per part; nothing has to be sent to local_part itself
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
+            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
+        },
+        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
+    const array<GlobalIndexType>& original_positions,
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
+{
+    auto num_entries = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_values = input.get_const_values();
+    // copy the entries selected by original_positions into the send buffers
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
+                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
+                      auto out_vals) {
+            if (in_pos[i] >= 0) {  // negative position: nothing to copy
+                out_rows[out_pos[i]] = in_rows[in_pos[i]];
+                out_cols[out_pos[i]] = in_cols[in_pos[i]];
+                out_vals[out_pos[i]] = in_vals[in_pos[i]];
+            }
+        },
+        num_entries, input_row_idxs, input_col_idxs, input_values,
+        original_positions.get_const_data(), send_positions.get_const_data(),
+        send_row_idxs.get_data(), send_col_idxs.get_data(),
+        send_values.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index dc736627ae2..88988febbb0 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -20,10 +20,6 @@
 
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
 
 
 namespace gko {
@@ -53,114 +49,6 @@ struct input_type {
 };
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_non_owning_entries(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, array<comm_index_type>& send_count,
-    array<GlobalIndexType>& send_positions,
-    array<GlobalIndexType>& original_positions)
-{
-    auto row_part_ids = row_partition->get_part_ids();
-    const auto* row_range_bounds = row_partition->get_range_bounds();
-    const auto* row_range_starting_indices =
-        row_partition->get_range_starting_indices();
-    const auto num_row_ranges = row_partition->get_num_ranges();
-    const auto num_input_elements = input.get_num_stored_elements();
-
-    auto policy = thrust_policy(exec);
-
-    // precompute the row and column range id of each input element
-    auto input_row_idxs = input.get_const_row_idxs();
-    array<size_type> row_range_ids{exec, num_input_elements};
-    thrust::upper_bound(policy, row_range_bounds + 1,
-                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
-                        input_row_idxs + num_input_elements,
-                        row_range_ids.get_data());
-
-    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
-                      auto part_ids_per_entry, auto orig_positions) {
-            part_ids_per_entry[i] = part_ids[range_ids[i]];
-            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
-        },
-        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
-        row_part_ids_per_entry.get_data(), original_positions.get_data());
-
-    thrust::stable_sort_by_key(
-        policy, row_part_ids_per_entry.get_data(),
-        row_part_ids_per_entry.get_data() + num_input_elements,
-        original_positions.get_data());
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
-            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
-        },
-        num_input_elements, original_positions.get_const_data(),
-        send_positions.get_data());
-
-    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
-                                       num_input_elements);
-    size_type num_parts = row_partition->get_num_parts();
-    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
-
-    components::convert_idxs_to_ptrs(
-        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
-        num_parts, row_part_ptrs.get_data());
-
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
-            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
-        },
-        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-{
-    auto num_entries = input.get_num_stored_elements();
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto input_col_idxs = input.get_const_col_idxs();
-    auto input_values = input.get_const_values();
-
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
-                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
-                      auto out_vals) {
-            if (in_pos[i] >= 0) {
-                out_rows[out_pos[i]] = in_rows[in_pos[i]];
-                out_cols[out_pos[i]] = in_cols[in_pos[i]];
-                out_vals[out_pos[i]] = in_vals[in_pos[i]];
-            }
-        },
-        num_entries, input_row_idxs, input_col_idxs, input_values,
-        original_positions.get_const_data(), send_positions.get_const_data(),
-        send_row_idxs.get_data(), send_col_idxs.get_data(),
-        send_values.get_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 801ba46d248..bd98da373b7 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -138,6 +138,7 @@ if(GINKGO_BUILD_MPI)
         PRIVATE
         distributed/vector_cache.cpp
         mpi/exception.cpp
+        distributed/assembly_helpers.cpp
         distributed/matrix.cpp
         distributed/partition_helpers.cpp
         distributed/vector.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 6f5874b81ec..cf21e423326 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -16,6 +16,7 @@
 #include "core/components/precision_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/components/reduce_array_kernels.hpp"
+#include "core/distributed/assembly_helpers_kernels.hpp"
 #include "core/distributed/index_map_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
@@ -280,12 +281,21 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 
 }
 
-namespace distributed_matrix {
+
+namespace assembly_helpers {
 
 
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+
+
+namespace distributed_matrix {
+
+
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/core/distributed/assembly_helpers.cpp b/core/distributed/assembly_helpers.cpp
new file mode 100644
index 00000000000..ff2eec0bf32
--- /dev/null
+++ b/core/distributed/assembly_helpers.cpp
@@ -0,0 +1,142 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "ginkgo/core/distributed/assembly_helpers.hpp"
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/distributed/partition.hpp>
+
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+namespace assembly_helpers {
+namespace {
+
+
+GKO_REGISTER_OPERATION(count_non_owning_entries,
+                       assembly_helpers::count_non_owning_entries);
+GKO_REGISTER_OPERATION(fill_send_buffers, assembly_helpers::fill_send_buffers);
+
+
+}  // namespace
+}  // namespace assembly_helpers
+
+
+// Sends each entry of `input` whose row belongs to another part of `partition`
+// to that part and returns all entries of `input` together with the entries
+// received from the other ranks, with duplicate entries summed up.
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+device_matrix_data<ValueType, GlobalIndexType> assemble(
+    mpi::communicator comm,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition)
+{
+    auto exec = input.get_executor();
+    size_type num_entries = input.get_num_stored_elements();
+    size_type num_parts = comm.size();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto local_part = comm.rank();
+    auto global_dim = input.get_size();
+    array<comm_index_type> send_sizes{exec, num_parts};
+    array<GlobalIndexType> send_positions{exec, num_entries};
+    array<GlobalIndexType> original_positions{exec, num_entries};
+    send_sizes.fill(zero<comm_index_type>());
+    exec->run(assembly_helpers::make_count_non_owning_entries(
+        input, partition.get(), local_part, send_sizes, send_positions,
+        original_positions));
+
+    send_sizes.set_executor(exec->get_master());
+    array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
+    array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
+    array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
+    std::partial_sum(send_sizes.get_data(), send_sizes.get_data() + num_parts,
+                     send_offsets.get_data() + 1);
+    comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(), 1);
+    std::partial_sum(recv_sizes.get_data(), recv_sizes.get_data() + num_parts,
+                     recv_offsets.get_data() + 1);
+    send_offsets.get_data()[0] = 0;
+    recv_offsets.get_data()[0] = 0;
+
+    size_type n_send = send_offsets.get_data()[num_parts];
+    size_type n_recv = recv_offsets.get_data()[num_parts];
+    array<GlobalIndexType> send_row_idxs{exec, n_send};
+    array<GlobalIndexType> send_col_idxs{exec, n_send};
+    array<ValueType> send_values{exec, n_send};
+    array<GlobalIndexType> recv_row_idxs{exec, n_recv};
+    array<GlobalIndexType> recv_col_idxs{exec, n_recv};
+    array<ValueType> recv_values{exec, n_recv};
+    exec->run(assembly_helpers::make_fill_send_buffers(
+        input, partition.get(), local_part, send_positions, original_positions,
+        send_row_idxs, send_col_idxs, send_values));
+
+    if (use_host_buffer) {
+        send_row_idxs.set_executor(exec->get_master());
+        send_col_idxs.set_executor(exec->get_master());
+        send_values.set_executor(exec->get_master());
+        recv_row_idxs.set_executor(exec->get_master());
+        recv_col_idxs.set_executor(exec->get_master());
+        recv_values.set_executor(exec->get_master());
+    }
+    // buffers are on the host iff use_host_buffer; pass the matching executor
+    auto comm_exec = use_host_buffer ? exec->get_master() : exec;
+    auto row_req = comm.i_all_to_all_v(
+        comm_exec, send_row_idxs.get_const_data(), send_sizes.get_data(),
+        send_offsets.get_data(), recv_row_idxs.get_data(),
+        recv_sizes.get_data(), recv_offsets.get_data());
+    auto col_req = comm.i_all_to_all_v(
+        comm_exec, send_col_idxs.get_const_data(), send_sizes.get_data(),
+        send_offsets.get_data(), recv_col_idxs.get_data(),
+        recv_sizes.get_data(), recv_offsets.get_data());
+    auto val_req = comm.i_all_to_all_v(
+        comm_exec, send_values.get_const_data(), send_sizes.get_data(),
+        send_offsets.get_data(), recv_values.get_data(),
+        recv_sizes.get_data(), recv_offsets.get_data());
+    // meanwhile copy the local entries; the requests are waited for below
+    array<GlobalIndexType> all_row_idxs{exec, num_entries + n_recv};
+    array<GlobalIndexType> all_col_idxs{exec, num_entries + n_recv};
+    array<ValueType> all_values{exec, num_entries + n_recv};
+    exec->copy_from(exec, num_entries, input.get_const_row_idxs(),
+                    all_row_idxs.get_data());
+    exec->copy_from(exec, num_entries, input.get_const_values(),
+                    all_values.get_data());
+    exec->copy_from(exec, num_entries, input.get_const_col_idxs(),
+                    all_col_idxs.get_data());
+
+    row_req.wait();
+    col_req.wait();
+    val_req.wait();
+    if (use_host_buffer) {
+        recv_row_idxs.set_executor(exec);
+        recv_col_idxs.set_executor(exec);
+        recv_values.set_executor(exec);
+    }
+    exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
+                    all_row_idxs.get_data() + num_entries);
+    exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
+                    all_col_idxs.get_data() + num_entries);
+    exec->copy_from(exec, n_recv, recv_values.get_data(),
+                    all_values.get_data() + num_entries);
+    auto all_data = device_matrix_data<ValueType, GlobalIndexType>{
+        exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
+        std::move(all_values)};
+    all_data.sum_duplicates();
+    return all_data;
+}
+
+#define GKO_DECLARE_ASSEMBLE(_value_type, _local_type, _global_type) \
+    device_matrix_data<_value_type, _global_type> assemble(          \
+        mpi::communicator comm,                                      \
+        const device_matrix_data<_value_type, _global_type>& input,  \
+        ptr_param<const Partition<_local_type, _global_type>> partition)
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_ASSEMBLE);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
diff --git a/core/distributed/assembly_helpers_kernels.hpp b/core/distributed/assembly_helpers_kernels.hpp
new file mode 100644
index 00000000000..6adfc104487
--- /dev/null
+++ b/core/distributed/assembly_helpers_kernels.hpp
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
+#define GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/distributed/partition.hpp>
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \
+                                             GlobalIndexType)           \
+    void count_non_owning_entries(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                    \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* row_partition,            \
+        comm_index_type local_part, array<comm_index_type>& send_count, \
+        array<GlobalIndexType>& send_positions,                         \
+        array<GlobalIndexType>& original_positions)
+
+
+#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType,     \
+                                      GlobalIndexType)               \
+    void fill_send_buffers(                                          \
+        std::shared_ptr<const DefaultExecutor> exec,                 \
+        const device_matrix_data<ValueType, GlobalIndexType>& input, \
+        const experimental::distributed::Partition<                  \
+            LocalIndexType, GlobalIndexType>* row_partition,         \
+        comm_index_type local_part,                                  \
+        const array<GlobalIndexType>& send_positions,                \
+        const array<GlobalIndexType>& original_positions,            \
+        array<GlobalIndexType>& send_row_idxs,                       \
+        array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
+    using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType,     \
+                                         GlobalIndexType);              \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(assembly_helpers,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 04f5bbf1ed4..c99f5128ec4 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -6,6 +6,7 @@
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
+#include <ginkgo/core/distributed/assembly_helpers.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -22,120 +23,11 @@ namespace matrix {
 namespace {
 
 
-GKO_REGISTER_OPERATION(count_non_owning_entries,
-                       distributed_matrix::count_non_owning_entries);
-GKO_REGISTER_OPERATION(fill_send_buffers,
-                       distributed_matrix::fill_send_buffers);
 GKO_REGISTER_OPERATION(separate_local_nonlocal,
                        distributed_matrix::separate_local_nonlocal);
 
 
 }  // namespace
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void communicate_non_owning(
-    std::shared_ptr<const Executor> exec, mpi::communicator comm,
-    const Partition<LocalIndexType, GlobalIndexType>* row_partition,
-    const Partition<LocalIndexType, GlobalIndexType>* col_partition,
-    const device_matrix_data<ValueType, GlobalIndexType>& input_data,
-    device_matrix_data<ValueType, GlobalIndexType>& all_data)
-{
-    size_type num_entries = input_data.get_num_stored_elements();
-    size_type num_parts = comm.size();
-    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
-    auto global_num_rows = row_partition->get_size();
-    auto global_num_cols = col_partition->get_size();
-    auto local_part = comm.rank();
-    dim<2> global_dim{global_num_rows, global_num_cols};
-    array<comm_index_type> send_sizes{exec, num_parts};
-    array<GlobalIndexType> send_positions{exec, num_entries};
-    array<GlobalIndexType> original_positions{exec, num_entries};
-    send_sizes.fill(zero<comm_index_type>());
-    exec->run(matrix::make_count_non_owning_entries(
-        input_data, row_partition, local_part, send_sizes, send_positions,
-        original_positions));
-
-    send_sizes.set_executor(exec->get_master());
-    array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
-    array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
-    array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
-
-    std::partial_sum(send_sizes.get_data(), send_sizes.get_data() + num_parts,
-                     send_offsets.get_data() + 1);
-    comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(), 1);
-    std::partial_sum(recv_sizes.get_data(), recv_sizes.get_data() + num_parts,
-                     recv_offsets.get_data() + 1);
-    send_offsets.get_data()[0] = 0;
-    recv_offsets.get_data()[0] = 0;
-
-    size_type n_send = send_offsets.get_data()[num_parts];
-    size_type n_recv = recv_offsets.get_data()[num_parts];
-    array<GlobalIndexType> send_row_idxs{exec, n_send};
-    array<GlobalIndexType> send_col_idxs{exec, n_send};
-    array<ValueType> send_values{exec, n_send};
-    array<GlobalIndexType> recv_row_idxs{exec, n_recv};
-    array<GlobalIndexType> recv_col_idxs{exec, n_recv};
-    array<ValueType> recv_values{exec, n_recv};
-    exec->run(matrix::make_fill_send_buffers(
-        input_data, row_partition, local_part, send_positions,
-        original_positions, send_row_idxs, send_col_idxs, send_values));
-
-    if (use_host_buffer) {
-        send_row_idxs.set_executor(exec->get_master());
-        send_col_idxs.set_executor(exec->get_master());
-        send_values.set_executor(exec->get_master());
-        recv_row_idxs.set_executor(exec->get_master());
-        recv_col_idxs.set_executor(exec->get_master());
-        recv_values.set_executor(exec->get_master());
-    }
-    auto row_req = comm.i_all_to_all_v(
-        use_host_buffer ? exec : exec->get_master(),
-        send_row_idxs.get_const_data(), send_sizes.get_data(),
-        send_offsets.get_data(), recv_row_idxs.get_data(),
-        recv_sizes.get_data(), recv_offsets.get_data());
-    auto col_req = comm.i_all_to_all_v(
-        use_host_buffer ? exec : exec->get_master(),
-        send_col_idxs.get_const_data(), send_sizes.get_data(),
-        send_offsets.get_data(), recv_col_idxs.get_data(),
-        recv_sizes.get_data(), recv_offsets.get_data());
-    auto val_req =
-        comm.i_all_to_all_v(use_host_buffer ? exec : exec->get_master(),
-                            send_values.get_const_data(), send_sizes.get_data(),
-                            send_offsets.get_data(), recv_values.get_data(),
-                            recv_sizes.get_data(), recv_offsets.get_data());
-
-    array<GlobalIndexType> all_row_idxs{exec, num_entries + n_recv};
-    array<GlobalIndexType> all_col_idxs{exec, num_entries + n_recv};
-    array<ValueType> all_values{exec, num_entries + n_recv};
-    exec->copy_from(exec, num_entries, input_data.get_const_row_idxs(),
-                    all_row_idxs.get_data());
-    exec->copy_from(exec, num_entries, input_data.get_const_values(),
-                    all_values.get_data());
-    exec->copy_from(exec, num_entries, input_data.get_const_col_idxs(),
-                    all_col_idxs.get_data());
-
-    row_req.wait();
-    col_req.wait();
-    val_req.wait();
-    if (use_host_buffer) {
-        recv_row_idxs.set_executor(exec);
-        recv_col_idxs.set_executor(exec);
-        recv_values.set_executor(exec);
-    }
-    exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
-                    all_row_idxs.get_data() + num_entries);
-    exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
-                    all_col_idxs.get_data() + num_entries);
-    exec->copy_from(exec, n_recv, recv_values.get_data(),
-                    all_values.get_data() + num_entries);
-    all_data = device_matrix_data<ValueType, GlobalIndexType>{
-        exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
-        std::move(all_values)};
-    all_data.sum_duplicates();
-}
-
-
 }  // namespace matrix
 
 
@@ -354,8 +246,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition,
-    assembly_mode assembly_type)
+        col_partition)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -374,12 +265,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     dim<2> global_dim{global_num_rows, global_num_cols};
     this->set_size(global_dim);
 
-    device_matrix_data<value_type, global_index_type> all_data{exec};
-    if (assembly_type == assembly_mode::communicate) {
-        matrix::communicate_non_owning(exec, comm, tmp_row_partition.get(),
-                                       tmp_col_partition.get(), data, all_data);
-    }
-
     // temporary storage for the output
     array<local_index_type> local_row_idxs{exec};
     array<local_index_type> local_col_idxs{exec};
@@ -393,8 +278,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        assembly_type == assembly_mode::communicate ? all_data : data,
-        tmp_row_partition.get(), tmp_col_partition.get(), local_part,
+        data, tmp_row_partition.get(), tmp_col_partition.get(), local_part,
         local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
         global_non_local_col_idxs, non_local_values));
 
@@ -477,13 +361,25 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition,
-    assembly_mode assembly_type)
+        col_partition)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        row_partition, col_partition, assembly_type);
+        row_partition, col_partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<ValueType, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        partition, partition);
 }
 
 
@@ -494,10 +390,28 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         partition,
     assembly_mode assembly_type)
 {
-    return this->read_distributed(
+    if (assembly_type == assembly_mode::local_only) {
+        return this->read_distributed(
+            device_matrix_data<value_type, global_index_type>::create_from_host(
+                this->get_executor(), data),
+            partition, partition);
+    }
+    auto all_data = assemble<ValueType, LocalIndexType, GlobalIndexType>(
+        this->get_communicator(),
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        partition, partition, assembly_type);
+        partition);
+    return this->read_distributed(all_data, partition, partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<ValueType, GlobalIndexType>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(data, partition, partition);
 }
 
 
@@ -508,7 +422,12 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         partition,
     assembly_mode assembly_type)
 {
-    return this->read_distributed(data, partition, partition, assembly_type);
+    if (assembly_type == assembly_mode::local_only) {
+        return this->read_distributed(data, partition, partition);
+    }
+    auto all_data = assemble<ValueType, LocalIndexType, GlobalIndexType>(
+        this->get_communicator(), data, partition);
+    return this->read_distributed(all_data, partition, partition);
 }
 
 
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index 3ba02c27718..f24e8c9945e 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,32 +19,6 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \
-                                             GlobalIndexType)           \
-    void count_non_owning_entries(                                      \
-        std::shared_ptr<const DefaultExecutor> exec,                    \
-        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
-        const experimental::distributed::Partition<                     \
-            LocalIndexType, GlobalIndexType>* row_partition,            \
-        comm_index_type local_part, array<comm_index_type>& send_count, \
-        array<GlobalIndexType>& send_positions,                         \
-        array<GlobalIndexType>& original_positions)
-
-
-#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType,     \
-                                      GlobalIndexType)               \
-    void fill_send_buffers(                                          \
-        std::shared_ptr<const DefaultExecutor> exec,                 \
-        const device_matrix_data<ValueType, GlobalIndexType>& input, \
-        const experimental::distributed::Partition<                  \
-            LocalIndexType, GlobalIndexType>* row_partition,         \
-        comm_index_type local_part,                                  \
-        const array<GlobalIndexType>& send_positions,                \
-        const array<GlobalIndexType>& original_positions,            \
-        array<GlobalIndexType>& send_row_idxs,                       \
-        array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-
-
 #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,         \
                                             GlobalIndexType)                   \
     void separate_local_nonlocal(                                              \
@@ -61,18 +35,11 @@ namespace kernels {
         array<ValueType>& non_local_values)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                           \
-    using comm_index_type = experimental::distributed::comm_index_type;        \
-    template <typename ValueType, typename LocalIndexType,                     \
-              typename GlobalIndexType>                                        \
-    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType,            \
-                                         GlobalIndexType);                     \
-    template <typename ValueType, typename LocalIndexType,                     \
-              typename GlobalIndexType>                                        \
-    GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType); \
-    template <typename ValueType, typename LocalIndexType,                     \
-              typename GlobalIndexType>                                        \
-    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,             \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
+    using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,      \
                                         GlobalIndexType)
 
 
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index fcf123a513b..15055cc6645 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -20,6 +20,7 @@ target_sources(ginkgo_dpcpp
     base/timer.dp.cpp
     base/version.dp.cpp
     components/prefix_sum_kernels.dp.cpp
+    distributed/assembly_helpers_kernels.dp.cpp
     distributed/index_map_kernels.dp.cpp
     distributed/matrix_kernels.dp.cpp
     distributed/partition_helpers_kernels.dp.cpp
diff --git a/dpcpp/distributed/assembly_helpers_kernels.dp.cpp b/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
new file mode 100644
index 00000000000..f86b46f0846
--- /dev/null
+++ b/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace assembly_helpers {
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_non_owning_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
+    array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
+    const array<GlobalIndexType>& original_positions,
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs,
+    array<ValueType>& send_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 8582fab9378..47adaaeca59 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -13,36 +13,6 @@ namespace dpcpp {
 namespace distributed_matrix {
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_non_owning_entries(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, array<comm_index_type>& send_count,
-    array<GlobalIndexType>& send_positions,
-    array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs,
-    array<ValueType>& send_values) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly_helpers.hpp
new file mode 100644
index 00000000000..11237fc8d56
--- /dev/null
+++ b/include/ginkgo/core/distributed/assembly_helpers.hpp
@@ -0,0 +1,41 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
+
+
+#include <ginkgo/config.hpp>
+
+
+#if GINKGO_BUILD_MPI
+
+
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/base/range.hpp>
+
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+
+template <typename LocalIndexType, typename GlobalIndexType>
+class Partition;
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+device_matrix_data<ValueType, GlobalIndexType> assemble(
+    mpi::communicator comm,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
+
+
+#endif  // GINKGO_BUILD_MPI
+#endif  // GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 4470069aba7..917d8a1bab6 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -290,6 +290,27 @@ class Matrix
     void move_to(Matrix<next_precision<value_type>, local_index_type,
                         global_index_type>* result) override;
 
+    /**
+     * Reads a square matrix from the device_matrix_data structure and a global
+     * partition.
+     *
+     * The global size of the final matrix is inferred from the size of the
+     * partition. The size of the device_matrix_data is not used to size the
+     * matrix, but its number of rows is asserted to equal the partition size.
+     *
+     * @note The matrix data can contain entries for rows other than those owned
+     *        by the process. Entries for those rows are discarded.
+     *
+     * @param data  The device_matrix_data structure.
+     * @param partition  The global row and column partition.
+     *
+     * @note This overload is declared void and does not return an index_map.
+     */
+    void read_distributed(
+        const device_matrix_data<value_type, global_index_type>& data,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            partition);
+
     /**
      * Reads a square matrix from the device_matrix_data structure and a global
      * partition.
@@ -311,7 +332,21 @@ class Matrix
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly_mode assembly_type = assembly_mode::local_only);
+        assembly_mode assembly_type);
+
+    /**
+     * Reads a square matrix from the matrix_data structure and a global
+     * partition.
+     *
+     * @see read_distributed
+     *
+     * @note For efficiency it is advised to use the device_matrix_data
+     * overload.
+     */
+    void read_distributed(
+        const matrix_data<value_type, global_index_type>& data,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            partition);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -326,7 +361,7 @@ class Matrix
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly_mode assembly_type = assembly_mode::local_only);
+        assembly_mode assembly_type);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -351,8 +386,7 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition,
-        assembly_mode assembly_type = assembly_mode::local_only);
+            col_partition);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -368,8 +402,7 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition,
-        assembly_mode assembly_type = assembly_mode::local_only);
+            col_partition);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index b36ece34b8d..78f6c80381a 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -58,6 +58,7 @@
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
 
+#include <ginkgo/core/distributed/assembly_helpers.hpp>
 #include <ginkgo/core/distributed/base.hpp>
 #include <ginkgo/core/distributed/index_map.hpp>
 #include <ginkgo/core/distributed/lin_op.hpp>
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index fef0702048f..945f174313f 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(ginkgo_omp
     base/scoped_device_id.cpp
     base/version.cpp
     components/prefix_sum_kernels.cpp
+    distributed/assembly_helpers_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/omp/distributed/assembly_helpers_kernels.cpp b/omp/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..93706671562
--- /dev/null
+++ b/omp/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <algorithm>
+
+#include <omp.h>
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "core/base/allocator.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "reference/distributed/partition_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace assembly_helpers {
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_non_owning_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto num_input_elements = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+
+    size_type row_range_id = 0;
+#pragma omp parallel for firstprivate(row_range_id)
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
+        if (row_part_id != local_part) {
+#pragma omp atomic
+            send_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
+        }
+    }
+
+    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+
+#pragma omp parallel for
+    for (size_type i = 0; i < num_input_elements; i++) {
+        send_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
+                                       num_input_elements);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
+    const array<GlobalIndexType>& original_positions,
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+
+#pragma omp parallel for
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 8276519f0ff..2f36ec4a778 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -4,8 +4,6 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-#include <algorithm>
-
 #include <omp.h>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -22,92 +20,6 @@ namespace omp {
 namespace distributed_matrix {
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_non_owning_entries(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, array<comm_index_type>& send_count,
-    array<GlobalIndexType>& send_positions,
-    array<GlobalIndexType>& original_positions)
-{
-    auto num_input_elements = input.get_num_stored_elements();
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto row_part_ids = row_partition->get_part_ids();
-    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
-
-    size_type row_range_id = 0;
-#pragma omp parallel for firstprivate(row_range_id)
-    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto global_row = input_row_idxs[i];
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        auto row_part_id = row_part_ids[row_range_id];
-        row_part_ids_per_entry.get_data()[i] = row_part_id;
-        if (row_part_id != local_part) {
-#pragma omp atomic
-            send_count.get_data()[row_part_id]++;
-            original_positions.get_data()[i] = i;
-        } else {
-            original_positions.get_data()[i] = -1;
-        }
-    }
-
-    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
-        comm_index_type a =
-            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
-        comm_index_type b =
-            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
-        return a < b;
-    };
-    std::stable_sort(original_positions.get_data(),
-                     original_positions.get_data() + num_input_elements, comp);
-
-#pragma omp parallel for
-    for (size_type i = 0; i < num_input_elements; i++) {
-        send_positions.get_data()[i] =
-            original_positions.get_const_data()[i] == -1 ? 0 : 1;
-    }
-
-    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
-                                       num_input_elements);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-{
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto input_col_idxs = input.get_const_col_idxs();
-    auto input_vals = input.get_const_values();
-
-#pragma omp parallel for
-    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto in_pos = original_positions.get_const_data()[i];
-        if (in_pos >= 0) {
-            auto out_pos = send_positions.get_const_data()[i];
-            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            send_values.get_data()[out_pos] = input_vals[in_pos];
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index e2f27dab57e..c39c4680b9d 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -12,6 +12,7 @@ target_sources(ginkgo_reference
     components/reduce_array_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/prefix_sum_kernels.cpp
+    distributed/assembly_helpers_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/reference/distributed/assembly_helpers_kernels.cpp b/reference/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..9f9632dd9d5
--- /dev/null
+++ b/reference/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <algorithm>
+
+#include "core/base/allocator.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+#include "core/base/iterator_factory.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "reference/distributed/partition_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace assembly_helpers {
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_non_owning_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto num_input_elements = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+
+    size_type row_range_id = 0;
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
+        if (row_part_id != local_part) {
+            send_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
+        }
+    }
+
+    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+    for (size_type i = 0; i < num_input_elements; i++) {
+        send_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
+                                       num_input_elements);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
+    const array<GlobalIndexType>& original_positions,
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 410be379eff..95176b34656 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -4,12 +4,9 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
-#include <algorithm>
-
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -19,88 +16,6 @@ namespace reference {
 namespace distributed_matrix {
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_non_owning_entries(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, array<comm_index_type>& send_count,
-    array<GlobalIndexType>& send_positions,
-    array<GlobalIndexType>& original_positions)
-{
-    auto num_input_elements = input.get_num_stored_elements();
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto row_part_ids = row_partition->get_part_ids();
-    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
-
-    size_type row_range_id = 0;
-    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto global_row = input_row_idxs[i];
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        auto row_part_id = row_part_ids[row_range_id];
-        row_part_ids_per_entry.get_data()[i] = row_part_id;
-        if (row_part_id != local_part) {
-            send_count.get_data()[row_part_id]++;
-            original_positions.get_data()[i] = i;
-        } else {
-            original_positions.get_data()[i] = -1;
-        }
-    }
-
-    auto comp = [&row_part_ids_per_entry, local_part](auto i, auto j) {
-        comm_index_type a =
-            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
-        comm_index_type b =
-            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
-        return a < b;
-    };
-
-    std::stable_sort(original_positions.get_data(),
-                     original_positions.get_data() + num_input_elements, comp);
-    for (size_type i = 0; i < num_input_elements; i++) {
-        send_positions.get_data()[i] =
-            original_positions.get_const_data()[i] == -1 ? 0 : 1;
-    }
-
-    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
-                                       num_input_elements);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-{
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto input_col_idxs = input.get_const_col_idxs();
-    auto input_vals = input.get_const_values();
-
-    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto in_pos = original_positions.get_const_data()[i];
-        if (in_pos >= 0) {
-            auto out_pos = send_positions.get_const_data()[i];
-            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            send_values.get_data()[out_pos] = input_vals[in_pos];
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/reference/test/distributed/CMakeLists.txt b/reference/test/distributed/CMakeLists.txt
index c4619369d6d..443eb05a03b 100644
--- a/reference/test/distributed/CMakeLists.txt
+++ b/reference/test/distributed/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(assembly_helpers_kernels)
 ginkgo_create_test(index_map_kernels)
 ginkgo_create_test(matrix_kernels)
 ginkgo_create_test(partition_helpers_kernels)
diff --git a/reference/test/distributed/assembly_helpers_kernels.cpp b/reference/test/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..b11b736e567
--- /dev/null
+++ b/reference/test/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,153 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <vector>
+
+#include <gtest/gtest-typed-test.h>
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+using comm_index_type = gko::experimental::distributed::comm_index_type;
+
+
+template <typename ValueLocalGlobalIndexType>
+class AssemblyHelpers : public ::testing::Test {
+protected:
+    using value_type = typename std::tuple_element<
+        0, decltype(ValueLocalGlobalIndexType())>::type;
+    using local_index_type = typename std::tuple_element<
+        1, decltype(ValueLocalGlobalIndexType())>::type;
+    using global_index_type = typename std::tuple_element<
+        2, decltype(ValueLocalGlobalIndexType())>::type;
+    using Mtx = gko::matrix::Csr<value_type, local_index_type>;
+
+    AssemblyHelpers() : ref(gko::ReferenceExecutor::create()), mapping{ref} {}
+
+    gko::device_matrix_data<value_type, global_index_type> create_input()
+    {
+        return gko::device_matrix_data<value_type, global_index_type>{
+            this->ref, gko::dim<2>{7, 7},
+            gko::array<global_index_type>{ref,
+                                          {0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 6}},
+            gko::array<global_index_type>{ref,
+                                          {0, 3, 1, 2, 2, 0, 3, 4, 6, 4, 5, 5}},
+            gko::array<value_type>{ref,
+                                   {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}};
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    gko::array<comm_index_type> mapping;
+};
+
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+                 TupleTypenameNameGenerator);
+
+
+TYPED_TEST(AssemblyHelpers, CountOverlapEntries)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using ca = gko::array<comm_index_type>;
+    using ga = gko::array<git>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ca> send_count_ref{ca{this->ref, I<comm_index_type>{0, 5, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 0, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    std::vector<ga> send_pos_ref{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_pos_ref{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input();
+
+    ca send_count{this->ref, static_cast<gko::size_type>(num_parts)};
+    ga send_positions{this->ref, input.get_num_stored_elements()};
+    ga original_positions{this->ref, input.get_num_stored_elements()};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        send_count.fill(0);
+
+        gko::kernels::reference::assembly_helpers::count_non_owning_entries(
+            this->ref, input, partition.get(), i, send_count, send_positions,
+            original_positions);
+
+        GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
+    }
+}
+
+
+TYPED_TEST(AssemblyHelpers, FillOverlapSendBuffers)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using vt = typename TestFixture::value_type;
+    using ga = gko::array<git>;
+    using va = gko::array<vt>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ga> send_positions{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_positions{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
+    std::vector<ga> send_row_idxs_ref{
+        ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
+    std::vector<ga> send_col_idxs_ref{
+        ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
+    std::vector<va> send_values_ref{
+        va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input();
+
+    gko::array<git> send_row_idxs{this->ref};
+    gko::array<git> send_col_idxs{this->ref};
+    gko::array<vt> send_values{this->ref};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        auto num_entries = send_row_idxs_ref[i].get_size();
+        send_row_idxs.resize_and_reset(num_entries);
+        send_col_idxs.resize_and_reset(num_entries);
+        send_values.resize_and_reset(num_entries);
+
+        gko::kernels::reference::assembly_helpers::fill_send_buffers(
+            this->ref, input, partition.get(), i, send_positions[i],
+            original_positions[i], send_row_idxs, send_col_idxs, send_values);
+
+        GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]);
+    }
+}
+
+
+}  // namespace
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 80063f7e582..a34844cbde9 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -186,101 +186,6 @@ TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
 
-TYPED_TEST(Matrix, CountOverlapEntries)
-{
-    using lit = typename TestFixture::local_index_type;
-    using git = typename TestFixture::global_index_type;
-    using ca = gko::array<comm_index_type>;
-    using ga = gko::array<git>;
-    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ca> send_count_ref{ca{this->ref, I<comm_index_type>{0, 5, 3}},
-                                   ca{this->ref, I<comm_index_type>{4, 0, 3}},
-                                   ca{this->ref, I<comm_index_type>{4, 5, 0}}};
-    std::vector<ga> send_pos_ref{
-        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
-        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
-        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
-    std::vector<ga> original_pos_ref{
-        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
-        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
-        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
-    comm_index_type num_parts = 3;
-    auto partition =
-        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
-            this->ref, this->mapping, num_parts);
-    auto input = this->create_input_full_rank();
-
-    ca send_count{this->ref, static_cast<gko::size_type>(num_parts)};
-    ga send_positions{this->ref, input.get_num_stored_elements()};
-    ga original_positions{this->ref, input.get_num_stored_elements()};
-    for (gko::size_type i = 0; i < num_parts; i++) {
-        send_count.fill(0);
-
-        gko::kernels::reference::distributed_matrix::count_non_owning_entries(
-            this->ref, input, partition.get(), i, send_count, send_positions,
-            original_positions);
-
-        GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
-    }
-}
-
-
-TYPED_TEST(Matrix, FillOverlapSendBuffers)
-{
-    using lit = typename TestFixture::local_index_type;
-    using git = typename TestFixture::global_index_type;
-    using vt = typename TestFixture::value_type;
-    using ga = gko::array<git>;
-    using va = gko::array<vt>;
-    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ga> send_positions{
-        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
-        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
-        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
-    std::vector<ga> original_positions{
-        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
-        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
-        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
-    std::vector<ga> send_row_idxs_ref{
-        ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
-        ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
-        ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
-    std::vector<ga> send_col_idxs_ref{
-        ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
-        ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
-        ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
-    std::vector<va> send_values_ref{
-        va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
-        va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
-        va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
-    comm_index_type num_parts = 3;
-    auto partition =
-        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
-            this->ref, this->mapping, num_parts);
-    auto input = this->create_input_full_rank();
-
-    gko::array<git> send_row_idxs{this->ref};
-    gko::array<git> send_col_idxs{this->ref};
-    gko::array<vt> send_values{this->ref};
-    for (gko::size_type i = 0; i < num_parts; i++) {
-        auto num_entries = send_row_idxs_ref[i].get_size();
-        send_row_idxs.resize_and_reset(num_entries);
-        send_col_idxs.resize_and_reset(num_entries);
-        send_values.resize_and_reset(num_entries);
-
-        gko::kernels::reference::distributed_matrix::fill_send_buffers(
-            this->ref, input, partition.get(), i, send_positions[i],
-            original_positions[i], send_row_idxs, send_col_idxs, send_values);
-
-        GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]);
-    }
-}
-
-
 TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty)
 {
     using lit = typename TestFixture::local_index_type;
diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt
index 9e0c875de0e..91a497020d5 100644
--- a/test/distributed/CMakeLists.txt
+++ b/test/distributed/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_common_test(assembly_helpers_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(index_map_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(matrix_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp)
diff --git a/test/distributed/assembly_helpers_kernels.cpp b/test/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..50b8c5c5469
--- /dev/null
+++ b/test/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,197 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <algorithm>
+
+#include <gtest/gtest-typed-test.h>
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+#include "core/test/utils.hpp"
+#include "test/utils/common_fixture.hpp"
+
+
+using comm_index_type = gko::experimental::distributed::comm_index_type;
+
+
+template <typename ValueLocalGlobalIndexType>
+class AssemblyHelpers : public CommonTestFixture {
+protected:
+    using value_type = typename std::tuple_element<
+        0, decltype(ValueLocalGlobalIndexType())>::type;
+    using local_index_type = typename std::tuple_element<
+        1, decltype(ValueLocalGlobalIndexType())>::type;
+    using global_index_type = typename std::tuple_element<
+        2, decltype(ValueLocalGlobalIndexType())>::type;
+
+    AssemblyHelpers() : engine(42) {}
+
+    void validate(
+        gko::ptr_param<const gko::experimental::distributed::Partition<
+            local_index_type, global_index_type>>
+            row_partition,
+        gko::ptr_param<const gko::experimental::distributed::Partition<
+            local_index_type, global_index_type>>
+            d_row_partition,
+        gko::device_matrix_data<value_type, global_index_type> input)
+    {
+        gko::device_matrix_data<value_type, global_index_type> d_input{exec,
+                                                                       input};
+        gko::size_type num_parts = row_partition->get_num_parts();
+        gko::size_type num_entries = input.get_num_stored_elements();
+        for (comm_index_type part = 0; part < num_parts; ++part) {
+            gko::array<comm_index_type> send_count{ref, num_parts};
+            send_count.fill(0);
+            gko::array<comm_index_type> d_send_count{exec, num_parts};
+            d_send_count.fill(0);
+            gko::array<global_index_type> send_positions{ref, num_entries};
+            gko::array<global_index_type> d_send_positions{exec, num_entries};
+            gko::array<global_index_type> original_positions{ref, num_entries};
+            gko::array<global_index_type> d_original_positions{exec,
+                                                               num_entries};
+
+            gko::kernels::reference::assembly_helpers::count_non_owning_entries(
+                ref, input, row_partition.get(), part, send_count,
+                send_positions, original_positions);
+            gko::kernels::GKO_DEVICE_NAMESPACE::assembly_helpers::
+                count_non_owning_entries(exec, d_input, d_row_partition.get(),
+                                         part, d_send_count, d_send_positions,
+                                         d_original_positions);
+
+            gko::array<global_index_type> send_offsets{ref, num_parts + 1};
+            std::partial_sum(send_count.get_data(),
+                             send_count.get_data() + num_parts,
+                             send_offsets.get_data() + 1);
+            send_offsets.get_data()[0] = 0;
+            gko::array<global_index_type> d_send_offsets{exec, send_offsets};
+            gko::size_type num_send_entries =
+                send_offsets.get_data()[num_parts];
+            gko::array<global_index_type> send_row_idxs{ref, num_send_entries};
+            gko::array<global_index_type> send_col_idxs{ref, num_send_entries};
+            gko::array<value_type> send_values{ref, num_send_entries};
+            gko::array<global_index_type> d_send_row_idxs{exec,
+                                                          num_send_entries};
+            gko::array<global_index_type> d_send_col_idxs{exec,
+                                                          num_send_entries};
+            gko::array<value_type> d_send_values{exec, num_send_entries};
+
+            gko::kernels::reference::assembly_helpers::fill_send_buffers(
+                ref, input, row_partition.get(), part, send_positions,
+                original_positions, send_row_idxs, send_col_idxs, send_values);
+            gko::kernels::GKO_DEVICE_NAMESPACE::assembly_helpers::
+                fill_send_buffers(exec, d_input, d_row_partition.get(), part,
+                                  d_send_positions, d_original_positions,
+                                  d_send_row_idxs, d_send_col_idxs,
+                                  d_send_values);
+
+            GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions);
+            GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
+            GKO_ASSERT_ARRAY_EQ(send_count, d_send_count);
+            GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_values, d_send_values);
+        }
+    }
+
+    std::default_random_engine engine;
+};
+
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+                 TupleTypenameNameGenerator);
+
+
+TYPED_TEST(AssemblyHelpers, AssembleDiagOffdiagEmptyIsSameAsRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using global_index_type = typename TestFixture::global_index_type;
+    gko::array<comm_index_type> mapping{this->ref, {1, 0, 2, 2, 0, 1, 1, 2}};
+    comm_index_type num_parts = 3;
+
+    auto partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->ref,
+                                                                 mapping,
+                                                                 num_parts);
+    auto d_partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->exec,
+                                                                 mapping,
+                                                                 num_parts);
+
+    this->validate(
+        partition, d_partition,
+        gko::device_matrix_data<value_type, global_index_type>{this->ref});
+}
+
+
+TYPED_TEST(AssemblyHelpers, AssembleLocalSmallIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using global_index_type = typename TestFixture::global_index_type;
+    gko::experimental::distributed::comm_index_type num_parts = 3;
+    gko::size_type num_rows = 10;
+    gko::size_type num_cols = 10;
+    auto mapping = gko::test::generate_random_array<
+        gko::experimental::distributed::comm_index_type>(
+        num_rows,
+        std::uniform_int_distribution<
+            gko::experimental::distributed::comm_index_type>(0, num_parts - 1),
+        this->engine, this->ref);
+    auto input = gko::test::generate_random_device_matrix_data<
+        value_type, global_index_type>(
+        num_rows, num_cols,
+        std::uniform_int_distribution<int>(0, static_cast<int>(num_cols - 1)),
+        std::uniform_real_distribution<gko::remove_complex<value_type>>(0, 1),
+        this->engine, this->ref);
+
+    auto partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->ref,
+                                                                 mapping,
+                                                                 num_parts);
+    auto d_partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->exec,
+                                                                 mapping,
+                                                                 num_parts);
+
+    this->validate(partition, d_partition, input);
+}
+
+
+TYPED_TEST(AssemblyHelpers, AssembleLocalIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using global_index_type = typename TestFixture::global_index_type;
+    gko::experimental::distributed::comm_index_type num_parts = 13;
+    gko::size_type num_rows = 67;
+    gko::size_type num_cols = 67;
+    auto mapping = gko::test::generate_random_array<
+        gko::experimental::distributed::comm_index_type>(
+        num_rows,
+        std::uniform_int_distribution<
+            gko::experimental::distributed::comm_index_type>(0, num_parts - 1),
+        this->engine, this->ref);
+    auto input = gko::test::generate_random_device_matrix_data<
+        value_type, global_index_type>(
+        num_rows, num_cols,
+        std::uniform_int_distribution<int>(static_cast<int>(num_cols - 1),
+                                           static_cast<int>(num_cols - 1)),
+        std::uniform_real_distribution<gko::remove_complex<value_type>>(0, 1),
+        this->engine, this->ref);
+
+    auto partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->ref,
+                                                                 mapping,
+                                                                 num_parts);
+    auto d_partition = gko::experimental::distributed::Partition<
+        local_index_type, global_index_type>::build_from_mapping(this->exec,
+                                                                 mapping,
+                                                                 num_parts);
+
+    this->validate(partition, d_partition, input);
+}
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 2cef5a49f92..ad91d699496 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -48,9 +48,8 @@ class Matrix : public CommonTestFixture {
     {
         gko::device_matrix_data<value_type, global_index_type> d_input{exec,
                                                                        input};
-        gko::size_type num_parts = row_partition->get_num_parts();
-        gko::size_type num_entries = input.get_num_stored_elements();
-        for (comm_index_type part = 0; part < num_parts; ++part) {
+        for (comm_index_type part = 0; part < row_partition->get_num_parts();
+             ++part) {
             gko::array<local_index_type> local_row_idxs{ref};
             gko::array<local_index_type> local_col_idxs{ref};
             gko::array<value_type> local_values{ref};
@@ -63,50 +62,6 @@ class Matrix : public CommonTestFixture {
             gko::array<local_index_type> d_non_local_row_idxs{exec};
             gko::array<global_index_type> d_non_local_col_idxs{exec};
             gko::array<value_type> d_non_local_values{exec};
-            gko::array<comm_index_type> send_count{ref, num_parts};
-            send_count.fill(0);
-            gko::array<comm_index_type> d_send_count{exec, num_parts};
-            d_send_count.fill(0);
-            gko::array<global_index_type> send_positions{ref, num_entries};
-            gko::array<global_index_type> d_send_positions{exec, num_entries};
-            gko::array<global_index_type> original_positions{ref, num_entries};
-            gko::array<global_index_type> d_original_positions{exec,
-                                                               num_entries};
-
-            gko::kernels::reference::distributed_matrix::
-                count_non_owning_entries(ref, input, row_partition.get(), part,
-                                         send_count, send_positions,
-                                         original_positions);
-            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                count_non_owning_entries(exec, d_input, d_row_partition.get(),
-                                         part, d_send_count, d_send_positions,
-                                         d_original_positions);
-
-            gko::array<global_index_type> send_offsets{ref, num_parts + 1};
-            std::partial_sum(send_count.get_data(),
-                             send_count.get_data() + num_parts,
-                             send_offsets.get_data() + 1);
-            send_offsets.get_data()[0] = 0;
-            gko::array<global_index_type> d_send_offsets{exec, send_offsets};
-            gko::size_type num_send_entries =
-                send_offsets.get_data()[num_parts];
-            gko::array<global_index_type> send_row_idxs{ref, num_send_entries};
-            gko::array<global_index_type> send_col_idxs{ref, num_send_entries};
-            gko::array<value_type> send_values{ref, num_send_entries};
-            gko::array<global_index_type> d_send_row_idxs{exec,
-                                                          num_send_entries};
-            gko::array<global_index_type> d_send_col_idxs{exec,
-                                                          num_send_entries};
-            gko::array<value_type> d_send_values{exec, num_send_entries};
-
-            gko::kernels::reference::distributed_matrix::fill_send_buffers(
-                ref, input, row_partition.get(), part, send_positions,
-                original_positions, send_row_idxs, send_col_idxs, send_values);
-            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                fill_send_buffers(exec, d_input, d_row_partition.get(), part,
-                                  d_send_positions, d_original_positions,
-                                  d_send_row_idxs, d_send_col_idxs,
-                                  d_send_values);
 
             gko::kernels::reference::distributed_matrix::
                 separate_local_nonlocal(
@@ -120,12 +75,6 @@ class Matrix : public CommonTestFixture {
                     d_non_local_row_idxs, d_non_local_col_idxs,
                     d_non_local_values);
 
-            GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions);
-            GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
-            GKO_ASSERT_ARRAY_EQ(send_count, d_send_count);
-            GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs);
-            GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs);
-            GKO_ASSERT_ARRAY_EQ(send_values, d_send_values);
             GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs);
             GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs);
             GKO_ASSERT_ARRAY_EQ(local_values, d_local_values);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 0cfb3aca477..9ee91ce8c3f 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -180,26 +180,6 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 }
 
 
-TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
-{
-    using value_type = typename TestFixture::value_type;
-    using csr = typename TestFixture::local_matrix_type;
-    I<I<value_type>> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}};
-    I<I<value_type>> res_non_local[] = {
-        {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}};
-    auto rank = this->dist_mat->get_communicator().rank();
-
-    this->dist_mat->read_distributed(
-        this->dist_input[rank], this->row_part, this->col_part,
-        gko::experimental::distributed::assembly_mode::communicate);
-
-    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
-                        res_local[rank], 0);
-    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
-                        res_non_local[rank], 0);
-}
-
-
 TYPED_TEST(MatrixCreation, BuildOnlyLocal)
 {
     using value_type = typename TestFixture::value_type;

From 9b6d777db9c012cdb226367225f492bf18840968 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Thu, 21 Nov 2024 10:08:26 +0100
Subject: [PATCH 317/448] Add documentation for the assemble function

---
 .../ginkgo/core/distributed/assembly_helpers.hpp  | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly_helpers.hpp
index 11237fc8d56..95a07bcfb06 100644
--- a/include/ginkgo/core/distributed/assembly_helpers.hpp
+++ b/include/ginkgo/core/distributed/assembly_helpers.hpp
@@ -25,6 +25,21 @@ template <typename LocalIndexType, typename GlobalIndexType>
 class Partition;
 
 
+/**
+ * Assembles device_matrix_data entries owned by this MPI rank from other ranks
+ * and communicates entries located on this MPI rank owned by other ranks to
+ * their respective owners. This can be useful e.g. in a finite element code
+ * where each rank assembles a local contribution to a global system matrix and
+ * the global matrix has to be assembled by summing up the local contributions
+ * on rank boundaries.
+ *
+ * @param comm the communicator used to assemble the global matrix.
+ * @param input the device_matrix_data structure.
+ * @param partition the partition used to determine owndership.
+ *
+ * @return the globally assembled device_matrix_data structure for this MPI
+ * rank.
+ */
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 device_matrix_data<ValueType, GlobalIndexType> assemble(
     mpi::communicator comm,

From a6feacc7fd50f92e91063148e8cfe22f536684e6 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Fri, 22 Nov 2024 11:20:26 +0100
Subject: [PATCH 318/448] Address review comments

---
 core/distributed/assembly_helpers.cpp         |  13 +-
 core/distributed/matrix.cpp                   |  59 +++++++--
 .../core/distributed/assembly_helpers.hpp     |   6 +-
 include/ginkgo/core/distributed/matrix.hpp    |  44 ++++++-
 test/distributed/assembly_helpers_kernels.cpp |   6 +-
 test/mpi/CMakeLists.txt                       |   1 +
 test/mpi/assembly_helpers.cpp                 | 112 ++++++++++++++++++
 test/mpi/matrix.cpp                           |  20 ++++
 8 files changed, 241 insertions(+), 20 deletions(-)
 create mode 100644 test/mpi/assembly_helpers.cpp

diff --git a/core/distributed/assembly_helpers.cpp b/core/distributed/assembly_helpers.cpp
index ff2eec0bf32..61fbc3500bc 100644
--- a/core/distributed/assembly_helpers.cpp
+++ b/core/distributed/assembly_helpers.cpp
@@ -28,7 +28,7 @@ GKO_REGISTER_OPERATION(fill_send_buffers, assembly_helpers::fill_send_buffers);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-device_matrix_data<ValueType, GlobalIndexType> assemble(
+device_matrix_data<ValueType, GlobalIndexType> add_non_local_entries(
     mpi::communicator comm,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition)
@@ -128,13 +128,14 @@ device_matrix_data<ValueType, GlobalIndexType> assemble(
     return all_data;
 }
 
-#define GKO_DECLARE_ASSEMBLE(_value_type, _local_type, _global_type) \
-    device_matrix_data<_value_type, _global_type> assemble(          \
-        mpi::communicator comm,                                      \
-        const device_matrix_data<_value_type, _global_type>& input,  \
+#define GKO_DECLARE_ADD_NON_LOCAL_ENTRIES(_value_type, _local_type,      \
+                                          _global_type)                  \
+    device_matrix_data<_value_type, _global_type> add_non_local_entries( \
+        mpi::communicator comm,                                          \
+        const device_matrix_data<_value_type, _global_type>& input,      \
         ptr_param<const Partition<_local_type, _global_type>> partition)
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_ASSEMBLE);
+    GKO_DECLARE_ADD_NON_LOCAL_ENTRIES);
 
 
 }  // namespace distributed
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index c99f5128ec4..ce96fecc88e 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -354,6 +354,24 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     }
 }
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition,
+    assembly_mode assembly_type)
+{
+    if (assembly_type == assembly_mode::local_only) {
+        return this->read_distributed(data, row_partition, col_partition);
+    }
+    auto all_data =
+        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
+            this->get_communicator(), data, row_partition);
+    return this->read_distributed(all_data, row_partition, col_partition);
+}
+
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
@@ -370,6 +388,31 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 }
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition,
+    assembly_mode assembly_type)
+{
+    if (assembly_type == assembly_mode::local_only) {
+        return this->read_distributed(
+            device_matrix_data<value_type, global_index_type>::create_from_host(
+                this->get_executor(), data),
+            row_partition, col_partition);
+    }
+    auto all_data =
+        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
+            this->get_communicator(),
+            device_matrix_data<value_type, global_index_type>::create_from_host(
+                this->get_executor(), data),
+            row_partition);
+    return this->read_distributed(all_data, row_partition, col_partition);
+}
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
@@ -396,11 +439,12 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
                 this->get_executor(), data),
             partition, partition);
     }
-    auto all_data = assemble<ValueType, LocalIndexType, GlobalIndexType>(
-        this->get_communicator(),
-        device_matrix_data<value_type, global_index_type>::create_from_host(
-            this->get_executor(), data),
-        partition);
+    auto all_data =
+        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
+            this->get_communicator(),
+            device_matrix_data<value_type, global_index_type>::create_from_host(
+                this->get_executor(), data),
+            partition);
     return this->read_distributed(all_data, partition, partition);
 }
 
@@ -425,8 +469,9 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     if (assembly_type == assembly_mode::local_only) {
         return this->read_distributed(data, partition, partition);
     }
-    auto all_data = assemble<ValueType, LocalIndexType, GlobalIndexType>(
-        this->get_communicator(), data, partition);
+    auto all_data =
+        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
+            this->get_communicator(), data, partition);
     return this->read_distributed(all_data, partition, partition);
 }
 
diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly_helpers.hpp
index 95a07bcfb06..ab21176743b 100644
--- a/include/ginkgo/core/distributed/assembly_helpers.hpp
+++ b/include/ginkgo/core/distributed/assembly_helpers.hpp
@@ -31,17 +31,17 @@ class Partition;
  * their respective owners. This can be useful e.g. in a finite element code
  * where each rank assembles a local contribution to a global system matrix and
  * the global matrix has to be assembled by summing up the local contributions
- * on rank boundaries.
+ * on rank boundaries. The partition used is only relevant for row ownership.
  *
  * @param comm the communicator used to assemble the global matrix.
  * @param input the device_matrix_data structure.
- * @param partition the partition used to determine owndership.
+ * @param partition the partition used to determine row ownership.
  *
  * @return the globally assembled device_matrix_data structure for this MPI
  * rank.
  */
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-device_matrix_data<ValueType, GlobalIndexType> assemble(
+device_matrix_data<ValueType, GlobalIndexType> add_non_local_entries(
     mpi::communicator comm,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition);
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 917d8a1bab6..c512d0c140c 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -377,7 +377,6 @@ class Matrix
      * @param data  The device_matrix_data structure.
      * @param row_partition  The global row partition.
      * @param col_partition  The global col partition.
-     * @param assembly_mode  The mode of assembly.
      *
      * @return the index_map induced by the partitions and the matrix structure
      */
@@ -388,6 +387,32 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition);
 
+    /**
+     * Reads a matrix from the device_matrix_data structure, a global row
+     * partition, and a global column partition.
+     *
+     * The global size of the final matrix is inferred from the size of the row
+     * partition and the size of the column partition. Both the number of rows
+     * and columns of the device_matrix_data are ignored.
+     *
+     * @note The matrix data can contain entries for rows other than those owned
+     *        by the process. Entries for those rows are discarded.
+     *
+     * @param data  The device_matrix_data structure.
+     * @param row_partition  The global row partition.
+     * @param col_partition  The global col partition.
+     * @param assembly_type  The mode of assembly.
+     *
+     * @return the index_map induced by the partitions and the matrix structure
+     */
+    void read_distributed(
+        const device_matrix_data<value_type, global_index_type>& data,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            row_partition,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            col_partition,
+        assembly_mode assembly_type);
+
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
      * and a global column partition.
@@ -404,6 +429,23 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition);
 
+    /**
+     * Reads a matrix from the matrix_data structure, a global row partition,
+     * and a global column partition.
+     *
+     * @see read_distributed
+     *
+     * @note For efficiency it is advised to use the device_matrix_data
+     * overload.
+     */
+    void read_distributed(
+        const matrix_data<value_type, global_index_type>& data,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            row_partition,
+        std::shared_ptr<const Partition<local_index_type, global_index_type>>
+            col_partition,
+        assembly_mode assembly_type);
+
     /**
      * Get read access to the stored local matrix.
      *
diff --git a/test/distributed/assembly_helpers_kernels.cpp b/test/distributed/assembly_helpers_kernels.cpp
index 50b8c5c5469..f0642121afa 100644
--- a/test/distributed/assembly_helpers_kernels.cpp
+++ b/test/distributed/assembly_helpers_kernels.cpp
@@ -105,7 +105,7 @@ TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
 
-TYPED_TEST(AssemblyHelpers, AssembleDiagOffdiagEmptyIsSameAsRef)
+TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesDiagOffdiagEmptyIsSameAsRef)
 {
     using value_type = typename TestFixture::value_type;
     using local_index_type = typename TestFixture::local_index_type;
@@ -128,7 +128,7 @@ TYPED_TEST(AssemblyHelpers, AssembleDiagOffdiagEmptyIsSameAsRef)
 }
 
 
-TYPED_TEST(AssemblyHelpers, AssembleLocalSmallIsEquivalentToRef)
+TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesLocalSmallIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
     using local_index_type = typename TestFixture::local_index_type;
@@ -162,7 +162,7 @@ TYPED_TEST(AssemblyHelpers, AssembleLocalSmallIsEquivalentToRef)
 }
 
 
-TYPED_TEST(AssemblyHelpers, AssembleLocalIsEquivalentToRef)
+TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesLocalIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
     using local_index_type = typename TestFixture::local_index_type;
diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt
index 193b4518dcc..cf49f2a4691 100644
--- a/test/mpi/CMakeLists.txt
+++ b/test/mpi/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_common_and_reference_test(assembly_helpers MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(vector MPI_SIZE 3)
diff --git a/test/mpi/assembly_helpers.cpp b/test/mpi/assembly_helpers.cpp
new file mode 100644
index 00000000000..66ef796cf8c
--- /dev/null
+++ b/test/mpi/assembly_helpers.cpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <array>
+#include <memory>
+#include <random>
+
+#include <mpi.h>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/distributed/assembly_helpers.hpp>
+#include <ginkgo/core/distributed/partition.hpp>
+
+#include "core/test/utils.hpp"
+#include "ginkgo/core/base/exception.hpp"
+#include "test/utils/mpi/common_fixture.hpp"
+
+
+#ifndef GKO_COMPILING_DPCPP
+
+
+template <typename ValueLocalGlobalIndexType>
+class AssemblyHelpers : public CommonMpiTestFixture {
+protected:
+    using value_type = typename std::tuple_element<
+        0, decltype(ValueLocalGlobalIndexType())>::type;
+    using local_index_type = typename std::tuple_element<
+        1, decltype(ValueLocalGlobalIndexType())>::type;
+    using global_index_type = typename std::tuple_element<
+        2, decltype(ValueLocalGlobalIndexType())>::type;
+    using Partition =
+        gko::experimental::distributed::Partition<local_index_type,
+                                                  global_index_type>;
+    using matrix_data = gko::matrix_data<value_type, global_index_type>;
+
+
+    AssemblyHelpers()
+        : size{5, 5},
+          dist_input{
+              {{size,
+                {{0, 1, 1},
+                 {0, 3, 2},
+                 {1, 1, 3},
+                 {1, 2, 4},
+                 {2, 0, 1},
+                 {2, 3, 1}}},
+               {size, {{0, 0, 1}, {2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}},
+               {size, {{2, 2, 1}, {3, 3, -1}, {4, 0, 9}, {4, 4, 10}}}}},
+          res_row_idxs{{{exec, {0, 0, 0, 1, 1, 2, 2}},
+                        {exec, {0, 2, 2, 2, 2, 3, 3}},
+                        {exec, {2, 3, 4, 4}}}},
+          res_col_idxs{{{exec, {0, 1, 3, 1, 2, 0, 3}},
+                        {exec, {0, 0, 1, 2, 3, 3, 4}},
+                        {exec, {2, 3, 0, 4}}}},
+          res_values{{{exec, {1, 1, 2, 3, 4, 1, 1}},
+                      {exec, {1, 1, 5, 7, 1, 7, 7}},
+                      {exec, {1, -1, 9, 10}}}},
+          engine(42)
+    {
+        row_part = Partition::build_from_contiguous(
+            exec, gko::array<global_index_type>(
+                      exec, I<global_index_type>{0, 2, 4, 5}));
+    }
+
+    void SetUp() override { ASSERT_EQ(comm.size(), 3); }
+
+
+    gko::dim<2> size;
+    std::shared_ptr<Partition> row_part;
+
+    gko::matrix_data<value_type, global_index_type> mat_input;
+    std::array<matrix_data, 3> dist_input;
+    std::array<gko::array<global_index_type>, 3> res_row_idxs;
+    std::array<gko::array<global_index_type>, 3> res_col_idxs;
+    std::array<gko::array<value_type>, 3> res_values;
+
+    std::default_random_engine engine;
+};
+
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+                 TupleTypenameNameGenerator);
+
+
+TYPED_TEST(AssemblyHelpers, AddsNonLocalEntries)
+{
+    using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using global_index_type = typename TestFixture::global_index_type;
+    I<I<value_type>> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}};
+    I<I<value_type>> res_non_local[] = {
+        {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}};
+    auto rank = this->comm.rank();
+    auto input = gko::device_matrix_data<value_type, global_index_type>::
+        create_from_host(this->exec, this->dist_input[rank]);
+
+    auto result = gko::experimental::distributed::add_non_local_entries<
+        value_type, local_index_type, global_index_type>(this->comm, input,
+                                                         this->row_part);
+
+    auto result_arrays = result.empty_out();
+    GKO_ASSERT_ARRAY_EQ(result_arrays.row_idxs, this->res_row_idxs[rank]);
+    GKO_ASSERT_ARRAY_EQ(result_arrays.col_idxs, this->res_col_idxs[rank]);
+    GKO_ASSERT_ARRAY_EQ(result_arrays.values, this->res_values[rank]);
+}
+
+#endif
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 9ee91ce8c3f..0cfb3aca477 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -180,6 +180,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 }
 
 
+TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr = typename TestFixture::local_matrix_type;
+    I<I<value_type>> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_non_local[] = {
+        {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}};
+    auto rank = this->dist_mat->get_communicator().rank();
+
+    this->dist_mat->read_distributed(
+        this->dist_input[rank], this->row_part, this->col_part,
+        gko::experimental::distributed::assembly_mode::communicate);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_non_local[rank], 0);
+}
+
+
 TYPED_TEST(MatrixCreation, BuildOnlyLocal)
 {
     using value_type = typename TestFixture::value_type;

From 0e37518b32060d3e0ea60f73501304ef86a4046a Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Fri, 22 Nov 2024 11:28:50 +0100
Subject: [PATCH 319/448] Move fill_send_buffers to unified kernels

---
 .../distributed/assembly_helpers_kernels.cpp  | 37 ------------
 common/unified/CMakeLists.txt                 |  1 +
 .../distributed/assembly_helpers_kernels.cpp  | 58 +++++++++++++++++++
 omp/distributed/assembly_helpers_kernels.cpp  | 31 ----------
 4 files changed, 59 insertions(+), 68 deletions(-)
 create mode 100644 common/unified/distributed/assembly_helpers_kernels.cpp

diff --git a/common/cuda_hip/distributed/assembly_helpers_kernels.cpp b/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
index 85b65c8f56f..e4488db3c84 100644
--- a/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
+++ b/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
@@ -94,43 +94,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-{
-    auto num_entries = input.get_num_stored_elements();
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto input_col_idxs = input.get_const_col_idxs();
-    auto input_values = input.get_const_values();
-
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
-                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
-                      auto out_vals) {
-            if (in_pos[i] >= 0) {
-                out_rows[out_pos[i]] = in_rows[in_pos[i]];
-                out_cols[out_pos[i]] = in_cols[in_pos[i]];
-                out_vals[out_pos[i]] = in_vals[in_pos[i]];
-            }
-        },
-        num_entries, input_row_idxs, input_col_idxs, input_values,
-        original_positions.get_const_data(), send_positions.get_const_data(),
-        send_row_idxs.get_data(), send_col_idxs.get_data(),
-        send_values.get_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 }  // namespace assembly_helpers
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
index 00bc21df0c6..7e153fc20a5 100644
--- a/common/unified/CMakeLists.txt
+++ b/common/unified/CMakeLists.txt
@@ -6,6 +6,7 @@ set(UNIFIED_SOURCES
     components/format_conversion_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/reduce_array_kernels.cpp
+    distributed/assembly_helpers_kernels.cpp
     distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
     matrix/coo_kernels.cpp
diff --git a/common/unified/distributed/assembly_helpers_kernels.cpp b/common/unified/distributed/assembly_helpers_kernels.cpp
new file mode 100644
index 00000000000..3ea198519fb
--- /dev/null
+++ b/common/unified/distributed/assembly_helpers_kernels.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/assembly_helpers_kernels.hpp"
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace assembly_helpers {
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
+    const array<GlobalIndexType>& original_positions,
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
+{
+    auto num_entries = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_values = input.get_const_values();
+
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
+                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
+                      auto out_vals) {
+            if (in_pos[i] >= 0) {
+                out_rows[out_pos[i]] = in_rows[in_pos[i]];
+                out_cols[out_pos[i]] = in_cols[in_pos[i]];
+                out_vals[out_pos[i]] = in_vals[in_pos[i]];
+            }
+        },
+        num_entries, input_row_idxs, input_col_idxs, input_values,
+        original_positions.get_const_data(), send_positions.get_const_data(),
+        send_row_idxs.get_data(), send_col_idxs.get_data(),
+        send_values.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_SEND_BUFFERS);
+
+
+}  // namespace assembly_helpers
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/distributed/assembly_helpers_kernels.cpp b/omp/distributed/assembly_helpers_kernels.cpp
index 93706671562..7cdbefa05c3 100644
--- a/omp/distributed/assembly_helpers_kernels.cpp
+++ b/omp/distributed/assembly_helpers_kernels.cpp
@@ -77,37 +77,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
-{
-    auto input_row_idxs = input.get_const_row_idxs();
-    auto input_col_idxs = input.get_const_col_idxs();
-    auto input_vals = input.get_const_values();
-
-#pragma omp parallel for
-    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto in_pos = original_positions.get_const_data()[i];
-        if (in_pos >= 0) {
-            auto out_pos = send_positions.get_const_data()[i];
-            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            send_values.get_data()[out_pos] = input_vals[in_pos];
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 }  // namespace assembly_helpers
 }  // namespace omp
 }  // namespace kernels

From b5749cced39e84293e51a045748307784ae9f4e3 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Mon, 25 Nov 2024 09:25:33 +0100
Subject: [PATCH 320/448] Address review comments

---
 core/distributed/assembly_helpers.cpp         | 15 ++---
 core/distributed/matrix.cpp                   | 56 +++++--------------
 .../core/distributed/assembly_helpers.hpp     |  3 +-
 include/ginkgo/core/distributed/matrix.hpp    |  4 +-
 test/mpi/assembly_helpers.cpp                 |  2 +-
 5 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/core/distributed/assembly_helpers.cpp b/core/distributed/assembly_helpers.cpp
index 61fbc3500bc..6e01a96f293 100644
--- a/core/distributed/assembly_helpers.cpp
+++ b/core/distributed/assembly_helpers.cpp
@@ -28,7 +28,7 @@ GKO_REGISTER_OPERATION(fill_send_buffers, assembly_helpers::fill_send_buffers);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-device_matrix_data<ValueType, GlobalIndexType> add_non_local_entries(
+device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
     mpi::communicator comm,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition)
@@ -128,14 +128,15 @@ device_matrix_data<ValueType, GlobalIndexType> add_non_local_entries(
     return all_data;
 }
 
-#define GKO_DECLARE_ADD_NON_LOCAL_ENTRIES(_value_type, _local_type,      \
-                                          _global_type)                  \
-    device_matrix_data<_value_type, _global_type> add_non_local_entries( \
-        mpi::communicator comm,                                          \
-        const device_matrix_data<_value_type, _global_type>& input,      \
+#define GKO_DECLARE_ASSEMBLE_ROWS_FROM_NEIGHBORS(_value_type, _local_type, \
+                                                 _global_type)             \
+    device_matrix_data<_value_type, _global_type>                          \
+    assemble_rows_from_neighbors(                                          \
+        mpi::communicator comm,                                            \
+        const device_matrix_data<_value_type, _global_type>& input,        \
         ptr_param<const Partition<_local_type, _global_type>> partition)
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_ADD_NON_LOCAL_ENTRIES);
+    GKO_DECLARE_ASSEMBLE_ROWS_FROM_NEIGHBORS);
 
 
 }  // namespace distributed
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index ce96fecc88e..ce30ef4d029 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -366,9 +366,9 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     if (assembly_type == assembly_mode::local_only) {
         return this->read_distributed(data, row_partition, col_partition);
     }
-    auto all_data =
-        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
-            this->get_communicator(), data, row_partition);
+    auto all_data = assemble_rows_from_neighbors<ValueType, LocalIndexType,
+                                                 GlobalIndexType>(
+        this->get_communicator(), data, row_partition);
     return this->read_distributed(all_data, row_partition, col_partition);
 }
 
@@ -397,19 +397,10 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         col_partition,
     assembly_mode assembly_type)
 {
-    if (assembly_type == assembly_mode::local_only) {
-        return this->read_distributed(
-            device_matrix_data<value_type, global_index_type>::create_from_host(
-                this->get_executor(), data),
-            row_partition, col_partition);
-    }
-    auto all_data =
-        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
-            this->get_communicator(),
-            device_matrix_data<value_type, global_index_type>::create_from_host(
-                this->get_executor(), data),
-            row_partition);
-    return this->read_distributed(all_data, row_partition, col_partition);
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        row_partition, col_partition, assembly_type);
 }
 
 
@@ -433,29 +424,10 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         partition,
     assembly_mode assembly_type)
 {
-    if (assembly_type == assembly_mode::local_only) {
-        return this->read_distributed(
-            device_matrix_data<value_type, global_index_type>::create_from_host(
-                this->get_executor(), data),
-            partition, partition);
-    }
-    auto all_data =
-        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
-            this->get_communicator(),
-            device_matrix_data<value_type, global_index_type>::create_from_host(
-                this->get_executor(), data),
-            partition);
-    return this->read_distributed(all_data, partition, partition);
-}
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
-    const device_matrix_data<ValueType, GlobalIndexType>& data,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
-{
-    return this->read_distributed(data, partition, partition);
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        partition, partition, assembly_type);
 }
 
 
@@ -469,9 +441,9 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     if (assembly_type == assembly_mode::local_only) {
         return this->read_distributed(data, partition, partition);
     }
-    auto all_data =
-        add_non_local_entries<ValueType, LocalIndexType, GlobalIndexType>(
-            this->get_communicator(), data, partition);
+    auto all_data = assemble_rows_from_neighbors<ValueType, LocalIndexType,
+                                                 GlobalIndexType>(
+        this->get_communicator(), data, partition);
     return this->read_distributed(all_data, partition, partition);
 }
 
diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly_helpers.hpp
index ab21176743b..c504e49e433 100644
--- a/include/ginkgo/core/distributed/assembly_helpers.hpp
+++ b/include/ginkgo/core/distributed/assembly_helpers.hpp
@@ -21,6 +21,7 @@ namespace gko {
 namespace experimental {
 namespace distributed {
 
+
 template <typename LocalIndexType, typename GlobalIndexType>
 class Partition;
 
@@ -41,7 +42,7 @@ class Partition;
  * rank.
  */
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-device_matrix_data<ValueType, GlobalIndexType> add_non_local_entries(
+device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
     mpi::communicator comm,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     ptr_param<const Partition<LocalIndexType, GlobalIndexType>> partition);
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index c512d0c140c..f6878168781 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -136,10 +136,10 @@ namespace distributed {
 /**
  * assembly_mode defines how the read_distributed function of the distributed
  * matrix treats non-local indices in the (device_)matrix_data:
- * - communicate communicates the overlap between ranks and adds up all local
+ * - `communicate` communicates the overlap between ranks and adds up all local
  *   contributions. Indices smaller than 0 or larger than the global size
  *   of the matrix are ignored.
- * - local_only does not communicate any overlap but ignores all non-local
+ * - `local_only` does not communicate any overlap but ignores all non-local
  *   indices.
  */
 enum class assembly_mode { communicate, local_only };
diff --git a/test/mpi/assembly_helpers.cpp b/test/mpi/assembly_helpers.cpp
index 66ef796cf8c..b447cb66e93 100644
--- a/test/mpi/assembly_helpers.cpp
+++ b/test/mpi/assembly_helpers.cpp
@@ -99,7 +99,7 @@ TYPED_TEST(AssemblyHelpers, AddsNonLocalEntries)
     auto input = gko::device_matrix_data<value_type, global_index_type>::
         create_from_host(this->exec, this->dist_input[rank]);
 
-    auto result = gko::experimental::distributed::add_non_local_entries<
+    auto result = gko::experimental::distributed::assemble_rows_from_neighbors<
         value_type, local_index_type, global_index_type>(this->comm, input,
                                                          this->row_part);
 

From de504ea56f59aa79785ec485416cd18c50b07979 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Wed, 27 Nov 2024 15:23:30 +0100
Subject: [PATCH 321/448] Address review comments

Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/distributed/matrix.cpp                   | 75 ++++-----------
 .../core/distributed/assembly_helpers.hpp     |  6 +-
 include/ginkgo/core/distributed/matrix.hpp    | 91 ++-----------------
 .../distributed/assembly_helpers_kernels.cpp  |  6 --
 test/distributed/assembly_helpers_kernels.cpp |  4 +-
 test/mpi/assembly_helpers.cpp                 |  3 -
 6 files changed, 30 insertions(+), 155 deletions(-)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index ce30ef4d029..1dc2280cba5 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -246,7 +246,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
+        col_partition,
+    assembly_mode assembly_type)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -259,6 +260,16 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     auto tmp_row_partition = make_temporary_clone(exec, row_partition);
     auto tmp_col_partition = make_temporary_clone(exec, col_partition);
 
+    const device_matrix_data<value_type, global_index_type>* all_data_ptr =
+        &data;
+    device_matrix_data<value_type, global_index_type> assembled_data(exec);
+    if (assembly_type == assembly_mode::communicate) {
+        assembled_data = assemble_rows_from_neighbors<ValueType, LocalIndexType,
+                                                      GlobalIndexType>(
+            this->get_communicator(), data, row_partition);
+        all_data_ptr = &assembled_data;
+    }
+
     // set up LinOp sizes
     auto global_num_rows = row_partition->get_size();
     auto global_num_cols = col_partition->get_size();
@@ -278,9 +289,9 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        data, tmp_row_partition.get(), tmp_col_partition.get(), local_part,
-        local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
-        global_non_local_col_idxs, non_local_values));
+        *all_data_ptr, tmp_row_partition.get(), tmp_col_partition.get(),
+        local_part, local_row_idxs, local_col_idxs, local_values,
+        non_local_row_idxs, global_non_local_col_idxs, non_local_values));
 
     auto imap = index_map<local_index_type, global_index_type>(
         exec, col_partition, comm.rank(), global_non_local_col_idxs);
@@ -354,39 +365,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     }
 }
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
-    const device_matrix_data<value_type, global_index_type>& data,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        row_partition,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition,
-    assembly_mode assembly_type)
-{
-    if (assembly_type == assembly_mode::local_only) {
-        return this->read_distributed(data, row_partition, col_partition);
-    }
-    auto all_data = assemble_rows_from_neighbors<ValueType, LocalIndexType,
-                                                 GlobalIndexType>(
-        this->get_communicator(), data, row_partition);
-    return this->read_distributed(all_data, row_partition, col_partition);
-}
-
-
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
-    const matrix_data<value_type, global_index_type>& data,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        row_partition,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
-{
-    return this->read_distributed(
-        device_matrix_data<value_type, global_index_type>::create_from_host(
-            this->get_executor(), data),
-        row_partition, col_partition);
-}
-
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
@@ -404,19 +382,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 }
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
-    const matrix_data<ValueType, global_index_type>& data,
-    std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
-{
-    return this->read_distributed(
-        device_matrix_data<value_type, global_index_type>::create_from_host(
-            this->get_executor(), data),
-        partition, partition);
-}
-
-
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
@@ -433,18 +398,12 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
-    const device_matrix_data<ValueType, GlobalIndexType>& data,
+    const device_matrix_data<value_type, global_index_type>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         partition,
     assembly_mode assembly_type)
 {
-    if (assembly_type == assembly_mode::local_only) {
-        return this->read_distributed(data, partition, partition);
-    }
-    auto all_data = assemble_rows_from_neighbors<ValueType, LocalIndexType,
-                                                 GlobalIndexType>(
-        this->get_communicator(), data, partition);
-    return this->read_distributed(all_data, partition, partition);
+    return this->read_distributed(data, partition, partition, assembly_type);
 }
 
 
diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly_helpers.hpp
index c504e49e433..fc6ffeb608f 100644
--- a/include/ginkgo/core/distributed/assembly_helpers.hpp
+++ b/include/ginkgo/core/distributed/assembly_helpers.hpp
@@ -34,9 +34,9 @@ class Partition;
  * the global matrix has to be assembled by summing up the local contributions
  * on rank boundaries. The partition used is only relevant for row ownership.
  *
- * @param comm the communicator used to assemble the global matrix.
- * @param input the device_matrix_data structure.
- * @param partition the partition used to determine row owndership.
+ * @param comm  the communicator used to assemble the global matrix.
+ * @param input  the device_matrix_data structure.
+ * @param partition  the partition used to determine row owndership.
  *
  * @return the globally assembled device_matrix_data structure for this MPI
  * rank.
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index f6878168781..2f2f470a4ed 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -299,32 +299,11 @@ class Matrix
      * are ignored.
      *
      * @note The matrix data can contain entries for rows other than those owned
-     *        by the process. Entries for those rows are discarded.
+     *       by the process. Entries for those rows are discarded.
      *
      * @param data  The device_matrix_data structure.
      * @param partition  The global row and column partition.
-     *
-     * @return the index_map induced by the partitions and the matrix structure
-     */
-    void read_distributed(
-        const device_matrix_data<value_type, global_index_type>& data,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
-
-    /**
-     * Reads a square matrix from the device_matrix_data structure and a global
-     * partition.
-     *
-     * The global size of the final matrix is inferred from the size of the
-     * partition. Both the number of rows and columns of the device_matrix_data
-     * are ignored.
-     *
-     * @note The matrix data can contain entries for rows other than those owned
-     *        by the process. Entries for those rows are discarded.
-     *
-     * @param data  The device_matrix_data structure.
-     * @param partition  The global row and column partition.
-     * @param assembly_mode  The mode of assembly.
+     * @param assembly_type  The mode of assembly.
      *
      * @return the index_map induced by the partitions and the matrix structure
      */
@@ -332,21 +311,7 @@ class Matrix
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly_mode assembly_type);
-
-    /**
-     * Reads a square matrix from the matrix_data structure and a global
-     * partition.
-     *
-     * @see read_distributed
-     *
-     * @note For efficiency it is advised to use the device_matrix_data
-     * overload.
-     */
-    void read_distributed(
-        const matrix_data<value_type, global_index_type>& data,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -361,7 +326,7 @@ class Matrix
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly_mode assembly_type);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -372,36 +337,12 @@ class Matrix
      * and columns of the device_matrix_data are ignored.
      *
      * @note The matrix data can contain entries for rows other than those owned
-     *        by the process. Entries for those rows are discarded.
+     *       by the process. Entries for those rows are discarded.
      *
      * @param data  The device_matrix_data structure.
      * @param row_partition  The global row partition.
      * @param col_partition  The global col partition.
-     *
-     * @return the index_map induced by the partitions and the matrix structure
-     */
-    void read_distributed(
-        const device_matrix_data<value_type, global_index_type>& data,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            row_partition,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
-
-    /**
-     * Reads a matrix from the device_matrix_data structure, a global row
-     * partition, and a global column partition.
-     *
-     * The global size of the final matrix is inferred from the size of the row
-     * partition and the size of the column partition. Both the number of rows
-     * and columns of the device_matrix_data are ignored.
-     *
-     * @note The matrix data can contain entries for rows other than those owned
-     *        by the process. Entries for those rows are discarded.
-     *
-     * @param data  The device_matrix_data structure.
-     * @param row_partition  The global row partition.
-     * @param col_partition  The global col partition.
-     * @param assembly_mode  The mode of assembly.
+     * @param assembly_type  The mode of assembly.
      *
      * @return the index_map induced by the partitions and the matrix structure
      */
@@ -411,23 +352,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly_mode assembly_type);
-
-    /**
-     * Reads a matrix from the matrix_data structure, a global row partition,
-     * and a global column partition.
-     *
-     * @see read_distributed
-     *
-     * @note For efficiency it is advised to use the device_matrix_data
-     * overload.
-     */
-    void read_distributed(
-        const matrix_data<value_type, global_index_type>& data,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            row_partition,
-        std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -444,7 +369,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly_mode assembly_type);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/reference/test/distributed/assembly_helpers_kernels.cpp b/reference/test/distributed/assembly_helpers_kernels.cpp
index b11b736e567..cdfd28853b5 100644
--- a/reference/test/distributed/assembly_helpers_kernels.cpp
+++ b/reference/test/distributed/assembly_helpers_kernels.cpp
@@ -16,9 +16,6 @@
 #include "core/test/utils.hpp"
 
 
-namespace {
-
-
 using comm_index_type = gko::experimental::distributed::comm_index_type;
 
 
@@ -148,6 +145,3 @@ TYPED_TEST(AssemblyHelpers, FillOverlapSendBuffers)
         GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]);
     }
 }
-
-
-}  // namespace
diff --git a/test/distributed/assembly_helpers_kernels.cpp b/test/distributed/assembly_helpers_kernels.cpp
index f0642121afa..4864ffec471 100644
--- a/test/distributed/assembly_helpers_kernels.cpp
+++ b/test/distributed/assembly_helpers_kernels.cpp
@@ -105,7 +105,7 @@ TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
 
-TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesDiagOffdiagEmptyIsSameAsRef)
+TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesEmptyIsSameAsRef)
 {
     using value_type = typename TestFixture::value_type;
     using local_index_type = typename TestFixture::local_index_type;
@@ -179,7 +179,7 @@ TYPED_TEST(AssemblyHelpers, AddNonLocalEntriesLocalIsEquivalentToRef)
     auto input = gko::test::generate_random_device_matrix_data<
         value_type, global_index_type>(
         num_rows, num_cols,
-        std::uniform_int_distribution<int>(static_cast<int>(num_cols - 1),
+        std::uniform_int_distribution<int>(static_cast<int>(1),
                                            static_cast<int>(num_cols - 1)),
         std::uniform_real_distribution<gko::remove_complex<value_type>>(0, 1),
         this->engine, this->ref);
diff --git a/test/mpi/assembly_helpers.cpp b/test/mpi/assembly_helpers.cpp
index b447cb66e93..c16e5402f96 100644
--- a/test/mpi/assembly_helpers.cpp
+++ b/test/mpi/assembly_helpers.cpp
@@ -92,9 +92,6 @@ TYPED_TEST(AssemblyHelpers, AddsNonLocalEntries)
     using value_type = typename TestFixture::value_type;
     using local_index_type = typename TestFixture::local_index_type;
     using global_index_type = typename TestFixture::global_index_type;
-    I<I<value_type>> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}};
-    I<I<value_type>> res_non_local[] = {
-        {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}};
     auto rank = this->comm.rank();
     auto input = gko::device_matrix_data<value_type, global_index_type>::
         create_from_host(this->exec, this->dist_input[rank]);

From fca55a6e5f2bbd7740f2e5dff00d50a348166d73 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Wed, 27 Nov 2024 17:22:41 +0100
Subject: [PATCH 322/448] Fix multiple definitions in dpcpp

---
 .../distributed/assembly_helpers_kernels.dp.cpp  | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/dpcpp/distributed/assembly_helpers_kernels.dp.cpp b/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
index f86b46f0846..ba5d3024425 100644
--- a/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
+++ b/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
@@ -27,22 +27,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_send_buffers(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const device_matrix_data<ValueType, GlobalIndexType>& input,
-    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
-        row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
-    const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& send_row_idxs,
-    array<GlobalIndexType>& send_col_idxs,
-    array<ValueType>& send_values) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_SEND_BUFFERS);
-
-
 }  // namespace assembly_helpers
 }  // namespace dpcpp
 }  // namespace kernels

From 2170a3abe4b46fda90d0c1c4f21883223abd7a9d Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@tum.de>
Date: Thu, 28 Nov 2024 13:00:00 +0100
Subject: [PATCH 323/448] Move `assembly_helpers` to `assembly`

---
 common/cuda_hip/CMakeLists.txt                  |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} |  6 +++---
 common/unified/CMakeLists.txt                   |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} |  6 +++---
 core/CMakeLists.txt                             |  2 +-
 core/device_hooks/common_kernels.inc.cpp        |  6 +++---
 .../{assembly_helpers.cpp => assembly.cpp}      | 16 ++++++++--------
 ...helpers_kernels.hpp => assembly_kernels.hpp} |  9 ++++-----
 core/distributed/matrix.cpp                     |  2 +-
 dpcpp/CMakeLists.txt                            |  2 +-
 ...s_kernels.dp.cpp => assembly_kernels.dp.cpp} |  6 +++---
 .../{assembly_helpers.hpp => assembly.hpp}      |  6 +++---
 include/ginkgo/ginkgo.hpp                       |  2 +-
 omp/CMakeLists.txt                              |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} |  6 +++---
 reference/CMakeLists.txt                        |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} |  6 +++---
 reference/test/distributed/CMakeLists.txt       |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} |  6 +++---
 test/distributed/CMakeLists.txt                 |  2 +-
 ...helpers_kernels.cpp => assembly_kernels.cpp} | 17 ++++++++---------
 test/mpi/CMakeLists.txt                         |  2 +-
 test/mpi/{assembly_helpers.cpp => assembly.cpp} |  2 +-
 23 files changed, 56 insertions(+), 58 deletions(-)
 rename common/cuda_hip/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (96%)
 rename common/unified/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (94%)
 rename core/distributed/{assembly_helpers.cpp => assembly.cpp} (93%)
 rename core/distributed/{assembly_helpers_kernels.hpp => assembly_kernels.hpp} (90%)
 rename dpcpp/distributed/{assembly_helpers_kernels.dp.cpp => assembly_kernels.dp.cpp} (88%)
 rename include/ginkgo/core/distributed/{assembly_helpers.hpp => assembly.hpp} (90%)
 rename omp/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (95%)
 rename reference/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (97%)
 rename reference/test/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (96%)
 rename test/distributed/{assembly_helpers_kernels.cpp => assembly_kernels.cpp} (93%)
 rename test/mpi/{assembly_helpers.cpp => assembly.cpp} (98%)

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index dd38ca4f7b2..5cfa55ca687 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -4,7 +4,7 @@ set(CUDA_HIP_SOURCES
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
-    distributed/assembly_helpers_kernels.cpp
+    distributed/assembly_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/common/cuda_hip/distributed/assembly_helpers_kernels.cpp b/common/cuda_hip/distributed/assembly_kernels.cpp
similarity index 96%
rename from common/cuda_hip/distributed/assembly_helpers_kernels.cpp
rename to common/cuda_hip/distributed/assembly_kernels.cpp
index e4488db3c84..81478538477 100644
--- a/common/cuda_hip/distributed/assembly_helpers_kernels.cpp
+++ b/common/cuda_hip/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -20,7 +20,7 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace assembly_helpers {
+namespace assembly {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -94,7 +94,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
index 7e153fc20a5..e4e4fafe038 100644
--- a/common/unified/CMakeLists.txt
+++ b/common/unified/CMakeLists.txt
@@ -6,7 +6,7 @@ set(UNIFIED_SOURCES
     components/format_conversion_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/reduce_array_kernels.cpp
-    distributed/assembly_helpers_kernels.cpp
+    distributed/assembly_kernels.cpp
     distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
     matrix/coo_kernels.cpp
diff --git a/common/unified/distributed/assembly_helpers_kernels.cpp b/common/unified/distributed/assembly_kernels.cpp
similarity index 94%
rename from common/unified/distributed/assembly_helpers_kernels.cpp
rename to common/unified/distributed/assembly_kernels.cpp
index 3ea198519fb..a3ac5207f17 100644
--- a/common/unified/distributed/assembly_helpers_kernels.cpp
+++ b/common/unified/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
@@ -12,7 +12,7 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace assembly_helpers {
+namespace assembly {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -52,7 +52,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index bd98da373b7..598167c0d7c 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -138,7 +138,7 @@ if(GINKGO_BUILD_MPI)
         PRIVATE
         distributed/vector_cache.cpp
         mpi/exception.cpp
-        distributed/assembly_helpers.cpp
+        distributed/assembly.cpp
         distributed/matrix.cpp
         distributed/partition_helpers.cpp
         distributed/vector.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index cf21e423326..3fdd6dc0b5a 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -16,7 +16,7 @@
 #include "core/components/precision_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/components/reduce_array_kernels.hpp"
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 #include "core/distributed/index_map_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
@@ -282,7 +282,7 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 }
 
 
-namespace assembly_helpers {
+namespace assembly {
 
 
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
@@ -290,7 +290,7 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 
 
 namespace distributed_matrix {
diff --git a/core/distributed/assembly_helpers.cpp b/core/distributed/assembly.cpp
similarity index 93%
rename from core/distributed/assembly_helpers.cpp
rename to core/distributed/assembly.cpp
index 6e01a96f293..424e641f845 100644
--- a/core/distributed/assembly_helpers.cpp
+++ b/core/distributed/assembly.cpp
@@ -2,29 +2,29 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "ginkgo/core/distributed/assembly_helpers.hpp"
+#include "ginkgo/core/distributed/assembly.hpp"
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
 #include "core/components/prefix_sum_kernels.hpp"
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 
 namespace gko {
 namespace experimental {
 namespace distributed {
-namespace assembly_helpers {
+namespace assembly {
 namespace {
 
 
 GKO_REGISTER_OPERATION(count_non_owning_entries,
-                       assembly_helpers::count_non_owning_entries);
-GKO_REGISTER_OPERATION(fill_send_buffers, assembly_helpers::fill_send_buffers);
+                       assembly::count_non_owning_entries);
+GKO_REGISTER_OPERATION(fill_send_buffers, assembly::fill_send_buffers);
 
 
 }  // namespace
-}  // namespace assembly_helpers
+}  // namespace assembly
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -43,7 +43,7 @@ device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
     array<GlobalIndexType> send_positions{exec, num_entries};
     array<GlobalIndexType> original_positions{exec, num_entries};
     send_sizes.fill(zero<comm_index_type>());
-    exec->run(assembly_helpers::make_count_non_owning_entries(
+    exec->run(assembly::make_count_non_owning_entries(
         input, partition.get(), local_part, send_sizes, send_positions,
         original_positions));
 
@@ -68,7 +68,7 @@ device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
     array<GlobalIndexType> recv_row_idxs{exec, n_recv};
     array<GlobalIndexType> recv_col_idxs{exec, n_recv};
     array<ValueType> recv_values{exec, n_recv};
-    exec->run(assembly_helpers::make_fill_send_buffers(
+    exec->run(assembly::make_fill_send_buffers(
         input, partition.get(), local_part, send_positions, original_positions,
         send_row_idxs, send_col_idxs, send_values));
 
diff --git a/core/distributed/assembly_helpers_kernels.hpp b/core/distributed/assembly_kernels.hpp
similarity index 90%
rename from core/distributed/assembly_helpers_kernels.hpp
rename to core/distributed/assembly_kernels.hpp
index 6adfc104487..d3f39558844 100644
--- a/core/distributed/assembly_helpers_kernels.hpp
+++ b/core/distributed/assembly_kernels.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
-#define GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
+#ifndef GKO_CORE_ASSEMBLY_KERNELS_HPP_
+#define GKO_CORE_ASSEMBLY_KERNELS_HPP_
 
 
 #include <ginkgo/core/base/array.hpp>
@@ -55,8 +55,7 @@ namespace kernels {
     GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType)
 
 
-GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(assembly_helpers,
-                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(assembly, GKO_DECLARE_ALL_AS_TEMPLATES);
 
 
 #undef GKO_DECLARE_ALL_AS_TEMPLATES
@@ -66,4 +65,4 @@ GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(assembly_helpers,
 }  // namespace gko
 
 
-#endif  // GKO_CORE_ASSEMBLY_HELPERS_KERNELS_HPP_
+#endif  // GKO_CORE_ASSEMBLY_KERNELS_HPP_
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 1dc2280cba5..442771c66b5 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -6,7 +6,7 @@
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
-#include <ginkgo/core/distributed/assembly_helpers.hpp>
+#include <ginkgo/core/distributed/assembly.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 15055cc6645..81a2a6034ea 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -20,7 +20,7 @@ target_sources(ginkgo_dpcpp
     base/timer.dp.cpp
     base/version.dp.cpp
     components/prefix_sum_kernels.dp.cpp
-    distributed/assembly_helpers_kernels.dp.cpp
+    distributed/assembly_kernels.dp.cpp
     distributed/index_map_kernels.dp.cpp
     distributed/matrix_kernels.dp.cpp
     distributed/partition_helpers_kernels.dp.cpp
diff --git a/dpcpp/distributed/assembly_helpers_kernels.dp.cpp b/dpcpp/distributed/assembly_kernels.dp.cpp
similarity index 88%
rename from dpcpp/distributed/assembly_helpers_kernels.dp.cpp
rename to dpcpp/distributed/assembly_kernels.dp.cpp
index ba5d3024425..e0cc872b783 100644
--- a/dpcpp/distributed/assembly_helpers_kernels.dp.cpp
+++ b/dpcpp/distributed/assembly_kernels.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
@@ -10,7 +10,7 @@
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-namespace assembly_helpers {
+namespace assembly {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -27,7 +27,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/include/ginkgo/core/distributed/assembly_helpers.hpp b/include/ginkgo/core/distributed/assembly.hpp
similarity index 90%
rename from include/ginkgo/core/distributed/assembly_helpers.hpp
rename to include/ginkgo/core/distributed/assembly.hpp
index fc6ffeb608f..5bf48fee619 100644
--- a/include/ginkgo/core/distributed/assembly_helpers.hpp
+++ b/include/ginkgo/core/distributed/assembly.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
-#define GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HPP_
 
 
 #include <ginkgo/config.hpp>
@@ -54,4 +54,4 @@ device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
 
 
 #endif  // GINKGO_BUILD_MPI
-#endif  // GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HELPERS_HPP_
+#endif  // GKO_PUBLIC_CORE_DISTRIBUTED_ASSEMBLY_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 78f6c80381a..c2eb2b4a134 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -58,7 +58,7 @@
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
 
-#include <ginkgo/core/distributed/assembly_helpers.hpp>
+#include <ginkgo/core/distributed/assembly.hpp>
 #include <ginkgo/core/distributed/base.hpp>
 #include <ginkgo/core/distributed/index_map.hpp>
 #include <ginkgo/core/distributed/lin_op.hpp>
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 945f174313f..700a01116c9 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -10,7 +10,7 @@ target_sources(ginkgo_omp
     base/scoped_device_id.cpp
     base/version.cpp
     components/prefix_sum_kernels.cpp
-    distributed/assembly_helpers_kernels.cpp
+    distributed/assembly_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/omp/distributed/assembly_helpers_kernels.cpp b/omp/distributed/assembly_kernels.cpp
similarity index 95%
rename from omp/distributed/assembly_helpers_kernels.cpp
rename to omp/distributed/assembly_kernels.cpp
index 7cdbefa05c3..9fa9976e607 100644
--- a/omp/distributed/assembly_helpers_kernels.cpp
+++ b/omp/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <algorithm>
 
@@ -19,7 +19,7 @@
 namespace gko {
 namespace kernels {
 namespace omp {
-namespace assembly_helpers {
+namespace assembly {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -77,7 +77,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 }  // namespace omp
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index c39c4680b9d..94e61d43d5c 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -12,7 +12,7 @@ target_sources(ginkgo_reference
     components/reduce_array_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/prefix_sum_kernels.cpp
-    distributed/assembly_helpers_kernels.cpp
+    distributed/assembly_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/reference/distributed/assembly_helpers_kernels.cpp b/reference/distributed/assembly_kernels.cpp
similarity index 97%
rename from reference/distributed/assembly_helpers_kernels.cpp
rename to reference/distributed/assembly_kernels.cpp
index 9f9632dd9d5..e38680243a0 100644
--- a/reference/distributed/assembly_helpers_kernels.cpp
+++ b/reference/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <algorithm>
 
@@ -16,7 +16,7 @@
 namespace gko {
 namespace kernels {
 namespace reference {
-namespace assembly_helpers {
+namespace assembly {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
@@ -101,7 +101,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
-}  // namespace assembly_helpers
+}  // namespace assembly
 }  // namespace reference
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/test/distributed/CMakeLists.txt b/reference/test/distributed/CMakeLists.txt
index 443eb05a03b..171974c01cb 100644
--- a/reference/test/distributed/CMakeLists.txt
+++ b/reference/test/distributed/CMakeLists.txt
@@ -1,4 +1,4 @@
-ginkgo_create_test(assembly_helpers_kernels)
+ginkgo_create_test(assembly_kernels)
 ginkgo_create_test(index_map_kernels)
 ginkgo_create_test(matrix_kernels)
 ginkgo_create_test(partition_helpers_kernels)
diff --git a/reference/test/distributed/assembly_helpers_kernels.cpp b/reference/test/distributed/assembly_kernels.cpp
similarity index 96%
rename from reference/test/distributed/assembly_helpers_kernels.cpp
rename to reference/test/distributed/assembly_kernels.cpp
index cdfd28853b5..89662b4efef 100644
--- a/reference/test/distributed/assembly_helpers_kernels.cpp
+++ b/reference/test/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <vector>
 
@@ -82,7 +82,7 @@ TYPED_TEST(AssemblyHelpers, CountOverlapEntries)
     for (gko::size_type i = 0; i < num_parts; i++) {
         send_count.fill(0);
 
-        gko::kernels::reference::assembly_helpers::count_non_owning_entries(
+        gko::kernels::reference::assembly::count_non_owning_entries(
             this->ref, input, partition.get(), i, send_count, send_positions,
             original_positions);
 
@@ -136,7 +136,7 @@ TYPED_TEST(AssemblyHelpers, FillOverlapSendBuffers)
         send_col_idxs.resize_and_reset(num_entries);
         send_values.resize_and_reset(num_entries);
 
-        gko::kernels::reference::assembly_helpers::fill_send_buffers(
+        gko::kernels::reference::assembly::fill_send_buffers(
             this->ref, input, partition.get(), i, send_positions[i],
             original_positions[i], send_row_idxs, send_col_idxs, send_values);
 
diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt
index 91a497020d5..db482164a01 100644
--- a/test/distributed/CMakeLists.txt
+++ b/test/distributed/CMakeLists.txt
@@ -1,4 +1,4 @@
-ginkgo_create_common_test(assembly_helpers_kernels DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_test(assembly_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(index_map_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(matrix_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp)
diff --git a/test/distributed/assembly_helpers_kernels.cpp b/test/distributed/assembly_kernels.cpp
similarity index 93%
rename from test/distributed/assembly_helpers_kernels.cpp
rename to test/distributed/assembly_kernels.cpp
index 4864ffec471..d1e2f708ca6 100644
--- a/test/distributed/assembly_helpers_kernels.cpp
+++ b/test/distributed/assembly_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/distributed/assembly_helpers_kernels.hpp"
+#include "core/distributed/assembly_kernels.hpp"
 
 #include <algorithm>
 
@@ -55,10 +55,10 @@ class AssemblyHelpers : public CommonTestFixture {
             gko::array<global_index_type> d_original_positions{exec,
                                                                num_entries};
 
-            gko::kernels::reference::assembly_helpers::count_non_owning_entries(
+            gko::kernels::reference::assembly::count_non_owning_entries(
                 ref, input, row_partition.get(), part, send_count,
                 send_positions, original_positions);
-            gko::kernels::GKO_DEVICE_NAMESPACE::assembly_helpers::
+            gko::kernels::GKO_DEVICE_NAMESPACE::assembly::
                 count_non_owning_entries(exec, d_input, d_row_partition.get(),
                                          part, d_send_count, d_send_positions,
                                          d_original_positions);
@@ -80,14 +80,13 @@ class AssemblyHelpers : public CommonTestFixture {
                                                           num_send_entries};
             gko::array<value_type> d_send_values{exec, num_send_entries};
 
-            gko::kernels::reference::assembly_helpers::fill_send_buffers(
+            gko::kernels::reference::assembly::fill_send_buffers(
                 ref, input, row_partition.get(), part, send_positions,
                 original_positions, send_row_idxs, send_col_idxs, send_values);
-            gko::kernels::GKO_DEVICE_NAMESPACE::assembly_helpers::
-                fill_send_buffers(exec, d_input, d_row_partition.get(), part,
-                                  d_send_positions, d_original_positions,
-                                  d_send_row_idxs, d_send_col_idxs,
-                                  d_send_values);
+            gko::kernels::GKO_DEVICE_NAMESPACE::assembly::fill_send_buffers(
+                exec, d_input, d_row_partition.get(), part, d_send_positions,
+                d_original_positions, d_send_row_idxs, d_send_col_idxs,
+                d_send_values);
 
             GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions);
             GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt
index cf49f2a4691..46b8294f550 100644
--- a/test/mpi/CMakeLists.txt
+++ b/test/mpi/CMakeLists.txt
@@ -1,4 +1,4 @@
-ginkgo_create_common_and_reference_test(assembly_helpers MPI_SIZE 3)
+ginkgo_create_common_and_reference_test(assembly MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3)
 ginkgo_create_common_and_reference_test(vector MPI_SIZE 3)
diff --git a/test/mpi/assembly_helpers.cpp b/test/mpi/assembly.cpp
similarity index 98%
rename from test/mpi/assembly_helpers.cpp
rename to test/mpi/assembly.cpp
index c16e5402f96..9db0eab553a 100644
--- a/test/mpi/assembly_helpers.cpp
+++ b/test/mpi/assembly.cpp
@@ -14,7 +14,7 @@
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/device_matrix_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
-#include <ginkgo/core/distributed/assembly_helpers.hpp>
+#include <ginkgo/core/distributed/assembly.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
 
 #include "core/test/utils.hpp"

From c7e5d2f0f32492d3ad151e475ecb9282feb39083 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 25 Sep 2024 15:43:21 +0200
Subject: [PATCH 324/448] add failing tests for symbolic input without fill-in.
 - bitmap: leads to wrong answer or segfault - hashmap: infinite loop

---
 test/factorization/lu_kernels.cpp | 79 +++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index d38b6346cd8..b94e362fcb3 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -12,6 +12,8 @@
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -349,3 +351,80 @@ TYPED_TEST(Lu, GenerateUnsymmWithUnknownSparsityIsEquivalentToRef)
                             r<value_type>::value);
     });
 }
+
+
+TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    // diag + full first row and column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                        {1.0, 1.0, 0.0, 0.0},
+                                                        {1.0, 0.0, 1.0, 0.0},
+                                                        {1.0, 0.0, 0.0, 1.0}},
+                                                       this->ref));
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto dsparsity = gko::share(sparsity->clone(this->exec));
+
+    auto factory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+    auto dfactory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(dsparsity)
+            .on(this->exec);
+
+    auto lu = factory->generate(mtx);
+    auto dlu = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), dlu->get_combined());
+    GKO_ASSERT_MTX_NEAR(lu->get_combined(), dlu->get_combined(),
+                        r<value_type>::value);
+}
+
+
+TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    int n = 68;
+    // the first row and third last row (n - 3) use hashmap for lookup table
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // add dependence
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    // add an entry whose col idx is not present in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(matrix_type::create(this->ref));
+    mtx->read(data);
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto dsparsity = gko::share(sparsity->clone(this->exec));
+    auto factory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+    auto dfactory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(dsparsity)
+            .on(this->exec);
+
+    auto lu = factory->generate(mtx);
+    auto dlu = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), dlu->get_combined());
+    GKO_ASSERT_MTX_NEAR(lu->get_combined(), dlu->get_combined(),
+                        r<value_type>::value);
+}

From 1591769894f9c30258a232807981f2df9d59f1bb Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 25 Sep 2024 16:40:08 +0200
Subject: [PATCH 325/448] fix infinite loop of lookup_hash_unsafe and add test
 in reference

---
 core/matrix/csr_lookup.hpp                  | 14 +++--
 reference/test/factorization/lu_kernels.cpp | 68 +++++++++++++++++++++
 2 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp
index a7b687c3618..50d9c087b68 100644
--- a/core/matrix/csr_lookup.hpp
+++ b/core/matrix/csr_lookup.hpp
@@ -183,7 +183,8 @@ struct device_sparsity_lookup {
             result = lookup_search_unsafe(col);
             break;
         }
-        GKO_ASSERT(local_cols[result] == col);
+        GKO_ASSERT(result >= 0 && result < row_nnz &&
+                   local_cols[result] == col);
         return result;
     }
 
@@ -230,7 +231,8 @@ struct device_sparsity_lookup {
         const auto out_idx =
             block_bases[block] +
             gko::detail::popcount(block_bitmaps[block] & prefix_mask);
-        GKO_ASSERT(local_cols[out_idx] == col);
+        GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz &&
+                   local_cols[out_idx] == col);
         return out_idx;
     }
 
@@ -262,15 +264,17 @@ struct device_sparsity_lookup {
             (static_cast<unsigned_index_type>(col) * hash_param) % hashmap_size;
         GKO_ASSERT(hashmap[hash] >= 0);
         GKO_ASSERT(hashmap[hash] < row_nnz);
-        while (local_cols[hashmap[hash]] != col) {
+        auto out_idx = hashmap[hash];
+        // linear probing with invalid_index sentinel to avoid infinite loop
+        while (out_idx >= 0 && local_cols[out_idx] != col) {
             hash++;
             if (hash >= hashmap_size) {
                 hash = 0;
             }
-            GKO_ASSERT(hashmap[hash] >= 0);
+            out_idx = hashmap[hash];
             GKO_ASSERT(hashmap[hash] < row_nnz);
         }
-        const auto out_idx = hashmap[hash];
+        // out_idx is either correct or invalid_index, the hashmap sentinel
         return out_idx;
     }
 
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index f4a8b240b38..d027968e97f 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -36,6 +36,8 @@ class Lu : public ::testing::Test {
     using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using matrix_type = gko::matrix::Csr<value_type, index_type>;
+    using sparsity_pattern_type =
+        gko::matrix::SparsityCsr<value_type, index_type>;
 
     Lu()
         : ref(gko::ReferenceExecutor::create()),
@@ -329,3 +331,69 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks)
         ASSERT_EQ(lu->get_diagonal(), nullptr);
     });
 }
+
+
+TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    // diag + full first row and column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                        {1.0, 1.0, 0.0, 0.0},
+                                                        {1.0, 0.0, 1.0, 0.0},
+                                                        {1.0, 0.0, 0.0, 1.0}},
+                                                       this->ref));
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto result = gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 0.0, 0.0, 0.0},
+                                                {1.0, 0.0, 0.0, 0.0},
+                                                {1.0, 0.0, 0.0, 0.0}},
+                                               this->ref);
+    auto factory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+
+    auto lu = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), mtx);
+    GKO_ASSERT_MTX_NEAR(lu->get_combined(), result, r<value_type>::value);
+}
+
+
+TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    int n = 68;
+    // the first row and third last row (n - 3) use hashmap for lookup table
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // add dependence
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    // add an entry whose col idx is not present in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(matrix_type::create(this->ref));
+    mtx->read(data);
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto factory =
+        gko::experimental::factorization::Lu<value_type, index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+
+    auto lu = factory->generate(mtx);
+
+    // the result combined matrix is the same as the original matrix
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), mtx);
+    GKO_ASSERT_MTX_NEAR(lu->get_combined(), mtx, r<value_type>::value);
+}
\ No newline at end of file

From 48a775c5e0565fd20bc5c75ff6bdc5cdc2716428 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 25 Sep 2024 21:15:36 +0200
Subject: [PATCH 326/448] add checked_lookup into LU

---
 common/cuda_hip/factorization/lu_kernels.cpp | 36 +++++++++++++++-----
 core/factorization/lu.cpp                    | 11 +++---
 core/factorization/lu_kernels.hpp            |  2 +-
 core/test/config/factorization.cpp           |  3 ++
 dpcpp/factorization/lu_kernels.dp.cpp        |  2 +-
 include/ginkgo/core/factorization/lu.hpp     |  9 +++++
 omp/factorization/lu_kernels.cpp             | 15 +++++---
 reference/factorization/lu_kernels.cpp       | 15 +++++---
 reference/test/factorization/lu_kernels.cpp  |  4 ++-
 test/factorization/lu_kernels.cpp            |  8 +++--
 10 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index aa432bf711c..6cb9b02129b 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize(
 }
 
 
-template <typename ValueType, typename IndexType>
+template <bool checked_lookup, typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void factorize(
     const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
     const IndexType* __restrict__ storage_offsets,
@@ -130,8 +130,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
              upper_nz += config::warp_size) {
             const auto upper_col = cols[upper_nz];
             const auto upper_val = vals[upper_nz];
-            const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin;
-            vals[output_pos] -= scale * upper_val;
+            if (checked_lookup) {
+                const auto pos = lookup[upper_col];
+                if (pos != invalid_index<IndexType>()) {
+                    vals[row_begin + pos] -= scale * upper_val;
+                }
+            } else {
+                const auto output_pos =
+                    lookup.lookup_unsafe(upper_col) + row_begin;
+                vals[output_pos] -= scale * upper_val;
+            }
         }
     }
     scheduler.mark_ready();
@@ -252,7 +260,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -260,11 +268,21 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         syncfree_storage storage(exec, tmp_storage, num_rows);
         const auto num_blocks =
             ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::factorize<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
-            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
-            lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
-            as_device_type(factors->get_values()), storage, num_rows);
+        if (checked_lookup) {
+            kernel::factorize<true>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    factors->get_const_row_ptrs(),
+                    factors->get_const_col_idxs(), lookup_offsets,
+                    lookup_storage, lookup_descs, diag_idxs,
+                    as_device_type(factors->get_values()), storage, num_rows);
+        } else {
+            kernel::factorize<false>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    factors->get_const_row_ptrs(),
+                    factors->get_const_col_idxs(), lookup_offsets,
+                    lookup_storage, lookup_descs, diag_idxs,
+                    as_device_type(factors->get_values()), storage, num_rows);
+        }
     }
 }
 
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index fb9cab4154a..3f2790e674a 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -69,6 +69,9 @@ Lu<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
+    if (auto& obj = config.get("checked_lookup")) {
+        params.with_checked_lookup(config::get_value<bool>(obj));
+    }
 
     return params;
 }
@@ -160,10 +163,10 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
         storage.get_const_data(), diag_idxs.get_data(), factors.get()));
     // run numerical factorization
     array<int> tmp{exec};
-    exec->run(make_factorize(storage_offsets.get_const_data(),
-                             row_descs.get_const_data(),
-                             storage.get_const_data(),
-                             diag_idxs.get_const_data(), factors.get(), tmp));
+    exec->run(make_factorize(
+        storage_offsets.get_const_data(), row_descs.get_const_data(),
+        storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
+        parameters_.checked_lookup, tmp));
     return factorization_type::create_from_combined_lu(std::move(factors));
 }
 
diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp
index f497398cb90..581c50bcf6e 100644
--- a/core/factorization/lu_kernels.hpp
+++ b/core/factorization/lu_kernels.hpp
@@ -33,7 +33,7 @@ namespace kernels {
                    const IndexType* lookup_offsets, const int64* lookup_descs, \
                    const int32* lookup_storage, const IndexType* diag_idxs,    \
                    matrix::Csr<ValueType, IndexType>* factors,                 \
-                   array<int>& tmp_storage)
+                   bool checked_lookup, array<int>& tmp_storage)
 
 
 #define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType)                  \
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index f5a4b19d3d9..9ee196222d3 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -178,6 +178,8 @@ struct Lu : FactorizationConfigTest<
             gko::experimental::factorization::symbolic_type::near_symmetric);
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
+        config_map["checked_lookup"] = pnode{true};
+        param.with_checked_lookup(true);
     }
 
     template <typename AnswerType>
@@ -190,6 +192,7 @@ struct Lu : FactorizationConfigTest<
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.symbolic_algorithm, ans_param.symbolic_algorithm);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
+        ASSERT_EQ(res_param.checked_lookup, ans_param.checked_lookup);
     }
 };
 
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index a891b5b7b2f..d6a1c2ed5b2 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -39,7 +39,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index d00f5a111b3..9fba621548e 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -97,6 +97,15 @@ class Lu
          * incorrect results or crash.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
+
+        /**
+         * The symbolic factoization should contains the fill-in information. If
+         * it is not the case (like Ilu), users might face hang or illegal
+         * access issue. Please enable this option when the symbolic
+         * factorization does not contain the full fill-in information. Symbolic
+         * factorization must still contain the entry for the original matrix.
+         */
+        bool GKO_FACTORY_PARAMETER_SCALAR(checked_lookup, false);
     };
 
     /**
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index 53847ff2b6c..95ff7e3087f 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -66,7 +66,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -89,8 +89,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                const auto nz = row_begin + lookup.lookup_unsafe(col);
-                vals[nz] -= scale * val;
+                if (checked_lookup) {
+                    const auto idx = lookup[col];
+                    if (idx != invalid_index<IndexType>()) {
+                        vals[row_begin + idx] -= scale * val;
+                    }
+                } else {
+                    const auto nz = row_begin + lookup.lookup_unsafe(col);
+                    vals[nz] -= scale * val;
+                }
             }
         }
     }
@@ -185,4 +192,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace lu_factorization
 }  // namespace omp
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index d8516cffb49..b6cafd590a5 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -65,7 +65,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -87,8 +87,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                const auto nz = row_begin + lookup.lookup_unsafe(col);
-                vals[nz] -= scale * val;
+                if (checked_lookup) {
+                    const auto idx = lookup[col];
+                    if (idx != invalid_index<IndexType>()) {
+                        vals[row_begin + idx] -= scale * val;
+                    }
+                } else {
+                    const auto nz = row_begin + lookup.lookup_unsafe(col);
+                    vals[nz] -= scale * val;
+                }
             }
         }
     }
@@ -182,4 +189,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace lu_factorization
 }  // namespace reference
 }  // namespace kernels
-}  // namespace gko
+}  // namespace gko
\ No newline at end of file
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index d027968e97f..5233a76f117 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -218,7 +218,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks)
         gko::kernels::reference::lu_factorization::factorize(
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_const_data(), this->mtx_lu.get(), tmp);
+            diag_idxs.get_const_data(), this->mtx_lu.get(), false, tmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref,
                             15 * r<value_type>::value);
@@ -356,6 +356,7 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_checked_lookup(true)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
@@ -389,6 +390,7 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_checked_lookup(true)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index b94e362fcb3..baec66d603f 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -196,11 +196,11 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
         gko::kernels::reference::lu_factorization::factorize(
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_const_data(), this->mtx_lu.get(), tmp);
+            diag_idxs.get_const_data(), this->mtx_lu.get(), false, tmp);
         gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
-            ddiag_idxs.get_const_data(), this->dmtx_lu.get(), dtmp);
+            ddiag_idxs.get_const_data(), this->dmtx_lu.get(), false, dtmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_lu, this->dmtx_lu, r<value_type>::value);
     });
@@ -374,10 +374,12 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_checked_lookup(true)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
+            .with_checked_lookup(true)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);
@@ -415,10 +417,12 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_checked_lookup(true)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
+            .with_checked_lookup(true)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);

From 0dcdbfffd655875727cd05a4d468b5198d046818 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 25 Sep 2024 22:51:25 +0200
Subject: [PATCH 327/448] add ilu syncfree through lu implementation

---
 core/factorization/ilu.cpp                | 50 +++++++++++++++++++----
 core/test/config/factorization.cpp        |  3 ++
 include/ginkgo/core/factorization/ilu.hpp | 17 ++++++++
 test/factorization/CMakeLists.txt         |  2 +-
 test/factorization/ilu_kernels.cpp        | 28 +++++++++++++
 5 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 41df4065979..015a5829493 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -10,6 +10,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/factorization/lu.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/base/array_access.hpp"
 #include "core/config/config_helper.hpp"
@@ -52,6 +54,17 @@ Ilu<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
+    if (auto& obj = config.get("algorithm")) {
+        using gko::factorization::factorize_algorithm;
+        auto str = obj.get_string();
+        if (str == "sparselib") {
+            params.with_algorithm(factorize_algorithm::sparselib);
+        } else if (str == "syncfree") {
+            params.with_algorithm(factorize_algorithm::syncfree);
+        } else {
+            GKO_INVALID_CONFIG_VALUE("algorithm", str);
+        }
+    }
     return params;
 }
 
@@ -66,7 +79,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
 
     // Converts the system matrix to CSR.
     // Throws an exception if it is not convertible.
-    auto local_system_matrix = matrix_type::create(exec);
+    auto local_system_matrix = share(matrix_type::create(exec));
+    std::shared_ptr<const matrix_type> ilu;
     as<ConvertibleTo<matrix_type>>(system_matrix.get())
         ->convert_to(local_system_matrix);
 
@@ -79,16 +93,36 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         local_system_matrix.get(), false));
 
     // Compute LU factorization
-    exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get()));
-
+    if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
+        parameters_.algorithm == factorize_algorithm::syncfree) {
+        auto sparsity =
+            share(gko::matrix::SparsityCsr<ValueType, IndexType>::create_const(
+                exec, local_system_matrix->get_size(),
+                make_const_array_view(
+                    exec, local_system_matrix->get_num_stored_elements(),
+                    local_system_matrix->get_const_col_idxs()),
+                make_const_array_view(
+                    exec, local_system_matrix->get_size()[0] + 1,
+                    local_system_matrix->get_const_row_ptrs())));
+        ilu =
+            gko::experimental::factorization::Lu<ValueType, IndexType>::build()
+                .with_checked_lookup(true)
+                .with_symbolic_factorization(sparsity)
+                .on(exec)
+                ->generate(local_system_matrix)
+                ->get_combined();
+    } else {
+        exec->run(
+            ilu_factorization::make_compute_ilu(local_system_matrix.get()));
+        ilu = local_system_matrix;
+    }
     // Separate L and U factors: nnz
-    const auto matrix_size = local_system_matrix->get_size();
+    const auto matrix_size = ilu->get_size();
     const auto num_rows = matrix_size[0];
     array<IndexType> l_row_ptrs{exec, num_rows + 1};
     array<IndexType> u_row_ptrs{exec, num_rows + 1};
     exec->run(ilu_factorization::make_initialize_row_ptrs_l_u(
-        local_system_matrix.get(), l_row_ptrs.get_data(),
-        u_row_ptrs.get_data()));
+        ilu.get(), l_row_ptrs.get_data(), u_row_ptrs.get_data()));
 
     // Get nnz from device memory
     auto l_nnz = static_cast<size_type>(get_element(l_row_ptrs, num_rows));
@@ -107,8 +141,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         std::move(u_row_ptrs), parameters_.u_strategy);
 
     // Separate L and U: columns and values
-    exec->run(ilu_factorization::make_initialize_l_u(
-        local_system_matrix.get(), l_factor.get(), u_factor.get()));
+    exec->run(ilu_factorization::make_initialize_l_u(ilu.get(), l_factor.get(),
+                                                     u_factor.get()));
 
     return Composition<ValueType>::create(std::move(l_factor),
                                           std::move(u_factor));
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 9ee196222d3..014fb5e346d 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -111,6 +111,8 @@ struct Ilu : FactorizationConfigTest<gko::factorization::Ilu<float, int>,
                 typename gko::matrix::Csr<float, int>::sparselib>());
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
+        config_map["algorithm"] = pnode{"syncfree"};
+        param.with_algorithm(gko::factorization::factorize_algorithm::syncfree);
     }
 
     template <typename AnswerType>
@@ -122,6 +124,7 @@ struct Ilu : FactorizationConfigTest<gko::factorization::Ilu<float, int>,
         check_strategy(res_param.l_strategy, ans_param.l_strategy);
         check_strategy(res_param.u_strategy, ans_param.u_strategy);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
+        ASSERT_EQ(res_param.algorithm, ans_param.algorithm);
     }
 };
 
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index 80f11ab7b6f..2de5ea8bca8 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -25,6 +25,14 @@ namespace gko {
 namespace factorization {
 
 
+/**
+ * A helper for algorithm selection in the incomplete factorization.
+ * sparselib is only available for cuda and hip.
+ * syncfree is Ginkgo's implementation through the Lu factorization with given
+ * sparsity.
+ */
+enum class factorize_algorithm { sparselib, syncfree };
+
 /**
  * Represents an incomplete LU factorization -- ILU(0) -- of a sparse matrix.
  *
@@ -94,6 +102,15 @@ class Ilu : public Composition<ValueType> {
          * incorrect.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
+
+        /**
+         * Select the implementation which is supposed to be used for
+         * the incomplete factorization. This only matters for the Cuda and Hip
+         * executor where the choice is between the Ginkgo (syncfree) and the
+         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
+         */
+        factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, factorize_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/test/factorization/CMakeLists.txt b/test/factorization/CMakeLists.txt
index 8b5aa51287b..5f0bc8b7f30 100644
--- a/test/factorization/CMakeLists.txt
+++ b/test/factorization/CMakeLists.txt
@@ -2,7 +2,7 @@ ginkgo_create_common_test(cholesky_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(factorization_kernels)
 ginkgo_create_common_test(lu_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp omp)
-ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp omp)
+ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(par_ic_kernels)
 ginkgo_create_common_test(par_ict_kernels)
 ginkgo_create_common_test(par_ilu_kernels)
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index 004b0d34a4f..297f0d6d922 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -55,6 +55,26 @@ TEST_F(Ilu, ComputeILUIsEquivalentToRefSorted)
 }
 
 
+TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
+{
+    auto fact = gko::factorization::Ilu<>::build()
+                    .with_skip_sorting(true)
+                    .on(ref)
+                    ->generate(mtx);
+    auto dfact =
+        gko::factorization::Ilu<>::build()
+            .with_skip_sorting(true)
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(exec)
+            ->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), dfact->get_u_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_u_factor(), dfact->get_u_factor());
+}
+
+
 TEST_F(Ilu, ComputeILUIsEquivalentToRefUnsorted)
 {
     gko::test::unsort_matrix(mtx, rand_engine);
@@ -74,11 +94,19 @@ TEST_F(Ilu, SetsCorrectStrategy)
 {
     auto dfact = gko::factorization::Ilu<>::build()
                      .with_l_strategy(std::make_shared<Csr::merge_path>())
+#ifdef GKO_COMPILING_OMP
+                     .with_u_strategy(std::make_shared<Csr::merge_path>())
+#else
                      .with_u_strategy(std::make_shared<Csr::load_balance>(exec))
+#endif
                      .on(exec)
                      ->generate(dmtx);
 
     ASSERT_EQ(dfact->get_l_factor()->get_strategy()->get_name(), "merge_path");
+#ifdef GKO_COMPILING_OMP
+    ASSERT_EQ(dfact->get_u_factor()->get_strategy()->get_name(), "merge_path");
+#else
     ASSERT_EQ(dfact->get_u_factor()->get_strategy()->get_name(),
               "load_balance");
+#endif
 }

From 5a25365e0d6e9c69d4bf42e8b8ea352f736d584d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 30 Sep 2024 11:45:17 +0200
Subject: [PATCH 328/448] update the documentation, change checked_lookup ->
 has_all_fillin (opposite behavior).

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
Co-authored-by: Natalie Beams <246972+nbeams@users.noreply.github.com>
---
 common/cuda_hip/factorization/ilu_kernels.cpp |  6 +++---
 common/cuda_hip/factorization/lu_kernels.cpp  | 12 ++++++------
 core/device_hooks/common_kernels.inc.cpp      |  2 +-
 core/factorization/ilu.cpp                    |  4 ++--
 core/factorization/ilu_kernels.hpp            |  8 ++++----
 core/factorization/lu.cpp                     |  6 +++---
 core/factorization/lu_kernels.hpp             |  2 +-
 core/test/config/factorization.cpp            |  6 +++---
 dpcpp/factorization/ilu_kernels.dp.cpp        |  6 +++---
 dpcpp/factorization/lu_kernels.dp.cpp         |  2 +-
 include/ginkgo/core/factorization/ilu.hpp     |  6 +++---
 include/ginkgo/core/factorization/lu.hpp      | 13 +++++++------
 omp/factorization/ilu_kernels.cpp             |  6 +++---
 omp/factorization/lu_kernels.cpp              |  4 ++--
 reference/factorization/ilu_kernels.cpp       |  6 +++---
 reference/factorization/lu_kernels.cpp        |  4 ++--
 reference/test/factorization/lu_kernels.cpp   |  4 ++--
 test/factorization/lu_kernels.cpp             |  8 ++++----
 18 files changed, 53 insertions(+), 52 deletions(-)

diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp
index 0469b80fe86..b81f8fb9092 100644
--- a/common/cuda_hip/factorization/ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -17,8 +17,8 @@ namespace ilu_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
-                matrix::Csr<ValueType, IndexType>* m)
+void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
     auto handle = exec->get_sparselib_handle();
@@ -55,7 +55,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index 6cb9b02129b..8c69f1f071d 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize(
 }
 
 
-template <bool checked_lookup, typename ValueType, typename IndexType>
+template <bool has_all_fillin, typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void factorize(
     const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
     const IndexType* __restrict__ storage_offsets,
@@ -130,7 +130,7 @@ __global__ __launch_bounds__(default_block_size) void factorize(
              upper_nz += config::warp_size) {
             const auto upper_col = cols[upper_nz];
             const auto upper_val = vals[upper_nz];
-            if (checked_lookup) {
+            if (!has_all_fillin) {
                 const auto pos = lookup[upper_col];
                 if (pos != invalid_index<IndexType>()) {
                     vals[row_begin + pos] -= scale * upper_val;
@@ -260,7 +260,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
+               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -268,15 +268,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         syncfree_storage storage(exec, tmp_storage, num_rows);
         const auto num_blocks =
             ceildiv(num_rows, default_block_size / config::warp_size);
-        if (checked_lookup) {
-            kernel::factorize<true>
+        if (!has_all_fillin) {
+            kernel::factorize<false>
                 <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                     factors->get_const_row_ptrs(),
                     factors->get_const_col_idxs(), lookup_offsets,
                     lookup_storage, lookup_descs, diag_idxs,
                     as_device_type(factors->get_values()), storage, num_rows);
         } else {
-            kernel::factorize<false>
+            kernel::factorize<true>
                 <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                     factors->get_const_row_ptrs(),
                     factors->get_const_col_idxs(), lookup_offsets,
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 3fdd6dc0b5a..dc789271e5f 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -909,7 +909,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
 namespace ilu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 015a5829493..dccfabcb51a 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -26,7 +26,7 @@ namespace ilu_factorization {
 namespace {
 
 
-GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::compute_lu);
+GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::sparselib_ilu);
 GKO_REGISTER_OPERATION(add_diagonal_elements,
                        factorization::add_diagonal_elements);
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
@@ -106,7 +106,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
                     local_system_matrix->get_const_row_ptrs())));
         ilu =
             gko::experimental::factorization::Lu<ValueType, IndexType>::build()
-                .with_checked_lookup(true)
+                .with_has_all_fillin(false)
                 .with_symbolic_factorization(sparsity)
                 .on(exec)
                 ->generate(local_system_matrix)
diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp
index 2371c17fda4..ef90764b141 100644
--- a/core/factorization/ilu_kernels.hpp
+++ b/core/factorization/ilu_kernels.hpp
@@ -20,14 +20,14 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)  \
-    void compute_lu(std::shared_ptr<const DefaultExecutor> exec, \
-                    matrix::Csr<ValueType, IndexType>* system_matrix)
+#define GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)  \
+    void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec, \
+                       matrix::Csr<ValueType, IndexType>* system_matrix)
 
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                  \
     template <typename ValueType, typename IndexType> \
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(ilu_factorization,
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index 3f2790e674a..31d69b2a993 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -69,8 +69,8 @@ Lu<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
-    if (auto& obj = config.get("checked_lookup")) {
-        params.with_checked_lookup(config::get_value<bool>(obj));
+    if (auto& obj = config.get("has_all_fillin")) {
+        params.with_has_all_fillin(config::get_value<bool>(obj));
     }
 
     return params;
@@ -166,7 +166,7 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
     exec->run(make_factorize(
         storage_offsets.get_const_data(), row_descs.get_const_data(),
         storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
-        parameters_.checked_lookup, tmp));
+        parameters_.has_all_fillin, tmp));
     return factorization_type::create_from_combined_lu(std::move(factors));
 }
 
diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp
index 581c50bcf6e..b2d398985fd 100644
--- a/core/factorization/lu_kernels.hpp
+++ b/core/factorization/lu_kernels.hpp
@@ -33,7 +33,7 @@ namespace kernels {
                    const IndexType* lookup_offsets, const int64* lookup_descs, \
                    const int32* lookup_storage, const IndexType* diag_idxs,    \
                    matrix::Csr<ValueType, IndexType>* factors,                 \
-                   bool checked_lookup, array<int>& tmp_storage)
+                   bool has_all_fillin, array<int>& tmp_storage)
 
 
 #define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType)                  \
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 014fb5e346d..8fc8d138f28 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -181,8 +181,8 @@ struct Lu : FactorizationConfigTest<
             gko::experimental::factorization::symbolic_type::near_symmetric);
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
-        config_map["checked_lookup"] = pnode{true};
-        param.with_checked_lookup(true);
+        config_map["has_all_fillin"] = pnode{false};
+        param.with_has_all_fillin(false);
     }
 
     template <typename AnswerType>
@@ -195,7 +195,7 @@ struct Lu : FactorizationConfigTest<
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.symbolic_algorithm, ans_param.symbolic_algorithm);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
-        ASSERT_EQ(res_param.checked_lookup, ans_param.checked_lookup);
+        ASSERT_EQ(res_param.has_all_fillin, ans_param.has_all_fillin);
     }
 };
 
diff --git a/dpcpp/factorization/ilu_kernels.dp.cpp b/dpcpp/factorization/ilu_kernels.dp.cpp
index 78fd24c08f4..847547f7706 100644
--- a/dpcpp/factorization/ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/ilu_kernels.dp.cpp
@@ -17,11 +17,11 @@ namespace ilu_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
-                matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
+void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index d6a1c2ed5b2..9fdf1165043 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -39,7 +39,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
+               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index 2de5ea8bca8..d527d8c9912 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -27,7 +27,7 @@ namespace factorization {
 
 /**
  * A helper for algorithm selection in the incomplete factorization.
- * sparselib is only available for cuda and hip.
+ * sparselib is only available for CUDA and HIP.
  * syncfree is Ginkgo's implementation through the Lu factorization with given
  * sparsity.
  */
@@ -105,9 +105,9 @@ class Ilu : public Composition<ValueType> {
 
         /**
          * Select the implementation which is supposed to be used for
-         * the incomplete factorization. This only matters for the Cuda and Hip
+         * the incomplete factorization. This only matters for the CUDA and HIP
          * executor where the choice is between the Ginkgo (syncfree) and the
-         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
+         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
          */
         factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
             algorithm, factorize_algorithm::sparselib);
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index 9fba621548e..e803d8d1b03 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -99,13 +99,14 @@ class Lu
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
 
         /**
-         * The symbolic factoization should contains the fill-in information. If
-         * it is not the case (like Ilu), users might face hang or illegal
-         * access issue. Please enable this option when the symbolic
-         * factorization does not contain the full fill-in information. Symbolic
-         * factorization must still contain the entry for the original matrix.
+         * The symbolic factorization contains the fill-in for the matrix. If it
+         * does not have full fill-in, as in Ilu, this parameter must be set to
+         * false in order to avoid the possibility of hanging or illegal memory
+         * accesses during the factorization process. When this is false, the
+         * symbolic factorization must still contain the non-zero locations in
+         * the original matrix, at minimum.
          */
-        bool GKO_FACTORY_PARAMETER_SCALAR(checked_lookup, false);
+        bool GKO_FACTORY_PARAMETER_SCALAR(has_all_fillin, true);
     };
 
     /**
diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp
index 70982c80753..b88e6a77900 100644
--- a/omp/factorization/ilu_kernels.cpp
+++ b/omp/factorization/ilu_kernels.cpp
@@ -17,11 +17,11 @@ namespace ilu_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
-                matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
+void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index 95ff7e3087f..c18bda37cec 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -66,7 +66,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
+               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -89,7 +89,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (checked_lookup) {
+                if (!has_all_fillin) {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
index fdbe8a9e86f..3323e0b6cef 100644
--- a/reference/factorization/ilu_kernels.cpp
+++ b/reference/factorization/ilu_kernels.cpp
@@ -23,8 +23,8 @@ namespace ilu_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
-                matrix::Csr<ValueType, IndexType>* m)
+void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::Csr<ValueType, IndexType>* m)
 {
     vector<IndexType> diagonals{m->get_size()[0], -1, exec};
     const auto row_ptrs = m->get_const_row_ptrs();
@@ -66,7 +66,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index b6cafd590a5..5bf159a6de5 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -65,7 +65,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool checked_lookup,
+               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -87,7 +87,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (checked_lookup) {
+                if (!has_all_fillin) {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 5233a76f117..cfc86eba6d3 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -356,7 +356,7 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
@@ -390,7 +390,7 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index baec66d603f..7ad036df85e 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -374,12 +374,12 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);
@@ -417,12 +417,12 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
-            .with_checked_lookup(true)
+            .with_has_all_fillin(false)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);

From ecad47dacc00a58437cf1f7d64effb3372674e4d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 30 Sep 2024 13:51:22 +0200
Subject: [PATCH 329/448] use unpack directly from factorization and add unpack
 with strategy

---
 core/factorization/factorization.cpp          | 38 +++++++++----
 core/factorization/ilu.cpp                    | 22 ++++----
 .../core/factorization/factorization.hpp      | 13 ++++-
 .../test/factorization/factorization.cpp      | 54 +++++++++++++++++++
 4 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 1df1f49aa13..4a16e84a1e0 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -31,10 +31,25 @@ GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
 
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Factorization<ValueType, IndexType>>
-Factorization<ValueType, IndexType>::unpack() const
+Factorization<ValueType, IndexType>::unpack(
+    std::shared_ptr<typename matrix_type::strategy_type> lower_factor_strategy,
+    std::shared_ptr<typename matrix_type::strategy_type> upper_factor_strategy)
+    const
 {
     const auto exec = this->get_executor();
     const auto size = this->get_size();
+    auto create_matrix = [](auto exec, auto size, auto vals, auto col_idxs,
+                            auto row_ptrs, auto strategy) {
+        if (strategy == nullptr) {
+            return matrix_type::create(exec, size, std::move(vals),
+                                       std::move(col_idxs),
+                                       std::move(row_ptrs));
+        } else {
+            return matrix_type::create(exec, size, std::move(vals),
+                                       std::move(col_idxs), std::move(row_ptrs),
+                                       strategy);
+        }
+    };
     switch (this->get_storage_type()) {
     case storage_type::empty:
         GKO_NOT_SUPPORTED(nullptr);
@@ -53,12 +68,14 @@ Factorization<ValueType, IndexType>::unpack() const
         const auto u_nnz =
             static_cast<size_type>(get_element(u_row_ptrs, size[0]));
         // create matrices
-        auto l_mtx = matrix_type::create(
-            exec, size, array<value_type>{exec, l_nnz},
-            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
-        auto u_mtx = matrix_type::create(
-            exec, size, array<value_type>{exec, u_nnz},
-            array<index_type>{exec, u_nnz}, std::move(u_row_ptrs));
+        auto l_mtx =
+            create_matrix(exec, size, array<value_type>{exec, l_nnz},
+                          array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
+                          lower_factor_strategy);
+        auto u_mtx =
+            create_matrix(exec, size, array<value_type>{exec, u_nnz},
+                          array<index_type>{exec, u_nnz}, std::move(u_row_ptrs),
+                          upper_factor_strategy);
         // fill matrices
         exec->run(make_initialize_l_u(mtx.get(), l_mtx.get(), u_mtx.get()));
         return create_from_composition(
@@ -72,9 +89,10 @@ Factorization<ValueType, IndexType>::unpack() const
         const auto l_nnz =
             static_cast<size_type>(get_element(l_row_ptrs, size[0]));
         // create matrices
-        auto l_mtx = matrix_type::create(
-            exec, size, array<value_type>{exec, l_nnz},
-            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+        auto l_mtx =
+            create_matrix(exec, size, array<value_type>{exec, l_nnz},
+                          array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
+                          lower_factor_strategy);
         // fill matrices
         exec->run(make_initialize_l(mtx.get(), l_mtx.get(), false));
         auto u_mtx = l_mtx->conj_transpose();
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index dccfabcb51a..32812f699c1 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -80,7 +80,6 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     // Converts the system matrix to CSR.
     // Throws an exception if it is not convertible.
     auto local_system_matrix = share(matrix_type::create(exec));
-    std::shared_ptr<const matrix_type> ilu;
     as<ConvertibleTo<matrix_type>>(system_matrix.get())
         ->convert_to(local_system_matrix);
 
@@ -104,25 +103,26 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
                 make_const_array_view(
                     exec, local_system_matrix->get_size()[0] + 1,
                     local_system_matrix->get_const_row_ptrs())));
-        ilu =
+        auto unpack =
             gko::experimental::factorization::Lu<ValueType, IndexType>::build()
                 .with_has_all_fillin(false)
                 .with_symbolic_factorization(sparsity)
                 .on(exec)
                 ->generate(local_system_matrix)
-                ->get_combined();
-    } else {
-        exec->run(
-            ilu_factorization::make_compute_ilu(local_system_matrix.get()));
-        ilu = local_system_matrix;
+                ->unpack(parameters_.l_strategy, parameters_.u_strategy);
+        return Composition<ValueType>::create(unpack->get_lower_factor(),
+                                              unpack->get_upper_factor());
     }
+    exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get()));
+
     // Separate L and U factors: nnz
-    const auto matrix_size = ilu->get_size();
+    const auto matrix_size = local_system_matrix->get_size();
     const auto num_rows = matrix_size[0];
     array<IndexType> l_row_ptrs{exec, num_rows + 1};
     array<IndexType> u_row_ptrs{exec, num_rows + 1};
     exec->run(ilu_factorization::make_initialize_row_ptrs_l_u(
-        ilu.get(), l_row_ptrs.get_data(), u_row_ptrs.get_data()));
+        local_system_matrix.get(), l_row_ptrs.get_data(),
+        u_row_ptrs.get_data()));
 
     // Get nnz from device memory
     auto l_nnz = static_cast<size_type>(get_element(l_row_ptrs, num_rows));
@@ -141,8 +141,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         std::move(u_row_ptrs), parameters_.u_strategy);
 
     // Separate L and U: columns and values
-    exec->run(ilu_factorization::make_initialize_l_u(ilu.get(), l_factor.get(),
-                                                     u_factor.get()));
+    exec->run(ilu_factorization::make_initialize_l_u(
+        local_system_matrix.get(), l_factor.get(), u_factor.get()));
 
     return Composition<ValueType>::create(std::move(l_factor),
                                           std::move(u_factor));
diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp
index 39345f59a44..5b203fb7e09 100644
--- a/include/ginkgo/core/factorization/factorization.hpp
+++ b/include/ginkgo/core/factorization/factorization.hpp
@@ -88,10 +88,21 @@ class Factorization : public EnableLinOp<Factorization<ValueType, IndexType>> {
      * for triangular solves to a composition representation that can also be
      * used to access individual factors and multiply with the factorization.
      *
+     * @param lower_factor_strategy  the Csr strategy for the lower factor and
+     *                               the transposed lower factor.
+     * @param upper_factor_strategy  the Csr strategy for the upper factor
+     *
      * @return  a new Factorization object containing this factorization
      *          represented as storage_type::composition.
+     *
+     * @note The strategy only has effect when it is unpacked from the combined
+     * matrix.
      */
-    std::unique_ptr<Factorization> unpack() const;
+    std::unique_ptr<Factorization> unpack(
+        std::shared_ptr<typename matrix_type::strategy_type>
+            lower_factor_strategy = nullptr,
+        std::shared_ptr<typename matrix_type::strategy_type>
+            upper_factor_strategy = nullptr) const;
 
     /** Returns the storage type used by this factorization. */
     storage_type get_storage_type() const;
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index 2ded81d4867..402307e452a 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -252,6 +252,31 @@ TYPED_TEST(Factorization, UnpackCombinedLUWorks)
 }
 
 
+TYPED_TEST(Factorization, UnpackCombinedLUWorksWithStrategy)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    auto fact = factorization_type::create_from_combined_lu(
+        this->combined_mtx->clone());
+
+    auto separated =
+        fact->unpack(std::make_shared<typename matrix_type::classical>(),
+                     std::make_shared<typename matrix_type::merge_path>());
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0);
+    GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx,
+                        0.0);
+    ASSERT_EQ(separated->get_lower_factor()->get_strategy()->get_name(),
+              "classical");
+    ASSERT_EQ(separated->get_upper_factor()->get_strategy()->get_name(),
+              "merge_path");
+}
+
+
 TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks)
 {
     using matrix_type = typename TestFixture::matrix_type;
@@ -273,6 +298,35 @@ TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks)
 }
 
 
+TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorksWithStrategy)
+{
+    using matrix_type = typename TestFixture::matrix_type;
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::create_from_combined_cholesky(
+        this->combined_mtx->clone());
+
+    // second one is ignored in cholesky to keep the same behavior as
+    // factorization::Ic
+    auto separated =
+        fact->unpack(std::make_shared<typename matrix_type::classical>(),
+                     std::make_shared<typename matrix_type::merge_path>());
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::symm_composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_cholesky_mtx,
+                        0.0);
+    GKO_ASSERT_MTX_NEAR(
+        separated->get_upper_factor(),
+        gko::as<matrix_type>(this->lower_cholesky_mtx->conj_transpose()), 0.0);
+    ASSERT_EQ(separated->get_lower_factor()->get_strategy()->get_name(),
+              "classical");
+    ASSERT_EQ(separated->get_upper_factor()->get_strategy()->get_name(),
+              "classical");
+}
+
+
 TYPED_TEST(Factorization, UnpackCompositionWorks)
 {
     using factorization_type = typename TestFixture::factorization_type;

From 4494990512f78308ae02ed7728caf3d5e7a42c8e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 30 Sep 2024 13:57:01 +0200
Subject: [PATCH 330/448] remove the duplicated initialization

---
 core/factorization/lu.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index 31d69b2a993..e997b053947 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -154,10 +154,6 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
         factors->get_const_row_ptrs(), factors->get_const_col_idxs(), num_rows,
         allowed_sparsity, storage_offsets.get_const_data(),
         row_descs.get_data(), storage.get_data()));
-    // initialize factors
-    exec->run(make_fill_array(factors->get_values(),
-                              factors->get_num_stored_elements(),
-                              zero<ValueType>()));
     exec->run(make_initialize(
         mtx.get(), storage_offsets.get_const_data(), row_descs.get_const_data(),
         storage.get_const_data(), diag_idxs.get_data(), factors.get()));

From 690946c72bab1fb7c6f8f9f2be85faba3660d8df Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 10:24:46 +0100
Subject: [PATCH 331/448] Revert "use unpack directly from factorization and
 add unpack with strategy" to avoid new strategy input

---
 core/factorization/factorization.cpp          | 38 ++++---------
 core/factorization/ilu.cpp                    | 22 ++++----
 .../core/factorization/factorization.hpp      | 13 +----
 .../test/factorization/factorization.cpp      | 54 -------------------
 4 files changed, 22 insertions(+), 105 deletions(-)

diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 4a16e84a1e0..1df1f49aa13 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -31,25 +31,10 @@ GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
 
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Factorization<ValueType, IndexType>>
-Factorization<ValueType, IndexType>::unpack(
-    std::shared_ptr<typename matrix_type::strategy_type> lower_factor_strategy,
-    std::shared_ptr<typename matrix_type::strategy_type> upper_factor_strategy)
-    const
+Factorization<ValueType, IndexType>::unpack() const
 {
     const auto exec = this->get_executor();
     const auto size = this->get_size();
-    auto create_matrix = [](auto exec, auto size, auto vals, auto col_idxs,
-                            auto row_ptrs, auto strategy) {
-        if (strategy == nullptr) {
-            return matrix_type::create(exec, size, std::move(vals),
-                                       std::move(col_idxs),
-                                       std::move(row_ptrs));
-        } else {
-            return matrix_type::create(exec, size, std::move(vals),
-                                       std::move(col_idxs), std::move(row_ptrs),
-                                       strategy);
-        }
-    };
     switch (this->get_storage_type()) {
     case storage_type::empty:
         GKO_NOT_SUPPORTED(nullptr);
@@ -68,14 +53,12 @@ Factorization<ValueType, IndexType>::unpack(
         const auto u_nnz =
             static_cast<size_type>(get_element(u_row_ptrs, size[0]));
         // create matrices
-        auto l_mtx =
-            create_matrix(exec, size, array<value_type>{exec, l_nnz},
-                          array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
-                          lower_factor_strategy);
-        auto u_mtx =
-            create_matrix(exec, size, array<value_type>{exec, u_nnz},
-                          array<index_type>{exec, u_nnz}, std::move(u_row_ptrs),
-                          upper_factor_strategy);
+        auto l_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, l_nnz},
+            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+        auto u_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, u_nnz},
+            array<index_type>{exec, u_nnz}, std::move(u_row_ptrs));
         // fill matrices
         exec->run(make_initialize_l_u(mtx.get(), l_mtx.get(), u_mtx.get()));
         return create_from_composition(
@@ -89,10 +72,9 @@ Factorization<ValueType, IndexType>::unpack(
         const auto l_nnz =
             static_cast<size_type>(get_element(l_row_ptrs, size[0]));
         // create matrices
-        auto l_mtx =
-            create_matrix(exec, size, array<value_type>{exec, l_nnz},
-                          array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
-                          lower_factor_strategy);
+        auto l_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, l_nnz},
+            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
         // fill matrices
         exec->run(make_initialize_l(mtx.get(), l_mtx.get(), false));
         auto u_mtx = l_mtx->conj_transpose();
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 32812f699c1..dccfabcb51a 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -80,6 +80,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     // Converts the system matrix to CSR.
     // Throws an exception if it is not convertible.
     auto local_system_matrix = share(matrix_type::create(exec));
+    std::shared_ptr<const matrix_type> ilu;
     as<ConvertibleTo<matrix_type>>(system_matrix.get())
         ->convert_to(local_system_matrix);
 
@@ -103,26 +104,25 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
                 make_const_array_view(
                     exec, local_system_matrix->get_size()[0] + 1,
                     local_system_matrix->get_const_row_ptrs())));
-        auto unpack =
+        ilu =
             gko::experimental::factorization::Lu<ValueType, IndexType>::build()
                 .with_has_all_fillin(false)
                 .with_symbolic_factorization(sparsity)
                 .on(exec)
                 ->generate(local_system_matrix)
-                ->unpack(parameters_.l_strategy, parameters_.u_strategy);
-        return Composition<ValueType>::create(unpack->get_lower_factor(),
-                                              unpack->get_upper_factor());
+                ->get_combined();
+    } else {
+        exec->run(
+            ilu_factorization::make_compute_ilu(local_system_matrix.get()));
+        ilu = local_system_matrix;
     }
-    exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get()));
-
     // Separate L and U factors: nnz
-    const auto matrix_size = local_system_matrix->get_size();
+    const auto matrix_size = ilu->get_size();
     const auto num_rows = matrix_size[0];
     array<IndexType> l_row_ptrs{exec, num_rows + 1};
     array<IndexType> u_row_ptrs{exec, num_rows + 1};
     exec->run(ilu_factorization::make_initialize_row_ptrs_l_u(
-        local_system_matrix.get(), l_row_ptrs.get_data(),
-        u_row_ptrs.get_data()));
+        ilu.get(), l_row_ptrs.get_data(), u_row_ptrs.get_data()));
 
     // Get nnz from device memory
     auto l_nnz = static_cast<size_type>(get_element(l_row_ptrs, num_rows));
@@ -141,8 +141,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         std::move(u_row_ptrs), parameters_.u_strategy);
 
     // Separate L and U: columns and values
-    exec->run(ilu_factorization::make_initialize_l_u(
-        local_system_matrix.get(), l_factor.get(), u_factor.get()));
+    exec->run(ilu_factorization::make_initialize_l_u(ilu.get(), l_factor.get(),
+                                                     u_factor.get()));
 
     return Composition<ValueType>::create(std::move(l_factor),
                                           std::move(u_factor));
diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp
index 5b203fb7e09..39345f59a44 100644
--- a/include/ginkgo/core/factorization/factorization.hpp
+++ b/include/ginkgo/core/factorization/factorization.hpp
@@ -88,21 +88,10 @@ class Factorization : public EnableLinOp<Factorization<ValueType, IndexType>> {
      * for triangular solves to a composition representation that can also be
      * used to access individual factors and multiply with the factorization.
      *
-     * @param lower_factor_strategy  the Csr strategy for the lower factor and
-     *                               the transposed lower factor.
-     * @param upper_factor_strategy  the Csr strategy for the upper factor
-     *
      * @return  a new Factorization object containing this factorization
      *          represented as storage_type::composition.
-     *
-     * @note The strategy only has effect when it is unpacked from the combined
-     * matrix.
      */
-    std::unique_ptr<Factorization> unpack(
-        std::shared_ptr<typename matrix_type::strategy_type>
-            lower_factor_strategy = nullptr,
-        std::shared_ptr<typename matrix_type::strategy_type>
-            upper_factor_strategy = nullptr) const;
+    std::unique_ptr<Factorization> unpack() const;
 
     /** Returns the storage type used by this factorization. */
     storage_type get_storage_type() const;
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index 402307e452a..2ded81d4867 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -252,31 +252,6 @@ TYPED_TEST(Factorization, UnpackCombinedLUWorks)
 }
 
 
-TYPED_TEST(Factorization, UnpackCombinedLUWorksWithStrategy)
-{
-    using factorization_type = typename TestFixture::factorization_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    auto fact = factorization_type::create_from_combined_lu(
-        this->combined_mtx->clone());
-
-    auto separated =
-        fact->unpack(std::make_shared<typename matrix_type::classical>(),
-                     std::make_shared<typename matrix_type::merge_path>());
-
-    ASSERT_EQ(separated->get_storage_type(),
-              gko::experimental::factorization::storage_type::composition);
-    ASSERT_EQ(separated->get_combined(), nullptr);
-    ASSERT_EQ(separated->get_diagonal(), nullptr);
-    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0);
-    GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx,
-                        0.0);
-    ASSERT_EQ(separated->get_lower_factor()->get_strategy()->get_name(),
-              "classical");
-    ASSERT_EQ(separated->get_upper_factor()->get_strategy()->get_name(),
-              "merge_path");
-}
-
-
 TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks)
 {
     using matrix_type = typename TestFixture::matrix_type;
@@ -298,35 +273,6 @@ TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks)
 }
 
 
-TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorksWithStrategy)
-{
-    using matrix_type = typename TestFixture::matrix_type;
-    using factorization_type = typename TestFixture::factorization_type;
-    auto fact = factorization_type::create_from_combined_cholesky(
-        this->combined_mtx->clone());
-
-    // second one is ignored in cholesky to keep the same behavior as
-    // factorization::Ic
-    auto separated =
-        fact->unpack(std::make_shared<typename matrix_type::classical>(),
-                     std::make_shared<typename matrix_type::merge_path>());
-
-    ASSERT_EQ(separated->get_storage_type(),
-              gko::experimental::factorization::storage_type::symm_composition);
-    ASSERT_EQ(separated->get_combined(), nullptr);
-    ASSERT_EQ(separated->get_diagonal(), nullptr);
-    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_cholesky_mtx,
-                        0.0);
-    GKO_ASSERT_MTX_NEAR(
-        separated->get_upper_factor(),
-        gko::as<matrix_type>(this->lower_cholesky_mtx->conj_transpose()), 0.0);
-    ASSERT_EQ(separated->get_lower_factor()->get_strategy()->get_name(),
-              "classical");
-    ASSERT_EQ(separated->get_upper_factor()->get_strategy()->get_name(),
-              "classical");
-}
-
-
 TYPED_TEST(Factorization, UnpackCompositionWorks)
 {
     using factorization_type = typename TestFixture::factorization_type;

From 7b0b8e856e333e577c579dfe7606f29868717729 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 6 Nov 2024 11:00:17 +0100
Subject: [PATCH 332/448] update documentation

---
 common/cuda_hip/factorization/lu_kernels.cpp |  8 ++++----
 core/factorization/ilu.cpp                   |  2 +-
 core/factorization/lu.cpp                    |  6 +++---
 core/factorization/lu_kernels.hpp            |  2 +-
 core/test/config/factorization.cpp           |  6 +++---
 dpcpp/factorization/lu_kernels.dp.cpp        |  2 +-
 include/ginkgo/core/factorization/lu.hpp     | 17 ++++++++++-------
 omp/factorization/lu_kernels.cpp             |  4 ++--
 reference/factorization/lu_kernels.cpp       |  4 ++--
 reference/test/factorization/lu_kernels.cpp  |  4 ++--
 test/factorization/lu_kernels.cpp            |  8 ++++----
 11 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index 8c69f1f071d..b6c2207f4c3 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize(
 }
 
 
-template <bool has_all_fillin, typename ValueType, typename IndexType>
+template <bool full_fillin, typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void factorize(
     const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
     const IndexType* __restrict__ storage_offsets,
@@ -130,7 +130,7 @@ __global__ __launch_bounds__(default_block_size) void factorize(
              upper_nz += config::warp_size) {
             const auto upper_col = cols[upper_nz];
             const auto upper_val = vals[upper_nz];
-            if (!has_all_fillin) {
+            if (!full_fillin) {
                 const auto pos = lookup[upper_col];
                 if (pos != invalid_index<IndexType>()) {
                     vals[row_begin + pos] -= scale * upper_val;
@@ -260,7 +260,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -268,7 +268,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         syncfree_storage storage(exec, tmp_storage, num_rows);
         const auto num_blocks =
             ceildiv(num_rows, default_block_size / config::warp_size);
-        if (!has_all_fillin) {
+        if (!full_fillin) {
             kernel::factorize<false>
                 <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                     factors->get_const_row_ptrs(),
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index dccfabcb51a..b9f4e2057c0 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -106,7 +106,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
                     local_system_matrix->get_const_row_ptrs())));
         ilu =
             gko::experimental::factorization::Lu<ValueType, IndexType>::build()
-                .with_has_all_fillin(false)
+                .with_full_fillin(false)
                 .with_symbolic_factorization(sparsity)
                 .on(exec)
                 ->generate(local_system_matrix)
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index e997b053947..c2152da2735 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -69,8 +69,8 @@ Lu<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
-    if (auto& obj = config.get("has_all_fillin")) {
-        params.with_has_all_fillin(config::get_value<bool>(obj));
+    if (auto& obj = config.get("full_fillin")) {
+        params.with_full_fillin(config::get_value<bool>(obj));
     }
 
     return params;
@@ -162,7 +162,7 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
     exec->run(make_factorize(
         storage_offsets.get_const_data(), row_descs.get_const_data(),
         storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
-        parameters_.has_all_fillin, tmp));
+        parameters_.full_fillin, tmp));
     return factorization_type::create_from_combined_lu(std::move(factors));
 }
 
diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp
index b2d398985fd..ebc521ef2b5 100644
--- a/core/factorization/lu_kernels.hpp
+++ b/core/factorization/lu_kernels.hpp
@@ -33,7 +33,7 @@ namespace kernels {
                    const IndexType* lookup_offsets, const int64* lookup_descs, \
                    const int32* lookup_storage, const IndexType* diag_idxs,    \
                    matrix::Csr<ValueType, IndexType>* factors,                 \
-                   bool has_all_fillin, array<int>& tmp_storage)
+                   bool full_fillin, array<int>& tmp_storage)
 
 
 #define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType)                  \
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 8fc8d138f28..42ae70f8474 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -181,8 +181,8 @@ struct Lu : FactorizationConfigTest<
             gko::experimental::factorization::symbolic_type::near_symmetric);
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
-        config_map["has_all_fillin"] = pnode{false};
-        param.with_has_all_fillin(false);
+        config_map["full_fillin"] = pnode{false};
+        param.with_full_fillin(false);
     }
 
     template <typename AnswerType>
@@ -195,7 +195,7 @@ struct Lu : FactorizationConfigTest<
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.symbolic_algorithm, ans_param.symbolic_algorithm);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
-        ASSERT_EQ(res_param.has_all_fillin, ans_param.has_all_fillin);
+        ASSERT_EQ(res_param.full_fillin, ans_param.full_fillin);
     }
 };
 
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index 9fdf1165043..bd26b1f79ca 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -39,7 +39,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index e803d8d1b03..3704719f32a 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -99,14 +99,17 @@ class Lu
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
 
         /**
-         * The symbolic factorization contains the fill-in for the matrix. If it
-         * does not have full fill-in, as in Ilu, this parameter must be set to
-         * false in order to avoid the possibility of hanging or illegal memory
-         * accesses during the factorization process. When this is true, the
-         * symbolic factorization must still contain the non-zero locations in
-         * the original matrix, at minimum.
+         * Indicates whether a user-provided symbolic factorization contains
+         * the full fill-in of the matrix. When this is true, the symbolic
+         * factorization must contain the non-zero locations of the original
+         * matrix and all fill-in locations arising during the factorization.
+         * If it does not have full fill-in, as in Ilu, this parameter must be
+         * set to false in order to avoid the possibility of hanging or illegal
+         * memory accesses during the factorization process. Even then, the
+         * symbolic factorization still needs to contain all entries from the
+         * original matrix.
          */
-        bool GKO_FACTORY_PARAMETER_SCALAR(has_all_fillin, true);
+        bool GKO_FACTORY_PARAMETER_SCALAR(full_fillin, true);
     };
 
     /**
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index c18bda37cec..de6876f1487 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -66,7 +66,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -89,7 +89,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (!has_all_fillin) {
+                if (!full_fillin) {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index 5bf159a6de5..33fb2f94c4b 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -65,7 +65,7 @@ template <typename ValueType, typename IndexType>
 void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const IndexType* lookup_offsets, const int64* lookup_descs,
                const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -87,7 +87,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (!has_all_fillin) {
+                if (!full_fillin) {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index cfc86eba6d3..6f4f7a1d088 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -356,7 +356,7 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
@@ -390,7 +390,7 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 7ad036df85e..fab1456badc 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -374,12 +374,12 @@ TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);
@@ -417,12 +417,12 @@ TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
     auto factory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(sparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Lu<value_type, index_type>::build()
             .with_symbolic_factorization(dsparsity)
-            .with_has_all_fillin(false)
+            .with_full_fillin(false)
             .on(this->exec);
 
     auto lu = factory->generate(mtx);

From 94777bee14f1af1d32a1a9f6e095a76973f757d5 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 6 Nov 2024 15:56:45 +0100
Subject: [PATCH 333/448] cholesky failed test

---
 .../test/factorization/cholesky_kernels.cpp   | 84 +++++++++++++++++
 test/factorization/cholesky_kernels.cpp       | 89 +++++++++++++++++++
 2 files changed, 173 insertions(+)

diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index d63e491e26a..1fd16bb631c 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -492,4 +492,88 @@ TYPED_TEST(Cholesky, FactorizeWithKnownSparsityWorks)
 }
 
 
+TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
+    // pattern: diagonal + full first row and column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                        {1.0, 2.0, 0.0, 0.0},
+                                                        {1.0, 0.0, 2.0, 0.0},
+                                                        {1.0, 0.0, 0.0, 2.0}},
+                                                       this->ref));
+    auto sparsity = gko::share(sparsity_matrix_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto result = gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 1.0, 0.0, 0.0},
+                                                {1.0, 0.0, 1.0, 0.0},
+                                                {1.0, 0.0, 0.0, 1.0}},
+                                               this->ref);
+    auto factory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+
+    auto cholesky = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(), mtx);
+    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), result, r<value_type>::value);
+}
+
+
+TYPED_TEST(Cholesky, GenerateIcWithHashmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
+    int n = 68;
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    gko::matrix_data<value_type, index_type> result(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        if (i == n - 2 || i == n - 3) {
+            data.nonzeros.emplace_back(i, i, value_type{2});
+        } else {
+            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        }
+        result.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // the following rows use a hashmap for the lookup table
+    // add a dependency between row 0 and row n - 3
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
+    // add an entry whose col idx does not appear in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(matrix_type::create(this->ref));
+    mtx->read(data);
+    // prepare the expected result
+    result.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    result.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
+    result.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    result.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    result.sort_row_major();
+    auto result_mtx = gko::share(matrix_type::create(this->ref));
+    result_mtx->read(result);
+    auto sparsity = gko::share(sparsity_matrix_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto factory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+
+    auto cholesky = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(), result_mtx);
+    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), result_mtx,
+                        r<value_type>::value);
+}
+
+
 }  // namespace
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index aff2abed6c5..c3cb8421e45 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -480,4 +480,93 @@ TYPED_TEST(Cholesky, GenerateWithKnownSparsityIsEquivalentToRef)
 }
 
 
+TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    // pattern: diagonal + full first row and column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
+                                                        {1.0, 2.0, 0.0, 0.0},
+                                                        {1.0, 0.0, 2.0, 0.0},
+                                                        {1.0, 0.0, 0.0, 2.0}},
+                                                       this->ref));
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto dsparsity = gko::share(sparsity->clone(this->exec));
+
+    auto factory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+    auto dfactory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(dsparsity)
+            .on(this->exec);
+
+    auto cholesky = factory->generate(mtx);
+    auto dcholesky = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(),
+                               dcholesky->get_combined());
+    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), dcholesky->get_combined(),
+                        r<value_type>::value);
+}
+
+
+TYPED_TEST(Cholesky, GenerateIcWithHashmapIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
+    int n = 68;
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        if (i == n - 2 || i == n - 3) {
+            data.nonzeros.emplace_back(i, i, value_type{2});
+        } else {
+            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        }
+    }
+    // the following rows use a hashmap for the lookup table
+    // add a dependency between row 0 and row n - 3
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
+    // add an entry whose col idx does not appear in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(matrix_type::create(this->ref));
+    mtx->read(data);
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
+    mtx->convert_to(sparsity);
+    auto dsparsity = gko::share(sparsity->clone(this->exec));
+    auto factory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(sparsity)
+            .on(this->ref);
+    auto dfactory =
+        gko::experimental::factorization::Cholesky<value_type,
+                                                   index_type>::build()
+            .with_symbolic_factorization(dsparsity)
+            .on(this->exec);
+
+    auto cholesky = factory->generate(mtx);
+    auto dcholesky = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(),
+                               dcholesky->get_combined());
+    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), dcholesky->get_combined(),
+                        r<value_type>::value);
+}
+
+
 }  // namespace

From d9ad2d58c627ad863c88f2c74b1049aa01084da3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 6 Nov 2024 16:06:23 +0100
Subject: [PATCH 334/448] move the algorithm enum to another header

---
 include/ginkgo/core/factorization/ilu.hpp     |  9 +------
 .../incompleted_factorization.hpp             | 25 +++++++++++++++++++
 include/ginkgo/ginkgo.hpp                     |  1 +
 3 files changed, 27 insertions(+), 8 deletions(-)
 create mode 100644 include/ginkgo/core/factorization/incompleted_factorization.hpp

diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index d527d8c9912..a04da70989c 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -13,6 +13,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/factorization/incompleted_factorization.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
@@ -25,14 +26,6 @@ namespace gko {
 namespace factorization {
 
 
-/**
- * A helper for algorithm selection in the incomplete factorization.
- * sparselib is only available for CUDA and HIP.
- * syncfree is Ginkgo's implementation through the Lu factorization with given
- * sparsity.
- */
-enum class factorize_algorithm { sparselib, syncfree };
-
 /**
  * Represents an incomplete LU factorization -- ILU(0) -- of a sparse matrix.
  *
diff --git a/include/ginkgo/core/factorization/incompleted_factorization.hpp b/include/ginkgo/core/factorization/incompleted_factorization.hpp
new file mode 100644
index 00000000000..9f712f56e23
--- /dev/null
+++ b/include/ginkgo/core/factorization/incompleted_factorization.hpp
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
+#define GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
+
+
+namespace gko {
+namespace factorization {
+
+
+/**
+ * An enum class for algorithm selection in the incomplete factorization.
+ * `sparselib` is only available for CUDA and HIP.
+ * `syncfree` is Ginkgo's implementation through the Lu/Cholesky factorization
+ * with given sparsity.
+ */
+enum class factorize_algorithm { sparselib, syncfree };
+
+
+}  // namespace factorization
+}  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index c2eb2b4a134..d1cb0248b08 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -76,6 +76,7 @@
 #include <ginkgo/core/factorization/factorization.hpp>
 #include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/factorization/ilu.hpp>
+#include <ginkgo/core/factorization/incompleted_factorization.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/factorization/par_ict.hpp>

From 7fe6ed8bf93f5f7ad0bdd3b95b38d61e8f5274cf Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 6 Nov 2024 16:54:29 +0100
Subject: [PATCH 335/448] cholesky with safe lookup, wrap it into Ic, and add
 some missing tests

---
 .../factorization/cholesky_kernels.cpp        | 40 +++++++++++-----
 core/factorization/cholesky.cpp               |  6 ++-
 core/factorization/cholesky_kernels.hpp       |  3 +-
 core/factorization/ic.cpp                     | 48 ++++++++++++++++---
 core/factorization/ilu.cpp                    |  4 +-
 core/test/config/factorization.cpp            |  6 +++
 dpcpp/factorization/cholesky_kernels.dp.cpp   |  2 +-
 .../ginkgo/core/factorization/cholesky.hpp    | 11 +++++
 include/ginkgo/core/factorization/ic.hpp      | 10 ++++
 include/ginkgo/core/factorization/ilu.hpp     |  2 +-
 omp/factorization/cholesky_kernels.cpp        | 13 +++--
 reference/factorization/cholesky_kernels.cpp  | 13 +++--
 .../test/factorization/cholesky_kernels.cpp   |  4 +-
 reference/test/factorization/ic_kernels.cpp   | 18 +++++++
 reference/test/factorization/ilu_kernels.cpp  | 17 +++++++
 test/factorization/CMakeLists.txt             |  2 +-
 test/factorization/cholesky_kernels.cpp       |  8 +++-
 test/factorization/ic_kernels.cpp             | 20 ++++++++
 18 files changed, 195 insertions(+), 32 deletions(-)

diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
index e5f2bf5e5e5..87d69a5db83 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.cpp
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -46,6 +46,8 @@ constexpr int default_block_size = 512;
 
 
 #include "core/factorization/elimination_forest.hpp"
+
+
 namespace kernel {
 
 
@@ -161,7 +163,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(
 }
 
 
-template <typename ValueType, typename IndexType>
+template <bool full_fillin, typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void factorize(
     const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
     const IndexType* __restrict__ storage_offsets,
@@ -200,9 +202,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
             const auto upper_col = cols[upper_nz];
             if (upper_col >= row) {
                 const auto upper_val = vals[upper_nz];
-                const auto output_pos =
-                    lookup.lookup_unsafe(upper_col) + row_begin;
-                vals[output_pos] -= scale * upper_val;
+                if (!full_fillin) {
+                    const auto pos = lookup[upper_col];
+                    if (pos != invalid_index<IndexType>()) {
+                        vals[row_begin + pos] -= scale * upper_val;
+                    }
+                } else {
+                    const auto output_pos =
+                        lookup.lookup_unsafe(upper_col) + row_begin;
+                    vals[output_pos] -= scale * upper_val;
+                }
             }
         }
     }
@@ -355,7 +364,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const int32* lookup_storage, const IndexType* diag_idxs,
                const IndexType* transpose_idxs,
                const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -363,12 +372,21 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         syncfree_storage storage(exec, tmp_storage, num_rows);
         const auto num_blocks =
             ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::factorize<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
-            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
-            lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
-            transpose_idxs, as_device_type(factors->get_values()), storage,
-            num_rows);
+        if (!full_fillin) {
+            kernel::factorize<false>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    factors->get_const_row_ptrs(),
+                    factors->get_const_col_idxs(), lookup_offsets,
+                    lookup_storage, lookup_descs, diag_idxs, transpose_idxs,
+                    as_device_type(factors->get_values()), storage, num_rows);
+        } else {
+            kernel::factorize<true>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    factors->get_const_row_ptrs(),
+                    factors->get_const_col_idxs(), lookup_offsets,
+                    lookup_storage, lookup_descs, diag_idxs, transpose_idxs,
+                    as_device_type(factors->get_values()), storage, num_rows);
+        }
     }
 }
 
diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index 81627ad229b..7366abda86d 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -51,6 +51,9 @@ Cholesky<ValueType, IndexType>::parse(
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
+    if (auto& obj = config.get("full_fillin")) {
+        params.with_full_fillin(config::get_value<bool>(obj));
+    }
 
     return params;
 }
@@ -137,7 +140,8 @@ std::unique_ptr<LinOp> Cholesky<ValueType, IndexType>::generate_impl(
     exec->run(make_factorize(
         storage_offsets.get_const_data(), row_descs.get_const_data(),
         storage.get_const_data(), diag_idxs.get_const_data(),
-        transpose_idxs.get_const_data(), *forest, factors.get(), tmp));
+        transpose_idxs.get_const_data(), *forest, factors.get(),
+        parameters_.full_fillin, tmp));
     return factorization_type::create_from_combined_cholesky(
         std::move(factors));
 }
diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp
index db889ce1162..8230fa4552d 100644
--- a/core/factorization/cholesky_kernels.hpp
+++ b/core/factorization/cholesky_kernels.hpp
@@ -61,7 +61,8 @@ namespace kernels {
         const int32* lookup_storage, const IndexType* diag_idxs,         \
         const IndexType* transpose_idxs,                                 \
         const gko::factorization::elimination_forest<IndexType>& forest, \
-        matrix::Csr<ValueType, IndexType>* factors, array<int>& tmp_storage)
+        matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,    \
+        array<int>& tmp_storage)
 
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                               \
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 2257e6256e4..9c036b08dec 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/factorization/cholesky.hpp>
 
 #include "core/base/array_access.hpp"
 #include "core/config/config_helper.hpp"
@@ -52,6 +53,17 @@ Ic<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("both_factors")) {
         params.with_both_factors(config::get_value<bool>(obj));
     }
+    if (auto& obj = config.get("algorithm")) {
+        using gko::factorization::factorize_algorithm;
+        auto str = obj.get_string();
+        if (str == "sparselib") {
+            params.with_algorithm(factorize_algorithm::sparselib);
+        } else if (str == "syncfree") {
+            params.with_algorithm(factorize_algorithm::syncfree);
+        } else {
+            GKO_INVALID_CONFIG_VALUE("algorithm", str);
+        }
+    }
     return params;
 }
 
@@ -67,7 +79,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
 
     // Converts the system matrix to CSR.
     // Throws an exception if it is not convertible.
-    auto local_system_matrix = matrix_type::create(exec);
+    auto local_system_matrix = share(matrix_type::create(exec));
     as<ConvertibleTo<matrix_type>>(system_matrix.get())
         ->convert_to(local_system_matrix);
 
@@ -79,15 +91,39 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
     exec->run(ic_factorization::make_add_diagonal_elements(
         local_system_matrix.get(), false));
 
+    std::shared_ptr<const matrix_type> ic;
     // Compute LC factorization
-    exec->run(ic_factorization::make_compute(local_system_matrix.get()));
+    if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
+        parameters_.algorithm == factorize_algorithm::syncfree) {
+        auto sparsity =
+            share(gko::matrix::SparsityCsr<ValueType, IndexType>::create_const(
+                exec, local_system_matrix->get_size(),
+                make_const_array_view(
+                    exec, local_system_matrix->get_num_stored_elements(),
+                    local_system_matrix->get_const_col_idxs()),
+                make_const_array_view(
+                    exec, local_system_matrix->get_size()[0] + 1,
+                    local_system_matrix->get_const_row_ptrs())));
+        ic = gko::experimental::factorization::Cholesky<ValueType,
+                                                        IndexType>::build()
+                 .with_full_fillin(false)
+                 .with_symbolic_factorization(sparsity)
+                 .with_skip_sorting(
+                     true)  // we have decided sort or not in earlir stage
+                 .on(exec)
+                 ->generate(local_system_matrix)
+                 ->get_combined();
+    } else {
+        exec->run(ic_factorization::make_compute(local_system_matrix.get()));
+        ic = local_system_matrix;
+    }
 
     // Extract lower factor: compute non-zeros
-    const auto matrix_size = local_system_matrix->get_size();
+    const auto matrix_size = ic->get_size();
     const auto num_rows = matrix_size[0];
     array<IndexType> l_row_ptrs{exec, num_rows + 1};
     exec->run(ic_factorization::make_initialize_row_ptrs_l(
-        local_system_matrix.get(), l_row_ptrs.get_data()));
+        ic.get(), l_row_ptrs.get_data()));
 
     // Get nnz from device memory
     auto l_nnz = static_cast<size_type>(get_element(l_row_ptrs, num_rows));
@@ -100,8 +136,8 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
         std::move(l_row_ptrs), parameters_.l_strategy);
 
     // Extract lower factor: columns and values
-    exec->run(ic_factorization::make_initialize_l(local_system_matrix.get(),
-                                                  l_factor.get(), false));
+    exec->run(
+        ic_factorization::make_initialize_l(ic.get(), l_factor.get(), false));
 
     if (both_factors) {
         auto lh_factor = l_factor->conj_transpose();
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index b9f4e2057c0..0969c6a5bb7 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -80,7 +80,6 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     // Converts the system matrix to CSR.
     // Throws an exception if it is not convertible.
     auto local_system_matrix = share(matrix_type::create(exec));
-    std::shared_ptr<const matrix_type> ilu;
     as<ConvertibleTo<matrix_type>>(system_matrix.get())
         ->convert_to(local_system_matrix);
 
@@ -92,6 +91,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     exec->run(ilu_factorization::make_add_diagonal_elements(
         local_system_matrix.get(), false));
 
+    std::shared_ptr<const matrix_type> ilu;
     // Compute LU factorization
     if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
         parameters_.algorithm == factorize_algorithm::syncfree) {
@@ -108,6 +108,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
             gko::experimental::factorization::Lu<ValueType, IndexType>::build()
                 .with_full_fillin(false)
                 .with_symbolic_factorization(sparsity)
+                .with_skip_sorting(
+                    true)  // we have decided sort or not in earlir stage
                 .on(exec)
                 ->generate(local_system_matrix)
                 ->get_combined();
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 42ae70f8474..8c1fefb3d95 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -75,6 +75,8 @@ struct Ic : FactorizationConfigTest<gko::factorization::Ic<float, int>,
         param.with_skip_sorting(true);
         config_map["both_factors"] = pnode{false};
         param.with_both_factors(false);
+        config_map["algorithm"] = pnode{"syncfree"};
+        param.with_algorithm(gko::factorization::factorize_algorithm::syncfree);
     }
 
     template <typename AnswerType>
@@ -86,6 +88,7 @@ struct Ic : FactorizationConfigTest<gko::factorization::Ic<float, int>,
         check_strategy(res_param.l_strategy, ans_param.l_strategy);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
         ASSERT_EQ(res_param.both_factors, ans_param.both_factors);
+        ASSERT_EQ(res_param.algorithm, ans_param.algorithm);
     }
 };
 
@@ -146,6 +149,8 @@ struct Cholesky : FactorizationConfigTest<
             detail::registry_accessor::get_data<Sparsity>(reg, "sparsity"));
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
+        config_map["full_fillin"] = pnode{false};
+        param.with_full_fillin(false);
     }
 
     template <typename AnswerType>
@@ -157,6 +162,7 @@ struct Cholesky : FactorizationConfigTest<
         ASSERT_EQ(res_param.symbolic_factorization,
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
+        ASSERT_EQ(res_param.full_fillin, ans_param.full_fillin);
     }
 };
 
diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp
index b381e6989e4..e13810deb74 100644
--- a/dpcpp/factorization/cholesky_kernels.dp.cpp
+++ b/dpcpp/factorization/cholesky_kernels.dp.cpp
@@ -170,7 +170,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const int32* lookup_storage, const IndexType* diag_idxs,
                const IndexType* transpose_idxs,
                const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp
index 0b3a7fb0caf..a984738cdfa 100644
--- a/include/ginkgo/core/factorization/cholesky.hpp
+++ b/include/ginkgo/core/factorization/cholesky.hpp
@@ -67,6 +67,17 @@ class Cholesky
          * incorrect results or crash.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
+
+        /**
+         * If the user provides the symbolic factorization, it should contain
+         * the fill-in for the matrix. i.e., When this is true, the symbolic
+         * factorization must contain the non-zero locations in the original
+         * matrix and the corresponding fill-in locations during factorization.
+         * If it does not have full fill-in, as in Ilu, this parameter must be
+         * set to false in order to avoid the possibility of hanging or illegal
+         * memory accesses during the factorization process.
+         */
+        bool GKO_FACTORY_PARAMETER_SCALAR(full_fillin, true);
     };
 
     /**
diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp
index 616360ce039..cb638709864 100644
--- a/include/ginkgo/core/factorization/ic.hpp
+++ b/include/ginkgo/core/factorization/ic.hpp
@@ -13,6 +13,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/factorization/incompleted_factorization.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
@@ -99,6 +100,15 @@ class Ic : public Composition<ValueType> {
          * be used to avoid the transposition operation.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(both_factors, true);
+
+        /**
+         * Select the implementation which is supposed to be used for
+         * the incomplete factorization. This only matters for the CUDA and HIP
+         * executor where the choice is between the Ginkgo (syncfree) and the
+         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
+         */
+        factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, factorize_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index a04da70989c..2abec999604 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -100,7 +100,7 @@ class Ilu : public Composition<ValueType> {
          * Select the implementation which is supposed to be used for
          * the incomplete factorization. This only matters for the CUDA and HIP
          * executor where the choice is between the Ginkgo (syncfree) and the
-         * cuSPARSE/HIPSPARSE (sparselib) implementation. Default is sparselib.
+         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
          */
         factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
             algorithm, factorize_algorithm::sparselib);
diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp
index 8ce5392ebde..9c39085c98f 100644
--- a/omp/factorization/cholesky_kernels.cpp
+++ b/omp/factorization/cholesky_kernels.cpp
@@ -210,7 +210,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const int32* lookup_storage, const IndexType* diag_idxs,
                const IndexType* transpose_idxs,
                const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -233,8 +233,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                 const auto col = cols[dep_nz];
                 if (col < row) {
                     const auto val = vals[dep_nz];
-                    const auto nz = row_begin + lookup.lookup_unsafe(col);
-                    vals[nz] -= scale * val;
+                    if (!full_fillin) {
+                        const auto idx = lookup[col];
+                        if (idx != invalid_index<IndexType>()) {
+                            vals[row_begin + idx] -= scale * val;
+                        }
+                    } else {
+                        const auto nz = row_begin + lookup.lookup_unsafe(col);
+                        vals[nz] -= scale * val;
+                    }
                 }
             }
         }
diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp
index 2aeee99d45d..882d10ebd72 100644
--- a/reference/factorization/cholesky_kernels.cpp
+++ b/reference/factorization/cholesky_kernels.cpp
@@ -181,7 +181,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                const int32* lookup_storage, const IndexType* diag_idxs,
                const IndexType* transpose_idxs,
                const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
@@ -204,8 +204,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                 const auto col = cols[dep_nz];
                 if (col < row) {
                     const auto val = vals[dep_nz];
-                    const auto nz = row_begin + lookup.lookup_unsafe(col);
-                    vals[nz] -= scale * val;
+                    if (!full_fillin) {
+                        const auto idx = lookup[col];
+                        if (idx != invalid_index<IndexType>()) {
+                            vals[row_begin + idx] -= scale * val;
+                        }
+                    } else {
+                        const auto nz = row_begin + lookup.lookup_unsafe(col);
+                        vals[nz] -= scale * val;
+                    }
                 }
             }
         }
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index 1fd16bb631c..86a2254a70b 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -426,7 +426,7 @@ TYPED_TEST(Cholesky, KernelFactorizeWorks)
                 this->row_descs.get_const_data(),
                 this->storage.get_const_data(), diag_idxs.get_data(),
                 transpose_idxs.get_data(), *this->forest, this->combined.get(),
-                tmp);
+                true, tmp);
 
             GKO_ASSERT_MTX_NEAR(this->combined, this->combined_ref,
                                 r<value_type>::value);
@@ -516,6 +516,7 @@ TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_full_fillin(false)
             .on(this->ref);
 
     auto cholesky = factory->generate(mtx);
@@ -566,6 +567,7 @@ TYPED_TEST(Cholesky, GenerateIcWithHashmapIsEquivalentToRef)
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_full_fillin(false)
             .on(this->ref);
 
     auto cholesky = factory->generate(mtx);
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index cdcb6b12bc8..2d69d01592b 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -188,4 +188,22 @@ TYPED_TEST(Ic, GenerateGeneral)
 }
 
 
+TYPED_TEST(Ic, GenerateGeneralBySyncfree)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using Csr = typename TestFixture::Csr;
+
+    auto fact =
+        factorization_type::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec)
+            ->generate(this->mtx_system);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_it_expect, this->tol);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(),
+                        gko::as<Csr>(this->mtx_l_it_expect->conj_transpose()),
+                        this->tol);
+}
+
+
 }  // namespace
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index c750ca93fc8..977fad42dcc 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -351,6 +351,23 @@ TYPED_TEST(Ilu, GenerateForCsrSmall)
 }
 
 
+TYPED_TEST(Ilu, GenerateForCsrSmallBySyncfree)
+{
+    using value_type = typename TestFixture::value_type;
+    using ilu_type = typename TestFixture::ilu_type;
+    auto factors =
+        ilu_type::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec)
+            ->generate(this->mtx_csr_small);
+    auto l_factor = factors->get_l_factor();
+    auto u_factor = factors->get_u_factor();
+
+    GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r<value_type>::value);
+}
+
+
 TYPED_TEST(Ilu, GenerateForCsrSmall2ZeroDiagonal)
 {
     using value_type = typename TestFixture::value_type;
diff --git a/test/factorization/CMakeLists.txt b/test/factorization/CMakeLists.txt
index 5f0bc8b7f30..3791e610da0 100644
--- a/test/factorization/CMakeLists.txt
+++ b/test/factorization/CMakeLists.txt
@@ -1,7 +1,7 @@
 ginkgo_create_common_test(cholesky_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(factorization_kernels)
 ginkgo_create_common_test(lu_kernels DISABLE_EXECUTORS dpcpp)
-ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp omp)
+ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(par_ic_kernels)
 ginkgo_create_common_test(par_ict_kernels)
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index c3cb8421e45..bf0be37d42a 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -415,12 +415,12 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef)
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_const_data(), transpose_idxs.get_const_data(),
-            *this->forest, this->mtx_chol.get(), tmp);
+            *this->forest, this->mtx_chol.get(), true, tmp);
         gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
             ddiag_idxs.get_const_data(), dtranspose_idxs.get_const_data(),
-            *this->dforest, this->dmtx_chol.get(), dtmp);
+            *this->dforest, this->dmtx_chol.get(), true, dtmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_chol, this->dmtx_chol,
                             r<value_type>::value);
@@ -502,11 +502,13 @@ TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_full_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(dsparsity)
+            .with_full_fillin(false)
             .on(this->exec);
 
     auto cholesky = factory->generate(mtx);
@@ -552,11 +554,13 @@ TYPED_TEST(Cholesky, GenerateIluWithHashmapIsEquivalentToRef)
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(sparsity)
+            .with_full_fillin(false)
             .on(this->ref);
     auto dfactory =
         gko::experimental::factorization::Cholesky<value_type,
                                                    index_type>::build()
             .with_symbolic_factorization(dsparsity)
+            .with_full_fillin(false)
             .on(this->exec);
 
     auto cholesky = factory->generate(mtx);
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index 9f0b60443f2..a93d14778ad 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -55,6 +55,26 @@ TEST_F(Ic, ComputeICIsEquivalentToRefSorted)
 }
 
 
+TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
+{
+    auto fact = gko::factorization::Ic<>::build()
+                    .with_skip_sorting(true)
+                    .on(ref)
+                    ->generate(mtx);
+    auto dfact =
+        gko::factorization::Ic<>::build()
+            .with_skip_sorting(true)
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(exec)
+            ->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), dfact->get_lt_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_lt_factor(), dfact->get_lt_factor());
+}
+
+
 TEST_F(Ic, ComputeICIsEquivalentToRefUnsorted)
 {
     gko::test::unsort_matrix(mtx, rand_engine);

From a362d46ce6a527f13c9aa85e3760d8cbcb31d80e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 19:12:02 +0100
Subject: [PATCH 336/448] copy the call from lu/cholesky to ilu/ic and delete
 full_fillin

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 core/factorization/cholesky.cpp               |  6 +-
 core/factorization/ic.cpp                     | 84 +++++++++++++----
 core/factorization/ilu.cpp                    | 68 ++++++++++----
 core/factorization/lu.cpp                     | 11 +--
 core/test/config/factorization.cpp            |  6 --
 .../ginkgo/core/factorization/cholesky.hpp    | 15 +--
 include/ginkgo/core/factorization/lu.hpp      | 17 +---
 .../test/factorization/cholesky_kernels.cpp   | 92 ------------------
 reference/test/factorization/ic_kernels.cpp   | 81 +++++++++++++++-
 reference/test/factorization/ilu_kernels.cpp  | 91 ++++++++++++++++++
 reference/test/factorization/lu_kernels.cpp   | 68 --------------
 test/factorization/cholesky_kernels.cpp       | 93 -------------------
 test/factorization/ic_kernels.cpp             | 75 +++++++++++++++
 test/factorization/ilu_kernels.cpp            | 69 ++++++++++++++
 test/factorization/lu_kernels.cpp             | 81 ----------------
 15 files changed, 439 insertions(+), 418 deletions(-)

diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index 7366abda86d..92d598f0bd7 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -51,9 +51,6 @@ Cholesky<ValueType, IndexType>::parse(
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
-    if (auto& obj = config.get("full_fillin")) {
-        params.with_full_fillin(config::get_value<bool>(obj));
-    }
 
     return params;
 }
@@ -140,8 +137,7 @@ std::unique_ptr<LinOp> Cholesky<ValueType, IndexType>::generate_impl(
     exec->run(make_factorize(
         storage_offsets.get_const_data(), row_descs.get_const_data(),
         storage.get_const_data(), diag_idxs.get_const_data(),
-        transpose_idxs.get_const_data(), *forest, factors.get(),
-        parameters_.full_fillin, tmp));
+        transpose_idxs.get_const_data(), *forest, factors.get(), true, tmp));
     return factorization_type::create_from_combined_cholesky(
         std::move(factors));
 }
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 9c036b08dec..a01df15f7c8 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -14,9 +14,14 @@
 #include <ginkgo/core/factorization/cholesky.hpp>
 
 #include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
 #include "core/config/config_helper.hpp"
+#include "core/factorization/cholesky_kernels.hpp"
+#include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/factorization/ic_kernels.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
 
 
 namespace gko {
@@ -31,6 +36,13 @@ GKO_REGISTER_OPERATION(add_diagonal_elements,
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
                        factorization::initialize_row_ptrs_l);
 GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
+// for gko syncfree implementation
+GKO_REGISTER_OPERATION(fill_array, components::fill_array);
+GKO_REGISTER_OPERATION(build_lookup_offsets, csr::build_lookup_offsets);
+GKO_REGISTER_OPERATION(build_lookup, csr::build_lookup);
+GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor);
+GKO_REGISTER_OPERATION(initialize, cholesky::initialize);
+GKO_REGISTER_OPERATION(factorize, cholesky::factorize);
 
 
 }  // anonymous namespace
@@ -95,24 +107,60 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
     // Compute LC factorization
     if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
         parameters_.algorithm == factorize_algorithm::syncfree) {
-        auto sparsity =
-            share(gko::matrix::SparsityCsr<ValueType, IndexType>::create_const(
-                exec, local_system_matrix->get_size(),
-                make_const_array_view(
-                    exec, local_system_matrix->get_num_stored_elements(),
-                    local_system_matrix->get_const_col_idxs()),
-                make_const_array_view(
-                    exec, local_system_matrix->get_size()[0] + 1,
-                    local_system_matrix->get_const_row_ptrs())));
-        ic = gko::experimental::factorization::Cholesky<ValueType,
-                                                        IndexType>::build()
-                 .with_full_fillin(false)
-                 .with_symbolic_factorization(sparsity)
-                 .with_skip_sorting(
-                     true)  // we have decided sort or not in earlir stage
-                 .on(exec)
-                 ->generate(local_system_matrix)
-                 ->get_combined();
+        std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
+            forest;
+        const auto nnz = local_system_matrix->get_num_stored_elements();
+        const auto num_rows = local_system_matrix->get_size()[0];
+        auto factors = share(
+            matrix_type::create(exec, local_system_matrix->get_size(), nnz));
+        exec->copy_from(exec, nnz, local_system_matrix->get_const_col_idxs(),
+                        factors->get_col_idxs());
+        exec->copy_from(exec, num_rows + 1,
+                        local_system_matrix->get_const_row_ptrs(),
+                        factors->get_row_ptrs());
+        // update srow to be safe
+        factors->set_strategy(factors->get_strategy());
+        forest =
+            std::make_unique<gko::factorization::elimination_forest<IndexType>>(
+                exec, num_rows);
+        exec->run(
+            ic_factorization::make_forest_from_factor(factors.get(), *forest));
+
+        // setup lookup structure on factors
+        array<IndexType> storage_offsets{exec, num_rows + 1};
+        array<int64> row_descs{exec, num_rows};
+        array<IndexType> diag_idxs{exec, num_rows};
+        array<IndexType> transpose_idxs{exec,
+                                        factors->get_num_stored_elements()};
+        const auto allowed_sparsity = gko::matrix::csr::sparsity_type::bitmap |
+                                      gko::matrix::csr::sparsity_type::full |
+                                      gko::matrix::csr::sparsity_type::hash;
+        exec->run(ic_factorization::make_build_lookup_offsets(
+            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
+            num_rows, allowed_sparsity, storage_offsets.get_data()));
+        const auto storage_size =
+            static_cast<size_type>(get_element(storage_offsets, num_rows));
+        array<int32> storage{exec, storage_size};
+        exec->run(ic_factorization::make_build_lookup(
+            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
+            num_rows, allowed_sparsity, storage_offsets.get_const_data(),
+            row_descs.get_data(), storage.get_data()));
+        // initialize factors
+        exec->run(ic_factorization::make_fill_array(
+            factors->get_values(), factors->get_num_stored_elements(),
+            zero<ValueType>()));
+        exec->run(ic_factorization::make_initialize(
+            local_system_matrix.get(), storage_offsets.get_const_data(),
+            row_descs.get_const_data(), storage.get_const_data(),
+            diag_idxs.get_data(), transpose_idxs.get_data(), factors.get()));
+        // run numerical factorization
+        array<int> tmp{exec};
+        exec->run(ic_factorization::make_factorize(
+            storage_offsets.get_const_data(), row_descs.get_const_data(),
+            storage.get_const_data(), diag_idxs.get_const_data(),
+            transpose_idxs.get_const_data(), *forest, factors.get(), false,
+            tmp));
+        ic = factors;
     } else {
         exec->run(ic_factorization::make_compute(local_system_matrix.get()));
         ic = local_system_matrix;
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 0969c6a5bb7..6528504706a 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -14,10 +14,14 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 #include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
 #include "core/config/config_helper.hpp"
 #include "core/factorization/factorization_kernels.hpp"
 #include "core/factorization/ilu_kernels.hpp"
+#include "core/factorization/lu_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
 
 
 namespace gko {
@@ -32,6 +36,12 @@ GKO_REGISTER_OPERATION(add_diagonal_elements,
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
                        factorization::initialize_row_ptrs_l_u);
 GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
+// for gko syncfree implementation
+GKO_REGISTER_OPERATION(fill_array, components::fill_array);
+GKO_REGISTER_OPERATION(build_lookup_offsets, csr::build_lookup_offsets);
+GKO_REGISTER_OPERATION(build_lookup, csr::build_lookup);
+GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
+GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
 
 
 }  // anonymous namespace
@@ -95,24 +105,46 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     // Compute LU factorization
     if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
         parameters_.algorithm == factorize_algorithm::syncfree) {
-        auto sparsity =
-            share(gko::matrix::SparsityCsr<ValueType, IndexType>::create_const(
-                exec, local_system_matrix->get_size(),
-                make_const_array_view(
-                    exec, local_system_matrix->get_num_stored_elements(),
-                    local_system_matrix->get_const_col_idxs()),
-                make_const_array_view(
-                    exec, local_system_matrix->get_size()[0] + 1,
-                    local_system_matrix->get_const_row_ptrs())));
-        ilu =
-            gko::experimental::factorization::Lu<ValueType, IndexType>::build()
-                .with_full_fillin(false)
-                .with_symbolic_factorization(sparsity)
-                .with_skip_sorting(
-                    true)  // we have decided sort or not in earlir stage
-                .on(exec)
-                ->generate(local_system_matrix)
-                ->get_combined();
+        const auto nnz = local_system_matrix->get_num_stored_elements();
+        const auto num_rows = local_system_matrix->get_size()[0];
+        auto factors = share(
+            matrix_type::create(exec, local_system_matrix->get_size(), nnz));
+        exec->copy_from(exec, nnz, local_system_matrix->get_const_col_idxs(),
+                        factors->get_col_idxs());
+        exec->copy_from(exec, num_rows + 1,
+                        local_system_matrix->get_const_row_ptrs(),
+                        factors->get_row_ptrs());
+        // update srow to be safe
+        factors->set_strategy(factors->get_strategy());
+
+        // setup lookup structure on factors
+        array<IndexType> storage_offsets{exec, num_rows + 1};
+        array<int64> row_descs{exec, num_rows};
+        array<IndexType> diag_idxs{exec, num_rows};
+        const auto allowed_sparsity = gko::matrix::csr::sparsity_type::bitmap |
+                                      gko::matrix::csr::sparsity_type::full |
+                                      gko::matrix::csr::sparsity_type::hash;
+        exec->run(ilu_factorization::make_build_lookup_offsets(
+            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
+            num_rows, allowed_sparsity, storage_offsets.get_data()));
+        const auto storage_size =
+            static_cast<size_type>(get_element(storage_offsets, num_rows));
+        array<int32> storage{exec, storage_size};
+        exec->run(ilu_factorization::make_build_lookup(
+            factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
+            num_rows, allowed_sparsity, storage_offsets.get_const_data(),
+            row_descs.get_data(), storage.get_data()));
+        exec->run(ilu_factorization::make_initialize(
+            local_system_matrix.get(), storage_offsets.get_const_data(),
+            row_descs.get_const_data(), storage.get_const_data(),
+            diag_idxs.get_data(), factors.get()));
+        // run numerical factorization
+        array<int> tmp{exec};
+        exec->run(ilu_factorization::make_factorize(
+            storage_offsets.get_const_data(), row_descs.get_const_data(),
+            storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
+            false, tmp));
+        ilu = factors;
     } else {
         exec->run(
             ilu_factorization::make_compute_ilu(local_system_matrix.get()));
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index c2152da2735..4feb78083d2 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -69,9 +69,6 @@ Lu<ValueType, IndexType>::parse(const config::pnode& config,
     if (auto& obj = config.get("skip_sorting")) {
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
-    if (auto& obj = config.get("full_fillin")) {
-        params.with_full_fillin(config::get_value<bool>(obj));
-    }
 
     return params;
 }
@@ -159,10 +156,10 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
         storage.get_const_data(), diag_idxs.get_data(), factors.get()));
     // run numerical factorization
     array<int> tmp{exec};
-    exec->run(make_factorize(
-        storage_offsets.get_const_data(), row_descs.get_const_data(),
-        storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
-        parameters_.full_fillin, tmp));
+    exec->run(
+        make_factorize(storage_offsets.get_const_data(),
+                       row_descs.get_const_data(), storage.get_const_data(),
+                       diag_idxs.get_const_data(), factors.get(), true, tmp));
     return factorization_type::create_from_combined_lu(std::move(factors));
 }
 
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 8c1fefb3d95..0d0f2faf0d4 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -149,8 +149,6 @@ struct Cholesky : FactorizationConfigTest<
             detail::registry_accessor::get_data<Sparsity>(reg, "sparsity"));
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
-        config_map["full_fillin"] = pnode{false};
-        param.with_full_fillin(false);
     }
 
     template <typename AnswerType>
@@ -162,7 +160,6 @@ struct Cholesky : FactorizationConfigTest<
         ASSERT_EQ(res_param.symbolic_factorization,
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
-        ASSERT_EQ(res_param.full_fillin, ans_param.full_fillin);
     }
 };
 
@@ -187,8 +184,6 @@ struct Lu : FactorizationConfigTest<
             gko::experimental::factorization::symbolic_type::near_symmetric);
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
-        config_map["full_fillin"] = pnode{false};
-        param.with_full_fillin(false);
     }
 
     template <typename AnswerType>
@@ -201,7 +196,6 @@ struct Lu : FactorizationConfigTest<
                   ans_param.symbolic_factorization);
         ASSERT_EQ(res_param.symbolic_algorithm, ans_param.symbolic_algorithm);
         ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
-        ASSERT_EQ(res_param.full_fillin, ans_param.full_fillin);
     }
 };
 
diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp
index a984738cdfa..e2665ed1975 100644
--- a/include/ginkgo/core/factorization/cholesky.hpp
+++ b/include/ginkgo/core/factorization/cholesky.hpp
@@ -23,7 +23,9 @@ namespace factorization {
  * Computes a Cholesky factorization of a symmetric, positive-definite sparse
  * matrix. This LinOpFactory returns a Factorization storing the L and L^H
  * factors for the provided system matrix in matrix::Csr format. If no symbolic
- * factorization is provided, it will be computed first.
+ * factorization is provided, it will be computed first. It expects all fill-in
+ * entries to be present in the symbolic factorization. If the symbolic
+ * factorization is missing some entries, please refer to Ic instead.
  *
  * @tparam ValueType  the type used to store values of the system matrix
  * @tparam IndexType  the type used to store sparsity pattern indices of the
@@ -67,17 +69,6 @@ class Cholesky
          * incorrect results or crash.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
-
-        /**
-         * If the user provides the symbolic factorization, it should contain
-         * the fill-in for the matrix. i.e., When this is true, the symbolic
-         * factorization must contain the non-zero locations in the original
-         * matrix and the corresponding fill-in locations during factorization.
-         * If it does not have full fill-in, as in Ilu, this parameter must be
-         * set to false in order to avoid the possibility of hanging or illegal
-         * memory accesses during the factorization process.
-         */
-        bool GKO_FACTORY_PARAMETER_SCALAR(full_fillin, true);
     };
 
     /**
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index 3704719f32a..090cf5cfe11 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -42,7 +42,9 @@ enum class symbolic_type {
  * Computes an LU factorization of a sparse matrix. This LinOpFactory returns a
  * Factorization storing the L and U factors for the provided system matrix in
  * matrix::Csr format. If no symbolic factorization is provided, it will be
- * computed first.
+ * computed first. It expects all fill-in entries to be present in the symbolic
+ * factorization. If the symbolic factorization is missing some entries, please
+ * refer to Ilu instead.
  *
  * @tparam ValueType  the type used to store values of the system matrix
  * @tparam IndexType  the type used to store sparsity pattern indices of the
@@ -97,19 +99,6 @@ class Lu
          * incorrect results or crash.
          */
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
-
-        /**
-         * If the user provides the symbolic factorization, it should contain
-         * the fill-in for the matrix. i.e., When this is true, the symbolic
-         * factorization must contain the non-zero locations in the original
-         * matrix and the corresponding fill-in locations during factorization.
-         * If it does not have full fill-in, as in Ilu, this parameter must be
-         * set to false in order to avoid the possibility of hanging or illegal
-         * memory accesses during the factorization process. Also, the symbolic
-         * factorization still needs to contain all entries from the original
-         * matrix.
-         */
-        bool GKO_FACTORY_PARAMETER_SCALAR(full_fillin, true);
     };
 
     /**
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index 86a2254a70b..b4c33d76ab9 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -25,9 +25,6 @@
 #include "matrices/config.hpp"
 
 
-namespace {
-
-
 template <typename ValueIndexType>
 class Cholesky : public ::testing::Test {
 protected:
@@ -490,92 +487,3 @@ TYPED_TEST(Cholesky, FactorizeWithKnownSparsityWorks)
         },
         false);
 }
-
-
-TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
-    // diag + full first row and column
-    // the third and forth row use bitmap for lookup table
-    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                        {1.0, 2.0, 0.0, 0.0},
-                                                        {1.0, 0.0, 2.0, 0.0},
-                                                        {1.0, 0.0, 0.0, 2.0}},
-                                                       this->ref));
-    auto sparsity = gko::share(sparsity_matrix_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto result = gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                {1.0, 1.0, 0.0, 0.0},
-                                                {1.0, 0.0, 1.0, 0.0},
-                                                {1.0, 0.0, 0.0, 1.0}},
-                                               this->ref);
-    auto factory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-
-    auto cholesky = factory->generate(mtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(), mtx);
-    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), result, r<value_type>::value);
-}
-
-
-TYPED_TEST(Cholesky, GenerateIcWithHashmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
-    int n = 68;
-    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
-    gko::matrix_data<value_type, index_type> result(gko::dim<2>(n, n));
-    for (int i = 0; i < n; i++) {
-        if (i == n - 2 || i == n - 3) {
-            data.nonzeros.emplace_back(i, i, value_type{2});
-        } else {
-            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
-        }
-        result.nonzeros.emplace_back(i, i, gko::one<value_type>());
-    }
-    // the following rows use hashmap for lookup table
-    // add dependence
-    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
-    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
-    // add a entry whose col idx is not shown in the above row
-    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
-    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
-    data.sort_row_major();
-    auto mtx = gko::share(matrix_type::create(this->ref));
-    mtx->read(data);
-    // prepare result
-    result.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
-    result.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
-    result.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
-    result.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
-    result.sort_row_major();
-    auto result_mtx = gko::share(matrix_type::create(this->ref));
-    result_mtx->read(result);
-    auto sparsity = gko::share(sparsity_matrix_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto factory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-
-    auto cholesky = factory->generate(mtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(), result_mtx);
-    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), result_mtx,
-                        r<value_type>::value);
-}
-
-
-}  // namespace
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 2d69d01592b..5195231cf02 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -17,9 +17,6 @@
 #include "core/test/utils.hpp"
 
 
-namespace {
-
-
 class DummyLinOp : public gko::EnableLinOp<DummyLinOp>,
                    public gko::EnableCreateMethod<DummyLinOp> {
 public:
@@ -206,4 +203,80 @@ TYPED_TEST(Ic, GenerateGeneralBySyncfree)
 }
 
 
-}  // namespace
+TYPED_TEST(Ic, GenerateIcWithBitmapIsEquivalentToRefBySyncfree)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Csr;
+    // diagonal plus a full first row and first column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<Csr>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 2.0, 0.0, 0.0},
+                                                {1.0, 0.0, 2.0, 0.0},
+                                                {1.0, 0.0, 0.0, 2.0}},
+                                               this->ref));
+    auto result_l = gko::initialize<Csr>({{1.0, 0.0, 0.0, 0.0},
+                                          {1.0, 1.0, 0.0, 0.0},
+                                          {1.0, 0.0, 1.0, 0.0},
+                                          {1.0, 0.0, 0.0, 1.0}},
+                                         this->ref);
+    auto result_lt = gko::as<Csr>(result_l->conj_transpose());
+    auto factory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+
+    auto ic = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_l_factor(), result_l);
+    GKO_ASSERT_MTX_NEAR(ic->get_l_factor(), result_l, this->tol);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_lt_factor(), result_lt);
+    GKO_ASSERT_MTX_NEAR(ic->get_lt_factor(), result_lt, this->tol);
+}
+
+
+TYPED_TEST(Ic, GenerateIcWithHashmapIsEquivalentToRefBySyncfree)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Csr;
+    int n = 68;
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    gko::matrix_data<value_type, index_type> result(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        if (i == n - 2 || i == n - 3) {
+            data.nonzeros.emplace_back(i, i, value_type{2});
+        } else {
+            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        }
+        result.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // the following rows use a hashmap for the lookup table
+    // add a dependency
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
+    // add an entry whose column index is not present in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(Csr::create(this->ref));
+    mtx->read(data);
+    // prepare the expected result (lower triangular part only)
+    result.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    result.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    result.sort_row_major();
+    auto result_l = gko::share(Csr::create(this->ref));
+    result_l->read(result);
+    auto result_lt = gko::as<Csr>(result_l->conj_transpose());
+    auto factory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+
+    auto ic = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_l_factor(), result_l);
+    GKO_ASSERT_MTX_NEAR(ic->get_l_factor(), result_l, this->tol);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_lt_factor(), result_lt);
+    GKO_ASSERT_MTX_NEAR(ic->get_lt_factor(), result_lt, this->tol);
+}
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index 977fad42dcc..e06915530e2 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -368,6 +368,97 @@ TYPED_TEST(Ilu, GenerateForCsrSmallBySyncfree)
 }
 
 
+TYPED_TEST(Ilu, GenerateIluWithBitmapIsEquivalentToRefBySyncfree)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Csr;
+    // diagonal plus a full first row and first column
+    // the third and fourth rows use a bitmap for the lookup table
+    auto mtx = gko::share(gko::initialize<Csr>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 1.0, 0.0, 0.0},
+                                                {1.0, 0.0, 1.0, 0.0},
+                                                {1.0, 0.0, 0.0, 1.0}},
+                                               this->ref));
+    // generate the matrices from matrix data to ensure the sparsity pattern
+    gko::matrix_data<value_type, index_type> result_l_data(gko::dim<2>(4, 4),
+                                                           {{0, 0, 1},
+                                                            {1, 0, 1},
+                                                            {1, 1, 1},
+                                                            {2, 0, 1},
+                                                            {2, 2, 1},
+                                                            {3, 0, 1},
+                                                            {3, 3, 1}});
+    auto result_l = Csr::create(this->ref);
+    result_l->read(result_l_data);
+    gko::matrix_data<value_type, index_type> result_u_data(gko::dim<2>(4, 4),
+                                                           {{0, 0, 1},
+                                                            {0, 1, 1},
+                                                            {0, 2, 1},
+                                                            {0, 3, 1},
+                                                            {1, 1, 0},
+                                                            {2, 2, 0},
+                                                            {3, 3, 0}});
+    auto result_u = Csr::create(this->ref);
+    result_u->read(result_u_data);
+    auto factory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+
+    auto lu = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_l_factor(), result_l);
+    GKO_ASSERT_MTX_NEAR(lu->get_l_factor(), result_l, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_u_factor(), result_u);
+    GKO_ASSERT_MTX_NEAR(lu->get_u_factor(), result_u, r<value_type>::value);
+}
+
+
+TYPED_TEST(Ilu, GenerateIluWithHashmapIsEquivalentToRefBySyncfree)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename TestFixture::Csr;
+    int n = 68;
+    // the first row and second-to-last row use a hashmap for the lookup table
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    gko::matrix_data<value_type, index_type> result_l_data(gko::dim<2>(n, n));
+    gko::matrix_data<value_type, index_type> result_u_data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        result_l_data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        result_u_data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // add a dependency
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    result_l_data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    // add an entry whose column index is not present in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    result_u_data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.sort_row_major();
+    result_l_data.sort_row_major();
+    result_u_data.sort_row_major();
+    auto mtx = gko::share(Csr::create(this->ref));
+    mtx->read(data);
+    auto result_l = gko::share(Csr::create(this->ref));
+    result_l->read(result_l_data);
+    auto result_u = gko::share(Csr::create(this->ref));
+    result_u->read(result_u_data);
+    auto factory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+
+    auto lu = factory->generate(mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_l_factor(), result_l);
+    GKO_ASSERT_MTX_NEAR(lu->get_l_factor(), result_l, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_u_factor(), result_u);
+    GKO_ASSERT_MTX_NEAR(lu->get_u_factor(), result_u, r<value_type>::value);
+}
+
+
 TYPED_TEST(Ilu, GenerateForCsrSmall2ZeroDiagonal)
 {
     using value_type = typename TestFixture::value_type;
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 6f4f7a1d088..68619bcf994 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -331,71 +331,3 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks)
         ASSERT_EQ(lu->get_diagonal(), nullptr);
     });
 }
-
-
-TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    // diag + full first row and column
-    // the third and forth row use bitmap for lookup table
-    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                        {1.0, 1.0, 0.0, 0.0},
-                                                        {1.0, 0.0, 1.0, 0.0},
-                                                        {1.0, 0.0, 0.0, 1.0}},
-                                                       this->ref));
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto result = gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                {1.0, 0.0, 0.0, 0.0},
-                                                {1.0, 0.0, 0.0, 0.0},
-                                                {1.0, 0.0, 0.0, 0.0}},
-                                               this->ref);
-    auto factory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-
-    auto lu = factory->generate(mtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), mtx);
-    GKO_ASSERT_MTX_NEAR(lu->get_combined(), result, r<value_type>::value);
-}
-
-
-TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    int n = 68;
-    // the first row and second last row use hashmap for lookup table
-    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
-    for (int i = 0; i < n; i++) {
-        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
-    }
-    // add dependence
-    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
-    // add a entry whose col idx is not shown in the above row
-    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
-    data.sort_row_major();
-    auto mtx = gko::share(matrix_type::create(this->ref));
-    mtx->read(data);
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto factory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-
-    auto lu = factory->generate(mtx);
-
-    // the result combined matrix is the same as the original matrix
-    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), mtx);
-    GKO_ASSERT_MTX_NEAR(lu->get_combined(), mtx, r<value_type>::value);
-}
\ No newline at end of file
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index bf0be37d42a..61bc86bbf43 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -480,97 +480,4 @@ TYPED_TEST(Cholesky, GenerateWithKnownSparsityIsEquivalentToRef)
 }
 
 
-TYPED_TEST(Cholesky, GenerateIcWithBitmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    // diag + full first row and column
-    // the third and forth row use bitmap for lookup table
-    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                        {1.0, 2.0, 0.0, 0.0},
-                                                        {1.0, 0.0, 2.0, 0.0},
-                                                        {1.0, 0.0, 0.0, 2.0}},
-                                                       this->ref));
-    auto dmtx = gko::share(mtx->clone(this->exec));
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto dsparsity = gko::share(sparsity->clone(this->exec));
-
-    auto factory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-    auto dfactory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(dsparsity)
-            .with_full_fillin(false)
-            .on(this->exec);
-
-    auto cholesky = factory->generate(mtx);
-    auto dcholesky = dfactory->generate(dmtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(),
-                               dcholesky->get_combined());
-    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), dcholesky->get_combined(),
-                        r<value_type>::value);
-}
-
-
-TYPED_TEST(Cholesky, GenerateIluWithHashmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    int n = 68;
-    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
-    for (int i = 0; i < n; i++) {
-        if (i == n - 2 || i == n - 3) {
-            data.nonzeros.emplace_back(i, i, value_type{2});
-        } else {
-            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
-        }
-    }
-    // the following rows use hashmap for lookup table
-    // add dependence
-    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
-    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
-    // add a entry whose col idx is not shown in the above row
-    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
-    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
-    data.sort_row_major();
-    auto mtx = gko::share(matrix_type::create(this->ref));
-    mtx->read(data);
-    auto dmtx = gko::share(mtx->clone(this->exec));
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto dsparsity = gko::share(sparsity->clone(this->exec));
-    auto factory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-    auto dfactory =
-        gko::experimental::factorization::Cholesky<value_type,
-                                                   index_type>::build()
-            .with_symbolic_factorization(dsparsity)
-            .with_full_fillin(false)
-            .on(this->exec);
-
-    auto cholesky = factory->generate(mtx);
-    auto dcholesky = dfactory->generate(dmtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(cholesky->get_combined(),
-                               dcholesky->get_combined());
-    GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), dcholesky->get_combined(),
-                        r<value_type>::value);
-}
-
-
 }  // namespace
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index a93d14778ad..587e93c0a87 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -90,6 +90,81 @@ TEST_F(Ic, ComputeICIsEquivalentToRefUnsorted)
 }
 
 
+TEST_F(Ic, ComputeICWithBitmapIsEquivalentToRefBySyncfree)
+{
+    // diag + full first row and column
+    // the third and fourth rows use bitmap for lookup table
+    auto mtx = gko::share(gko::initialize<Csr>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 2.0, 0.0, 0.0},
+                                                {1.0, 0.0, 2.0, 0.0},
+                                                {1.0, 0.0, 0.0, 2.0}},
+                                               this->ref));
+    auto dmtx = gko::share(mtx->clone(this->exec));
+
+    auto factory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+    auto dfactory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec);
+
+    auto ic = factory->generate(mtx);
+    auto dic = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(ic->get_l_factor(), dic->get_l_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(ic->get_lt_factor(), dic->get_lt_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_l_factor(), dic->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_lt_factor(), dic->get_lt_factor());
+}
+
+
+TEST_F(Ic, ComputeICWithHashmapIsEquivalentToRefBySyncfree)
+{
+    int n = 68;
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        if (i == n - 2 || i == n - 3) {
+            data.nonzeros.emplace_back(i, i, value_type{2});
+        } else {
+            data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+        }
+    }
+    // the following rows use hashmap for lookup table
+    // add dependence
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    data.nonzeros.emplace_back(0, n - 3, gko::one<value_type>());
+    // add an entry whose col idx is not shown in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.nonzeros.emplace_back(n - 2, 0, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(Csr::create(this->ref));
+    mtx->read(data);
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto factory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+    auto dfactory =
+        gko::factorization::Ic<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec);
+
+    auto ic = factory->generate(mtx);
+    auto dic = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(ic->get_l_factor(), dic->get_l_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(ic->get_lt_factor(), dic->get_lt_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_l_factor(), dic->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ic->get_lt_factor(), dic->get_lt_factor());
+}
+
+
 TEST_F(Ic, SetsCorrectStrategy)
 {
     auto dfact = gko::factorization::Ic<>::build()
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index 297f0d6d922..a45104f33a7 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -90,6 +90,75 @@ TEST_F(Ilu, ComputeILUIsEquivalentToRefUnsorted)
 }
 
 
+TEST_F(Ilu, ComputeILUWithBitmapIsEquivalentToRefBySyncfree)
+{
+    // diag + full first row and column
+    // the third and fourth rows use bitmap for lookup table
+    auto mtx = gko::share(gko::initialize<Csr>({{1.0, 1.0, 1.0, 1.0},
+                                                {1.0, 1.0, 0.0, 0.0},
+                                                {1.0, 0.0, 1.0, 0.0},
+                                                {1.0, 0.0, 0.0, 1.0}},
+                                               this->ref));
+    auto dmtx = gko::share(mtx->clone(this->exec));
+
+    auto factory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+    auto dfactory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec);
+
+    auto ilu = factory->generate(mtx);
+    auto dilu = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(ilu->get_l_factor(), dilu->get_l_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(ilu->get_u_factor(), dilu->get_u_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ilu->get_l_factor(), dilu->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ilu->get_u_factor(), dilu->get_u_factor());
+}
+
+
+TEST_F(Ilu, ComputeILUWithHashmapIsEquivalentToRefBySyncfree)
+{
+    int n = 68;
+    // the first row and second last row use hashmap for lookup table
+    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
+    for (int i = 0; i < n; i++) {
+        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
+    }
+    // add dependence
+    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
+    // add an entry whose col idx is not shown in the above row
+    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
+    data.sort_row_major();
+    auto mtx = gko::share(Csr::create(this->ref));
+    mtx->read(data);
+    auto dmtx = gko::share(mtx->clone(this->exec));
+    auto factory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->ref);
+    auto dfactory =
+        gko::factorization::Ilu<value_type, index_type>::build()
+            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .on(this->exec);
+
+    auto ilu = factory->generate(mtx);
+    auto dilu = dfactory->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(ilu->get_l_factor(), dilu->get_l_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(ilu->get_u_factor(), dilu->get_u_factor(),
+                        r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(ilu->get_l_factor(), dilu->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(ilu->get_u_factor(), dilu->get_u_factor());
+}
+
+
 TEST_F(Ilu, SetsCorrectStrategy)
 {
     auto dfact = gko::factorization::Ilu<>::build()
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index fab1456badc..58b0aa6c6d4 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -351,84 +351,3 @@ TYPED_TEST(Lu, GenerateUnsymmWithUnknownSparsityIsEquivalentToRef)
                             r<value_type>::value);
     });
 }
-
-
-TYPED_TEST(Lu, GenerateIluWithBitmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    // diag + full first row and column
-    // the third and forth row use bitmap for lookup table
-    auto mtx = gko::share(gko::initialize<matrix_type>({{1.0, 1.0, 1.0, 1.0},
-                                                        {1.0, 1.0, 0.0, 0.0},
-                                                        {1.0, 0.0, 1.0, 0.0},
-                                                        {1.0, 0.0, 0.0, 1.0}},
-                                                       this->ref));
-    auto dmtx = gko::share(mtx->clone(this->exec));
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto dsparsity = gko::share(sparsity->clone(this->exec));
-
-    auto factory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-    auto dfactory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(dsparsity)
-            .with_full_fillin(false)
-            .on(this->exec);
-
-    auto lu = factory->generate(mtx);
-    auto dlu = dfactory->generate(dmtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), dlu->get_combined());
-    GKO_ASSERT_MTX_NEAR(lu->get_combined(), dlu->get_combined(),
-                        r<value_type>::value);
-}
-
-
-TYPED_TEST(Lu, GenerateIluWithHashmapIsEquivalentToRef)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = typename TestFixture::index_type;
-    using matrix_type = typename TestFixture::matrix_type;
-    using sparsity_pattern_type = typename TestFixture::sparsity_pattern_type;
-    int n = 68;
-    // the first row and second last row use hashmap for lookup table
-    gko::matrix_data<value_type, index_type> data(gko::dim<2>(n, n));
-    for (int i = 0; i < n; i++) {
-        data.nonzeros.emplace_back(i, i, gko::one<value_type>());
-    }
-    // add dependence
-    data.nonzeros.emplace_back(n - 3, 0, gko::one<value_type>());
-    // add a entry whose col idx is not shown in the above row
-    data.nonzeros.emplace_back(0, n - 2, gko::one<value_type>());
-    data.sort_row_major();
-    auto mtx = gko::share(matrix_type::create(this->ref));
-    mtx->read(data);
-    auto dmtx = gko::share(mtx->clone(this->exec));
-    auto sparsity = gko::share(sparsity_pattern_type::create(this->ref));
-    mtx->convert_to(sparsity);
-    auto dsparsity = gko::share(sparsity->clone(this->exec));
-    auto factory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(sparsity)
-            .with_full_fillin(false)
-            .on(this->ref);
-    auto dfactory =
-        gko::experimental::factorization::Lu<value_type, index_type>::build()
-            .with_symbolic_factorization(dsparsity)
-            .with_full_fillin(false)
-            .on(this->exec);
-
-    auto lu = factory->generate(mtx);
-    auto dlu = dfactory->generate(dmtx);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), dlu->get_combined());
-    GKO_ASSERT_MTX_NEAR(lu->get_combined(), dlu->get_combined(),
-                        r<value_type>::value);
-}

From bc66df28db45cf9b46fa8cd9d5cf091bd56677f1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 18:02:39 +0100
Subject: [PATCH 337/448] refine the wording and fix wrong bool value

Co-authored-by: Natalie Beams <246972+nbeams@users.noreply.github.com>
---
 common/cuda_hip/factorization/ic_kernels.cpp  |  7 +++---
 core/device_hooks/common_kernels.inc.cpp      |  2 +-
 core/factorization/ic.cpp                     | 13 +++++-----
 core/factorization/ic_kernels.hpp             |  8 +++---
 core/factorization/ilu.cpp                    | 12 ++++-----
 dpcpp/factorization/ic_kernels.dp.cpp         |  7 +++---
 .../ginkgo/core/factorization/cholesky.hpp    |  4 +--
 include/ginkgo/core/factorization/ic.hpp      |  9 ++++---
 include/ginkgo/core/factorization/ilu.hpp     |  9 ++++---
 .../incomplete_factorization.hpp              | 25 +++++++++++++++++++
 .../incompleted_factorization.hpp             | 25 -------------------
 include/ginkgo/core/factorization/lu.hpp      |  6 ++---
 include/ginkgo/ginkgo.hpp                     |  2 +-
 omp/factorization/ic_kernels.cpp              |  7 +++---
 reference/factorization/ic_kernels.cpp        |  7 +++---
 reference/test/factorization/ic_kernels.cpp   |  9 ++++---
 reference/test/factorization/ilu_kernels.cpp  |  9 ++++---
 reference/test/factorization/lu_kernels.cpp   |  4 +--
 test/factorization/ic_kernels.cpp             | 15 +++++++----
 test/factorization/ilu_kernels.cpp            | 15 +++++++----
 test/factorization/lu_kernels.cpp             |  4 +--
 21 files changed, 110 insertions(+), 89 deletions(-)
 create mode 100644 include/ginkgo/core/factorization/incomplete_factorization.hpp
 delete mode 100644 include/ginkgo/core/factorization/incompleted_factorization.hpp

diff --git a/common/cuda_hip/factorization/ic_kernels.cpp b/common/cuda_hip/factorization/ic_kernels.cpp
index 62963c479bd..e84032bac35 100644
--- a/common/cuda_hip/factorization/ic_kernels.cpp
+++ b/common/cuda_hip/factorization/ic_kernels.cpp
@@ -17,8 +17,8 @@ namespace ic_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Csr<ValueType, IndexType>* m)
+void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
+                  matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
     auto handle = exec->get_sparselib_handle();
@@ -54,7 +54,8 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
     sparselib::destroy(desc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index dc789271e5f..98d85b2b6d2 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -900,7 +900,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 namespace ic_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index a01df15f7c8..6e57e02e250 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -30,7 +30,7 @@ namespace ic_factorization {
 namespace {
 
 
-GKO_REGISTER_OPERATION(compute, ic_factorization::compute);
+GKO_REGISTER_OPERATION(sparselib_ic, ic_factorization::sparselib_ic);
 GKO_REGISTER_OPERATION(add_diagonal_elements,
                        factorization::add_diagonal_elements);
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
@@ -66,12 +66,12 @@ Ic<ValueType, IndexType>::parse(const config::pnode& config,
         params.with_both_factors(config::get_value<bool>(obj));
     }
     if (auto& obj = config.get("algorithm")) {
-        using gko::factorization::factorize_algorithm;
+        using gko::factorization::incomplete_factorize_algorithm;
         auto str = obj.get_string();
         if (str == "sparselib") {
-            params.with_algorithm(factorize_algorithm::sparselib);
+            params.with_algorithm(incomplete_factorize_algorithm::sparselib);
         } else if (str == "syncfree") {
-            params.with_algorithm(factorize_algorithm::syncfree);
+            params.with_algorithm(incomplete_factorize_algorithm::syncfree);
         } else {
             GKO_INVALID_CONFIG_VALUE("algorithm", str);
         }
@@ -106,7 +106,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
     std::shared_ptr<const matrix_type> ic;
     // Compute LC factorization
     if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
-        parameters_.algorithm == factorize_algorithm::syncfree) {
+        parameters_.algorithm == incomplete_factorize_algorithm::syncfree) {
         std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
             forest;
         const auto nnz = local_system_matrix->get_num_stored_elements();
@@ -162,7 +162,8 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
             tmp));
         ic = factors;
     } else {
-        exec->run(ic_factorization::make_compute(local_system_matrix.get()));
+        exec->run(
+            ic_factorization::make_sparselib_ic(local_system_matrix.get()));
         ic = local_system_matrix;
     }
 
diff --git a/core/factorization/ic_kernels.hpp b/core/factorization/ic_kernels.hpp
index 187e6cf0e6d..cfdb816314b 100644
--- a/core/factorization/ic_kernels.hpp
+++ b/core/factorization/ic_kernels.hpp
@@ -19,13 +19,13 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_IC_COMPUTE_KERNEL(ValueType, IndexType)   \
-    void compute(std::shared_ptr<const DefaultExecutor> exec, \
-                 matrix::Csr<ValueType, IndexType>* system_matrix)
+#define GKO_DECLARE_IC_SPARSELIB_IC_KERNEL(ValueType, IndexType)   \
+    void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec, \
+                      matrix::Csr<ValueType, IndexType>* system_matrix)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                  \
     template <typename ValueType, typename IndexType> \
-    GKO_DECLARE_IC_COMPUTE_KERNEL(ValueType, IndexType)
+    GKO_DECLARE_IC_SPARSELIB_IC_KERNEL(ValueType, IndexType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(ic_factorization,
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 6528504706a..b66b9d37189 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -30,7 +30,7 @@ namespace ilu_factorization {
 namespace {
 
 
-GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::sparselib_ilu);
+GKO_REGISTER_OPERATION(sparselib_ilu, ilu_factorization::sparselib_ilu);
 GKO_REGISTER_OPERATION(add_diagonal_elements,
                        factorization::add_diagonal_elements);
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
@@ -65,12 +65,12 @@ Ilu<ValueType, IndexType>::parse(const config::pnode& config,
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
     if (auto& obj = config.get("algorithm")) {
-        using gko::factorization::factorize_algorithm;
+        using gko::factorization::incomplete_factorize_algorithm;
         auto str = obj.get_string();
         if (str == "sparselib") {
-            params.with_algorithm(factorize_algorithm::sparselib);
+            params.with_algorithm(incomplete_factorize_algorithm::sparselib);
         } else if (str == "syncfree") {
-            params.with_algorithm(factorize_algorithm::syncfree);
+            params.with_algorithm(incomplete_factorize_algorithm::syncfree);
         } else {
             GKO_INVALID_CONFIG_VALUE("algorithm", str);
         }
@@ -104,7 +104,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
     std::shared_ptr<const matrix_type> ilu;
     // Compute LU factorization
     if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
-        parameters_.algorithm == factorize_algorithm::syncfree) {
+        parameters_.algorithm == incomplete_factorize_algorithm::syncfree) {
         const auto nnz = local_system_matrix->get_num_stored_elements();
         const auto num_rows = local_system_matrix->get_size()[0];
         auto factors = share(
@@ -147,7 +147,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         ilu = factors;
     } else {
         exec->run(
-            ilu_factorization::make_compute_ilu(local_system_matrix.get()));
+            ilu_factorization::make_sparselib_ilu(local_system_matrix.get()));
         ilu = local_system_matrix;
     }
     // Separate L and U factors: nnz
diff --git a/dpcpp/factorization/ic_kernels.dp.cpp b/dpcpp/factorization/ic_kernels.dp.cpp
index 287849a68a8..b2626e7876a 100644
--- a/dpcpp/factorization/ic_kernels.dp.cpp
+++ b/dpcpp/factorization/ic_kernels.dp.cpp
@@ -17,10 +17,11 @@ namespace ic_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
+void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
+                  matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp
index e2665ed1975..f720d8ddbf7 100644
--- a/include/ginkgo/core/factorization/cholesky.hpp
+++ b/include/ginkgo/core/factorization/cholesky.hpp
@@ -24,8 +24,8 @@ namespace factorization {
  * matrix. This LinOpFactory returns a Factorization storing the L and L^H
  * factors for the provided system matrix in matrix::Csr format. If no symbolic
  * factorization is provided, it will be computed first. It expects all fill-in
- * entries present in symbolic factorization. If symbolic factorization misses
- * some entries, please refer to Ic.
+ * entries to be present in the symbolic factorization. If the symbolic
+ * factorization is missing some entries, please refer to Ic.
  *
  * @tparam ValueType  the type used to store values of the system matrix
  * @tparam IndexType  the type used to store sparsity pattern indices of the
diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp
index cb638709864..acc60918e99 100644
--- a/include/ginkgo/core/factorization/ic.hpp
+++ b/include/ginkgo/core/factorization/ic.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
-#include <ginkgo/core/factorization/incompleted_factorization.hpp>
+#include <ginkgo/core/factorization/incomplete_factorization.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
@@ -105,10 +105,11 @@ class Ic : public Composition<ValueType> {
          * Select the implementation which is supposed to be used for
          * the incomplete factorization. This only matters for the CUDA and HIP
          * executor where the choice is between the Ginkgo (syncfree) and the
-         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
+         * cuSPARSE/hipSPARSE/reference (sparselib) implementation. Default is
+         * sparselib.
          */
-        factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
-            algorithm, factorize_algorithm::sparselib);
+        incomplete_factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, incomplete_factorize_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index 2abec999604..27e3e665479 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
-#include <ginkgo/core/factorization/incompleted_factorization.hpp>
+#include <ginkgo/core/factorization/incomplete_factorization.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
@@ -100,10 +100,11 @@ class Ilu : public Composition<ValueType> {
          * Select the implementation which is supposed to be used for
          * the incomplete factorization. This only matters for the CUDA and HIP
          * executor where the choice is between the Ginkgo (syncfree) and the
-         * cuSPARSE/hipSPARSE (sparselib) implementation. Default is sparselib.
+         * cuSPARSE/hipSPARSE/reference (sparselib) implementation. Default is
+         * sparselib.
          */
-        factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
-            algorithm, factorize_algorithm::sparselib);
+        incomplete_factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, incomplete_factorize_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/factorization/incomplete_factorization.hpp b/include/ginkgo/core/factorization/incomplete_factorization.hpp
new file mode 100644
index 00000000000..57f1ae80f84
--- /dev/null
+++ b/include/ginkgo/core/factorization/incomplete_factorization.hpp
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETE_FACTORIZATION_HPP_
+#define GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETE_FACTORIZATION_HPP_
+
+
+namespace gko {
+namespace factorization {
+
+
+/**
+ * An enum class for algorithm selection in the incomplete factorization.
+ * `sparselib` is only available for CUDA, HIP, and reference.
+ * `syncfree` is Ginkgo's implementation by using the Lu/Cholesky factorization
+ * components with given sparsity.
+ */
+enum class incomplete_factorize_algorithm { sparselib, syncfree };
+
+
+}  // namespace factorization
+}  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETE_FACTORIZATION_HPP_
diff --git a/include/ginkgo/core/factorization/incompleted_factorization.hpp b/include/ginkgo/core/factorization/incompleted_factorization.hpp
deleted file mode 100644
index 9f712f56e23..00000000000
--- a/include/ginkgo/core/factorization/incompleted_factorization.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
-#define GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
-
-
-namespace gko {
-namespace factorization {
-
-
-/**
- * An enum class for algorithm selection in the incomplete factorization.
- * `sparselib` is only available for CUDA and HIP.
- * `syncfree` is Ginkgo's implementation through the Lu/Cholesky factorization
- * with given sparsity.
- */
-enum class factorize_algorithm { sparselib, syncfree };
-
-
-}  // namespace factorization
-}  // namespace gko
-
-#endif  // GKO_PUBLIC_CORE_FACTORIZATION_INCOMPLETED_FACTORIZATION_HPP_
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index 090cf5cfe11..a80b7f9079b 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -42,9 +42,9 @@ enum class symbolic_type {
  * Computes an LU factorization of a sparse matrix. This LinOpFactory returns a
  * Factorization storing the L and U factors for the provided system matrix in
  * matrix::Csr format. If no symbolic factorization is provided, it will be
- * computed first. It expects all fill-in entries present in symbolic
- * factorization. If symbolic factorization misses some entries, please refer to
- * Ilu.
+ * computed first. It expects all fill-in entries to be present in the symbolic
+ * factorization. If the symbolic factorization is missing some entries, please
+ * refer to Ilu.
  *
  * @tparam ValueType  the type used to store values of the system matrix
  * @tparam IndexType  the type used to store sparsity pattern indices of the
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index d1cb0248b08..e9cda520a19 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -76,7 +76,7 @@
 #include <ginkgo/core/factorization/factorization.hpp>
 #include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/factorization/ilu.hpp>
-#include <ginkgo/core/factorization/incompleted_factorization.hpp>
+#include <ginkgo/core/factorization/incomplete_factorization.hpp>
 #include <ginkgo/core/factorization/lu.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
 #include <ginkgo/core/factorization/par_ict.hpp>
diff --git a/omp/factorization/ic_kernels.cpp b/omp/factorization/ic_kernels.cpp
index f9b78abc835..c071ba2ca87 100644
--- a/omp/factorization/ic_kernels.cpp
+++ b/omp/factorization/ic_kernels.cpp
@@ -17,10 +17,11 @@ namespace ic_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
+void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
+                  matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
diff --git a/reference/factorization/ic_kernels.cpp b/reference/factorization/ic_kernels.cpp
index 6f88467262a..93945c2da14 100644
--- a/reference/factorization/ic_kernels.cpp
+++ b/reference/factorization/ic_kernels.cpp
@@ -21,8 +21,8 @@ namespace ic_factorization {
 
 
 template <typename ValueType, typename IndexType>
-void compute(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Csr<ValueType, IndexType>* m)
+void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
+                  matrix::Csr<ValueType, IndexType>* m)
 {
     vector<IndexType> diagonals{m->get_size()[0], -1, exec};
     const auto row_ptrs = m->get_const_row_ptrs();
@@ -69,7 +69,8 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 5195231cf02..22e03b2fe75 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -192,7 +192,8 @@ TYPED_TEST(Ic, GenerateGeneralBySyncfree)
 
     auto fact =
         factorization_type::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec)
             ->generate(this->mtx_system);
 
@@ -223,7 +224,8 @@ TYPED_TEST(Ic, GenerateIcWithBitmapIsEquivalentToRefBySyncfree)
     auto result_lt = gko::as<Csr>(result_l->conj_transpose());
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
 
     auto ic = factory->generate(mtx);
@@ -270,7 +272,8 @@ TYPED_TEST(Ic, GenerateIcWithHashmapIsEquivalentToRefBySyncfree)
     auto result_lt = gko::as<Csr>(result_l->conj_transpose());
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
 
     auto ic = factory->generate(mtx);
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index e06915530e2..1810c44f0a0 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -357,7 +357,8 @@ TYPED_TEST(Ilu, GenerateForCsrSmallBySyncfree)
     using ilu_type = typename TestFixture::ilu_type;
     auto factors =
         ilu_type::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec)
             ->generate(this->mtx_csr_small);
     auto l_factor = factors->get_l_factor();
@@ -403,7 +404,8 @@ TYPED_TEST(Ilu, GenerateIluWithBitmapIsEquivalentToRefBySyncfree)
     result_u->read(result_u_data);
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
@@ -447,7 +449,8 @@ TYPED_TEST(Ilu, GenerateIluWithHashmapIsEquivalentToRefBySyncfree)
     result_u->read(result_u_data);
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 68619bcf994..c10937ac486 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -36,8 +36,6 @@ class Lu : public ::testing::Test {
     using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using matrix_type = gko::matrix::Csr<value_type, index_type>;
-    using sparsity_pattern_type =
-        gko::matrix::SparsityCsr<value_type, index_type>;
 
     Lu()
         : ref(gko::ReferenceExecutor::create()),
@@ -218,7 +216,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks)
         gko::kernels::reference::lu_factorization::factorize(
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_const_data(), this->mtx_lu.get(), false, tmp);
+            diag_idxs.get_const_data(), this->mtx_lu.get(), true, tmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref,
                             15 * r<value_type>::value);
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index 587e93c0a87..e45124ea34d 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -64,7 +64,8 @@ TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
     auto dfact =
         gko::factorization::Ic<>::build()
             .with_skip_sorting(true)
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(exec)
             ->generate(dmtx);
 
@@ -103,11 +104,13 @@ TEST_F(Ic, ComputeICWithBitmapIsEquivalentToRefBySyncfree)
 
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec);
 
     auto ic = factory->generate(mtx);
@@ -146,11 +149,13 @@ TEST_F(Ic, ComputeICWithHashmapIsEquivalentToRefBySyncfree)
     auto dmtx = gko::share(mtx->clone(this->exec));
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec);
 
     auto ic = factory->generate(mtx);
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index a45104f33a7..8f9d9f3bb02 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -64,7 +64,8 @@ TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
     auto dfact =
         gko::factorization::Ilu<>::build()
             .with_skip_sorting(true)
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(exec)
             ->generate(dmtx);
 
@@ -103,11 +104,13 @@ TEST_F(Ilu, ComputeILUWithBitmapIsEquivalentToRefBySyncfree)
 
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec);
 
     auto ilu = factory->generate(mtx);
@@ -140,11 +143,13 @@ TEST_F(Ilu, ComputeILUWithHashmapIsEquivalentToRefBySyncfree)
     auto dmtx = gko::share(mtx->clone(this->exec));
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
+            .with_algorithm(
+                gko::factorization::incomplete_factorize_algorithm::syncfree)
             .on(this->exec);
 
     auto ilu = factory->generate(mtx);
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 58b0aa6c6d4..b9b8bbf00ee 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -196,11 +196,11 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
         gko::kernels::reference::lu_factorization::factorize(
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_const_data(), this->mtx_lu.get(), false, tmp);
+            diag_idxs.get_const_data(), this->mtx_lu.get(), true, tmp);
         gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
-            ddiag_idxs.get_const_data(), this->dmtx_lu.get(), false, dtmp);
+            ddiag_idxs.get_const_data(), this->dmtx_lu.get(), true, dtmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_lu, this->dmtx_lu, r<value_type>::value);
     });

From c54a3ee21caa16fb7d6522fda5b227ad0024f83c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 16:52:30 +0100
Subject: [PATCH 338/448] rename enum to incomplete_algorithm and throw for
 OmpExecutor when using sparselib

---
 core/factorization/ic.cpp                     |  13 +-
 core/factorization/ilu.cpp                    |  13 +-
 core/test/config/factorization.cpp            |   6 +-
 include/ginkgo/core/factorization/ic.hpp      |   4 +-
 include/ginkgo/core/factorization/ilu.hpp     |   4 +-
 .../incomplete_factorization.hpp              |   2 +-
 reference/test/factorization/ic_kernels.cpp   |   9 +-
 reference/test/factorization/ilu_kernels.cpp  |   9 +-
 test/factorization/ic_kernels.cpp             | 108 +++++++++-------
 test/factorization/ilu_kernels.cpp            | 115 ++++++++++--------
 10 files changed, 159 insertions(+), 124 deletions(-)

diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 6e57e02e250..818d0586b64 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -66,12 +66,12 @@ Ic<ValueType, IndexType>::parse(const config::pnode& config,
         params.with_both_factors(config::get_value<bool>(obj));
     }
     if (auto& obj = config.get("algorithm")) {
-        using gko::factorization::incomplete_factorize_algorithm;
+        using gko::factorization::incomplete_algorithm;
         auto str = obj.get_string();
         if (str == "sparselib") {
-            params.with_algorithm(incomplete_factorize_algorithm::sparselib);
+            params.with_algorithm(incomplete_algorithm::sparselib);
         } else if (str == "syncfree") {
-            params.with_algorithm(incomplete_factorize_algorithm::syncfree);
+            params.with_algorithm(incomplete_algorithm::syncfree);
         } else {
             GKO_INVALID_CONFIG_VALUE("algorithm", str);
         }
@@ -105,8 +105,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
 
     std::shared_ptr<const matrix_type> ic;
     // Compute LC factorization
-    if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
-        parameters_.algorithm == incomplete_factorize_algorithm::syncfree) {
+    if (parameters_.algorithm == incomplete_algorithm::syncfree) {
         std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
             forest;
         const auto nnz = local_system_matrix->get_num_stored_elements();
@@ -161,6 +160,10 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
             transpose_idxs.get_const_data(), *forest, factors.get(), false,
             tmp));
         ic = factors;
+    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec)) {
+        GKO_INVALID_STATE(
+            "OmpExecutor does not support sparselib algorithm. Please use "
+            "syncfree algorithm.");
     } else {
         exec->run(
             ic_factorization::make_sparselib_ic(local_system_matrix.get()));
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index b66b9d37189..1ef0afa9ef3 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -65,12 +65,12 @@ Ilu<ValueType, IndexType>::parse(const config::pnode& config,
         params.with_skip_sorting(config::get_value<bool>(obj));
     }
     if (auto& obj = config.get("algorithm")) {
-        using gko::factorization::incomplete_factorize_algorithm;
+        using gko::factorization::incomplete_algorithm;
         auto str = obj.get_string();
         if (str == "sparselib") {
-            params.with_algorithm(incomplete_factorize_algorithm::sparselib);
+            params.with_algorithm(incomplete_algorithm::sparselib);
         } else if (str == "syncfree") {
-            params.with_algorithm(incomplete_factorize_algorithm::syncfree);
+            params.with_algorithm(incomplete_algorithm::syncfree);
         } else {
             GKO_INVALID_CONFIG_VALUE("algorithm", str);
         }
@@ -103,8 +103,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
 
     std::shared_ptr<const matrix_type> ilu;
     // Compute LU factorization
-    if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
-        parameters_.algorithm == incomplete_factorize_algorithm::syncfree) {
+    if (parameters_.algorithm == incomplete_algorithm::syncfree) {
         const auto nnz = local_system_matrix->get_num_stored_elements();
         const auto num_rows = local_system_matrix->get_size()[0];
         auto factors = share(
@@ -145,6 +144,10 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
             storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
             false, tmp));
         ilu = factors;
+    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec)) {
+        GKO_INVALID_STATE(
+            "OmpExecutor does not support sparselib algorithm. Please use "
+            "syncfree algorithm.");
     } else {
         exec->run(
             ilu_factorization::make_sparselib_ilu(local_system_matrix.get()));
diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp
index 0d0f2faf0d4..1391787437a 100644
--- a/core/test/config/factorization.cpp
+++ b/core/test/config/factorization.cpp
@@ -76,7 +76,8 @@ struct Ic : FactorizationConfigTest<gko::factorization::Ic<float, int>,
         config_map["both_factors"] = pnode{false};
         param.with_both_factors(false);
         config_map["algorithm"] = pnode{"syncfree"};
-        param.with_algorithm(gko::factorization::factorize_algorithm::syncfree);
+        param.with_algorithm(
+            gko::factorization::incomplete_algorithm::syncfree);
     }
 
     template <typename AnswerType>
@@ -115,7 +116,8 @@ struct Ilu : FactorizationConfigTest<gko::factorization::Ilu<float, int>,
         config_map["skip_sorting"] = pnode{true};
         param.with_skip_sorting(true);
         config_map["algorithm"] = pnode{"syncfree"};
-        param.with_algorithm(gko::factorization::factorize_algorithm::syncfree);
+        param.with_algorithm(
+            gko::factorization::incomplete_algorithm::syncfree);
     }
 
     template <typename AnswerType>
diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp
index acc60918e99..4bc28e1dee9 100644
--- a/include/ginkgo/core/factorization/ic.hpp
+++ b/include/ginkgo/core/factorization/ic.hpp
@@ -108,8 +108,8 @@ class Ic : public Composition<ValueType> {
          * cuSPARSE/hipSPARSE/reference (sparselib) implementation. Default is
          * sparselib.
          */
-        incomplete_factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
-            algorithm, incomplete_factorize_algorithm::sparselib);
+        incomplete_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, incomplete_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
index 27e3e665479..96c1e6c10f8 100644
--- a/include/ginkgo/core/factorization/ilu.hpp
+++ b/include/ginkgo/core/factorization/ilu.hpp
@@ -103,8 +103,8 @@ class Ilu : public Composition<ValueType> {
          * cuSPARSE/hipSPARSE/reference (sparselib) implementation. Default is
          * sparselib.
          */
-        incomplete_factorize_algorithm GKO_FACTORY_PARAMETER_SCALAR(
-            algorithm, incomplete_factorize_algorithm::sparselib);
+        incomplete_algorithm GKO_FACTORY_PARAMETER_SCALAR(
+            algorithm, incomplete_algorithm::sparselib);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/factorization/incomplete_factorization.hpp b/include/ginkgo/core/factorization/incomplete_factorization.hpp
index 57f1ae80f84..85831f2c7e9 100644
--- a/include/ginkgo/core/factorization/incomplete_factorization.hpp
+++ b/include/ginkgo/core/factorization/incomplete_factorization.hpp
@@ -16,7 +16,7 @@ namespace factorization {
  * `syncfree` is Ginkgo's implementation by using the Lu/Cholesky factorization
  * components with given sparsity.
  */
-enum class incomplete_factorize_algorithm { sparselib, syncfree };
+enum class incomplete_algorithm { sparselib, syncfree };
 
 
 }  // namespace factorization
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 22e03b2fe75..1593da136a4 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -192,8 +192,7 @@ TYPED_TEST(Ic, GenerateGeneralBySyncfree)
 
     auto fact =
         factorization_type::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec)
             ->generate(this->mtx_system);
 
@@ -224,8 +223,7 @@ TYPED_TEST(Ic, GenerateIcWithBitmapIsEquivalentToRefBySyncfree)
     auto result_lt = gko::as<Csr>(result_l->conj_transpose());
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
 
     auto ic = factory->generate(mtx);
@@ -272,8 +270,7 @@ TYPED_TEST(Ic, GenerateIcWithHashmapIsEquivalentToRefBySyncfree)
     auto result_lt = gko::as<Csr>(result_l->conj_transpose());
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
 
     auto ic = factory->generate(mtx);
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index 1810c44f0a0..aaeb44382f1 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -357,8 +357,7 @@ TYPED_TEST(Ilu, GenerateForCsrSmallBySyncfree)
     using ilu_type = typename TestFixture::ilu_type;
     auto factors =
         ilu_type::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec)
             ->generate(this->mtx_csr_small);
     auto l_factor = factors->get_l_factor();
@@ -404,8 +403,7 @@ TYPED_TEST(Ilu, GenerateIluWithBitmapIsEquivalentToRefBySyncfree)
     result_u->read(result_u_data);
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
@@ -449,8 +447,7 @@ TYPED_TEST(Ilu, GenerateIluWithHashmapIsEquivalentToRefBySyncfree)
     result_u->read(result_u_data);
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
 
     auto lu = factory->generate(mtx);
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index e45124ea34d..d29bbe543e4 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -9,6 +9,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/ic.hpp>
 #include <ginkgo/core/factorization/par_ic.hpp>
@@ -37,24 +38,6 @@ class Ic : public CommonTestFixture {
 };
 
 
-TEST_F(Ic, ComputeICIsEquivalentToRefSorted)
-{
-    auto fact = gko::factorization::Ic<>::build()
-                    .with_skip_sorting(true)
-                    .on(ref)
-                    ->generate(mtx);
-    auto dfact = gko::factorization::Ic<>::build()
-                     .with_skip_sorting(true)
-                     .on(exec)
-                     ->generate(dmtx);
-
-    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
-    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), dfact->get_lt_factor(), 1e-14);
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_lt_factor(), dfact->get_lt_factor());
-}
-
-
 TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
 {
     auto fact = gko::factorization::Ic<>::build()
@@ -64,8 +47,7 @@ TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
     auto dfact =
         gko::factorization::Ic<>::build()
             .with_skip_sorting(true)
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(exec)
             ->generate(dmtx);
 
@@ -76,21 +58,6 @@ TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
 }
 
 
-TEST_F(Ic, ComputeICIsEquivalentToRefUnsorted)
-{
-    gko::test::unsort_matrix(mtx, rand_engine);
-    dmtx->copy_from(mtx);
-
-    auto fact = gko::factorization::Ic<>::build().on(ref)->generate(mtx);
-    auto dfact = gko::factorization::Ic<>::build().on(exec)->generate(dmtx);
-
-    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
-    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), dfact->get_lt_factor(), 1e-14);
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_lt_factor(), dfact->get_lt_factor());
-}
-
-
 TEST_F(Ic, ComputeICWithBitmapIsEquivalentToRefBySyncfree)
 {
     // diag + full first row and column
@@ -104,13 +71,11 @@ TEST_F(Ic, ComputeICWithBitmapIsEquivalentToRefBySyncfree)
 
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec);
 
     auto ic = factory->generate(mtx);
@@ -149,13 +114,11 @@ TEST_F(Ic, ComputeICWithHashmapIsEquivalentToRefBySyncfree)
     auto dmtx = gko::share(mtx->clone(this->exec));
     auto factory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ic<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec);
 
     auto ic = factory->generate(mtx);
@@ -172,11 +135,64 @@ TEST_F(Ic, ComputeICWithHashmapIsEquivalentToRefBySyncfree)
 
 TEST_F(Ic, SetsCorrectStrategy)
 {
+    auto dfact =
+        gko::factorization::Ic<>::build()
+            .with_l_strategy(std::make_shared<Csr::merge_path>())
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
+            .on(exec)
+            ->generate(dmtx);
+
+    ASSERT_EQ(dfact->get_l_factor()->get_strategy()->get_name(), "merge_path");
+    ASSERT_EQ(dfact->get_lt_factor()->get_strategy()->get_name(), "merge_path");
+}
+
+
+#ifdef GKO_COMPILING_OMP
+
+
+TEST_F(Ic, OmpComputeICBySparselibShouldThrow)
+{
+    ASSERT_THROW(gko::factorization::Ic<>::build()
+                     .with_skip_sorting(true)
+                     .on(exec)
+                     ->generate(dmtx),
+                 gko::InvalidStateError);
+}
+
+
+#else
+
+
+TEST_F(Ic, ComputeICIsEquivalentToRefSorted)
+{
+    auto fact = gko::factorization::Ic<>::build()
+                    .with_skip_sorting(true)
+                    .on(ref)
+                    ->generate(mtx);
     auto dfact = gko::factorization::Ic<>::build()
-                     .with_l_strategy(std::make_shared<Csr::merge_path>())
+                     .with_skip_sorting(true)
                      .on(exec)
                      ->generate(dmtx);
 
-    ASSERT_EQ(dfact->get_l_factor()->get_strategy()->get_name(), "merge_path");
-    ASSERT_EQ(dfact->get_lt_factor()->get_strategy()->get_name(), "merge_path");
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), dfact->get_lt_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_lt_factor(), dfact->get_lt_factor());
+}
+
+
+TEST_F(Ic, ComputeICIsEquivalentToRefUnsorted)
+{
+    gko::test::unsort_matrix(mtx, rand_engine);
+    dmtx->copy_from(mtx);
+
+    auto fact = gko::factorization::Ic<>::build().on(ref)->generate(mtx);
+    auto dfact = gko::factorization::Ic<>::build().on(exec)->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), dfact->get_lt_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_lt_factor(), dfact->get_lt_factor());
 }
+
+#endif
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index 8f9d9f3bb02..c08c59eeb20 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -9,6 +9,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/factorization/ilu.hpp>
 #include <ginkgo/core/factorization/par_ilu.hpp>
@@ -37,24 +38,6 @@ class Ilu : public CommonTestFixture {
 };
 
 
-TEST_F(Ilu, ComputeILUIsEquivalentToRefSorted)
-{
-    auto fact = gko::factorization::Ilu<>::build()
-                    .with_skip_sorting(true)
-                    .on(ref)
-                    ->generate(mtx);
-    auto dfact = gko::factorization::Ilu<>::build()
-                     .with_skip_sorting(true)
-                     .on(exec)
-                     ->generate(dmtx);
-
-    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
-    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), dfact->get_u_factor(), 1e-14);
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_u_factor(), dfact->get_u_factor());
-}
-
-
 TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
 {
     auto fact = gko::factorization::Ilu<>::build()
@@ -64,8 +47,7 @@ TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
     auto dfact =
         gko::factorization::Ilu<>::build()
             .with_skip_sorting(true)
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(exec)
             ->generate(dmtx);
 
@@ -76,21 +58,6 @@ TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
 }
 
 
-TEST_F(Ilu, ComputeILUIsEquivalentToRefUnsorted)
-{
-    gko::test::unsort_matrix(mtx, rand_engine);
-    dmtx->copy_from(mtx);
-
-    auto fact = gko::factorization::Ilu<>::build().on(ref)->generate(mtx);
-    auto dfact = gko::factorization::Ilu<>::build().on(exec)->generate(dmtx);
-
-    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
-    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), dfact->get_u_factor(), 1e-14);
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
-    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_u_factor(), dfact->get_u_factor());
-}
-
-
 TEST_F(Ilu, ComputeILUWithBitmapIsEquivalentToRefBySyncfree)
 {
     // diag + full first row and column
@@ -104,13 +71,11 @@ TEST_F(Ilu, ComputeILUWithBitmapIsEquivalentToRefBySyncfree)
 
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec);
 
     auto ilu = factory->generate(mtx);
@@ -143,13 +108,11 @@ TEST_F(Ilu, ComputeILUWithHashmapIsEquivalentToRefBySyncfree)
     auto dmtx = gko::share(mtx->clone(this->exec));
     auto factory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->ref);
     auto dfactory =
         gko::factorization::Ilu<value_type, index_type>::build()
-            .with_algorithm(
-                gko::factorization::incomplete_factorize_algorithm::syncfree)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
             .on(this->exec);
 
     auto ilu = factory->generate(mtx);
@@ -166,15 +129,17 @@ TEST_F(Ilu, ComputeILUWithHashmapIsEquivalentToRefBySyncfree)
 
 TEST_F(Ilu, SetsCorrectStrategy)
 {
-    auto dfact = gko::factorization::Ilu<>::build()
-                     .with_l_strategy(std::make_shared<Csr::merge_path>())
+    auto dfact =
+        gko::factorization::Ilu<>::build()
+            .with_l_strategy(std::make_shared<Csr::merge_path>())
 #ifdef GKO_COMPILING_OMP
-                     .with_u_strategy(std::make_shared<Csr::merge_path>())
+            .with_u_strategy(std::make_shared<Csr::merge_path>())
 #else
-                     .with_u_strategy(std::make_shared<Csr::load_balance>(exec))
+            .with_u_strategy(std::make_shared<Csr::load_balance>(exec))
 #endif
-                     .on(exec)
-                     ->generate(dmtx);
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
+            .on(exec)
+            ->generate(dmtx);
 
     ASSERT_EQ(dfact->get_l_factor()->get_strategy()->get_name(), "merge_path");
 #ifdef GKO_COMPILING_OMP
@@ -184,3 +149,55 @@ TEST_F(Ilu, SetsCorrectStrategy)
               "load_balance");
 #endif
 }
+
+
+#ifdef GKO_COMPILING_OMP
+
+
+TEST_F(Ilu, OmpComputeILUBySparselibShouldThrow)
+{
+    ASSERT_THROW(gko::factorization::Ilu<>::build()
+                     .with_skip_sorting(true)
+                     .on(exec)
+                     ->generate(dmtx),
+                 gko::InvalidStateError);
+}
+
+
+#else
+
+
+TEST_F(Ilu, ComputeILUIsEquivalentToRefSorted)
+{
+    auto fact = gko::factorization::Ilu<>::build()
+                    .with_skip_sorting(true)
+                    .on(ref)
+                    ->generate(mtx);
+    auto dfact = gko::factorization::Ilu<>::build()
+                     .with_skip_sorting(true)
+                     .on(exec)
+                     ->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), dfact->get_u_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_u_factor(), dfact->get_u_factor());
+}
+
+
+TEST_F(Ilu, ComputeILUIsEquivalentToRefUnsorted)
+{
+    gko::test::unsort_matrix(mtx, rand_engine);
+    dmtx->copy_from(mtx);
+
+    auto fact = gko::factorization::Ilu<>::build().on(ref)->generate(mtx);
+    auto dfact = gko::factorization::Ilu<>::build().on(exec)->generate(dmtx);
+
+    GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), dfact->get_l_factor(), 1e-14);
+    GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), dfact->get_u_factor(), 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_l_factor(), dfact->get_l_factor());
+    GKO_ASSERT_MTX_EQ_SPARSITY(fact->get_u_factor(), dfact->get_u_factor());
+}
+
+
+#endif

From 4e0ac80ea439ce33bcaacf88669e4587b8106240 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Nov 2024 10:32:14 +0100
Subject: [PATCH 339/448] fix ReferenceExecutor also being treated as OmpExecutor

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/factorization/ic.cpp          |  3 ++-
 core/factorization/ilu.cpp         |  3 ++-
 test/factorization/ic_kernels.cpp  | 10 ++++++----
 test/factorization/ilu_kernels.cpp | 10 ++++++----
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index 818d0586b64..bf9d5e7bbf4 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -160,7 +160,8 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
             transpose_idxs.get_const_data(), *forest, factors.get(), false,
             tmp));
         ic = factors;
-    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec)) {
+    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec) &&
+               !std::dynamic_pointer_cast<const ReferenceExecutor>(exec)) {
         GKO_INVALID_STATE(
             "OmpExecutor does not support sparselib algorithm. Please use "
             "syncfree algorithm.");
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 1ef0afa9ef3..f7703f3d20b 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -144,7 +144,8 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
             storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
             false, tmp));
         ilu = factors;
-    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec)) {
+    } else if (std::dynamic_pointer_cast<const OmpExecutor>(exec) &&
+               !std::dynamic_pointer_cast<const ReferenceExecutor>(exec)) {
         GKO_INVALID_STATE(
             "OmpExecutor does not support sparselib algorithm. Please use "
             "syncfree algorithm.");
diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp
index d29bbe543e4..51c73ae3272 100644
--- a/test/factorization/ic_kernels.cpp
+++ b/test/factorization/ic_kernels.cpp
@@ -40,10 +40,12 @@ class Ic : public CommonTestFixture {
 
 TEST_F(Ic, ComputeICBySyncfreeIsEquivalentToRefSorted)
 {
-    auto fact = gko::factorization::Ic<>::build()
-                    .with_skip_sorting(true)
-                    .on(ref)
-                    ->generate(mtx);
+    auto fact =
+        gko::factorization::Ic<>::build()
+            .with_skip_sorting(true)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
+            .on(ref)
+            ->generate(mtx);
     auto dfact =
         gko::factorization::Ic<>::build()
             .with_skip_sorting(true)
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp
index c08c59eeb20..6f119d63bac 100644
--- a/test/factorization/ilu_kernels.cpp
+++ b/test/factorization/ilu_kernels.cpp
@@ -40,10 +40,12 @@ class Ilu : public CommonTestFixture {
 
 TEST_F(Ilu, ComputeILUBySyncfreeIsEquivalentToRefSorted)
 {
-    auto fact = gko::factorization::Ilu<>::build()
-                    .with_skip_sorting(true)
-                    .on(ref)
-                    ->generate(mtx);
+    auto fact =
+        gko::factorization::Ilu<>::build()
+            .with_skip_sorting(true)
+            .with_algorithm(gko::factorization::incomplete_algorithm::syncfree)
+            .on(ref)
+            ->generate(mtx);
     auto dfact =
         gko::factorization::Ilu<>::build()
             .with_skip_sorting(true)

From b73753c185bbbecef0e8f87b3f0a036b7a5f8041 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 28 Nov 2024 14:33:41 +0100
Subject: [PATCH 340/448] use if constexpr where possible and reverse
 if-else branch order

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 .../factorization/cholesky_kernels.cpp        | 10 ++--
 common/cuda_hip/factorization/lu_kernels.cpp  | 16 +++---
 omp/factorization/cholesky_kernels.cpp        | 51 ++++++++++++++-----
 omp/factorization/lu_kernels.cpp              | 43 ++++++++++++----
 reference/factorization/cholesky_kernels.cpp  | 51 ++++++++++++++-----
 reference/factorization/lu_kernels.cpp        | 43 ++++++++++++----
 6 files changed, 157 insertions(+), 57 deletions(-)

diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
index 87d69a5db83..7ff1382d8c6 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.cpp
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -202,15 +202,15 @@ __global__ __launch_bounds__(default_block_size) void factorize(
             const auto upper_col = cols[upper_nz];
             if (upper_col >= row) {
                 const auto upper_val = vals[upper_nz];
-                if (!full_fillin) {
+                if constexpr (full_fillin) {
+                    const auto output_pos =
+                        lookup.lookup_unsafe(upper_col) + row_begin;
+                    vals[output_pos] -= scale * upper_val;
+                } else {
                     const auto pos = lookup[upper_col];
                     if (pos != invalid_index<IndexType>()) {
                         vals[row_begin + pos] -= scale * upper_val;
                     }
-                } else {
-                    const auto output_pos =
-                        lookup.lookup_unsafe(upper_col) + row_begin;
-                    vals[output_pos] -= scale * upper_val;
                 }
             }
         }
diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index b6c2207f4c3..b0d54e44217 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -130,15 +130,15 @@ __global__ __launch_bounds__(default_block_size) void factorize(
              upper_nz += config::warp_size) {
             const auto upper_col = cols[upper_nz];
             const auto upper_val = vals[upper_nz];
-            if (!full_fillin) {
+            if constexpr (full_fillin) {
+                const auto output_pos =
+                    lookup.lookup_unsafe(upper_col) + row_begin;
+                vals[output_pos] -= scale * upper_val;
+            } else {
                 const auto pos = lookup[upper_col];
                 if (pos != invalid_index<IndexType>()) {
                     vals[row_begin + pos] -= scale * upper_val;
                 }
-            } else {
-                const auto output_pos =
-                    lookup.lookup_unsafe(upper_col) + row_begin;
-                vals[output_pos] -= scale * upper_val;
             }
         }
     }
@@ -268,15 +268,15 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         syncfree_storage storage(exec, tmp_storage, num_rows);
         const auto num_blocks =
             ceildiv(num_rows, default_block_size / config::warp_size);
-        if (!full_fillin) {
-            kernel::factorize<false>
+        if (full_fillin) {
+            kernel::factorize<true>
                 <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                     factors->get_const_row_ptrs(),
                     factors->get_const_col_idxs(), lookup_offsets,
                     lookup_storage, lookup_descs, diag_idxs,
                     as_device_type(factors->get_values()), storage, num_rows);
         } else {
-            kernel::factorize<true>
+            kernel::factorize<false>
                 <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                     factors->get_const_row_ptrs(),
                     factors->get_const_col_idxs(), lookup_offsets,
diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp
index 9c39085c98f..aa4aabfc731 100644
--- a/omp/factorization/cholesky_kernels.cpp
+++ b/omp/factorization/cholesky_kernels.cpp
@@ -204,14 +204,17 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
-template <typename ValueType, typename IndexType>
-void factorize(std::shared_ptr<const DefaultExecutor> exec,
-               const IndexType* lookup_offsets, const int64* lookup_descs,
-               const int32* lookup_storage, const IndexType* diag_idxs,
-               const IndexType* transpose_idxs,
-               const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
-               array<int>& tmp_storage)
+namespace {
+
+
+template <bool full_fillin, typename ValueType, typename IndexType>
+void factorize_impl(std::shared_ptr<const DefaultExecutor> exec,
+                    const IndexType* lookup_offsets, const int64* lookup_descs,
+                    const int32* lookup_storage, const IndexType* diag_idxs,
+                    const IndexType* transpose_idxs,
+                    const factorization::elimination_forest<IndexType>& forest,
+                    matrix::Csr<ValueType, IndexType>* factors,
+                    array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
     const auto row_ptrs = factors->get_const_row_ptrs();
@@ -233,14 +236,14 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                 const auto col = cols[dep_nz];
                 if (col < row) {
                     const auto val = vals[dep_nz];
-                    if (!full_fillin) {
+                    if constexpr (full_fillin) {
+                        const auto nz = row_begin + lookup.lookup_unsafe(col);
+                        vals[nz] -= scale * val;
+                    } else {
                         const auto idx = lookup[col];
                         if (idx != invalid_index<IndexType>()) {
                             vals[row_begin + idx] -= scale * val;
                         }
-                    } else {
-                        const auto nz = row_begin + lookup.lookup_unsafe(col);
-                        vals[nz] -= scale * val;
                     }
                 }
             }
@@ -255,6 +258,30 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void factorize(std::shared_ptr<const DefaultExecutor> exec,
+               const IndexType* lookup_offsets, const int64* lookup_descs,
+               const int32* lookup_storage, const IndexType* diag_idxs,
+               const IndexType* transpose_idxs,
+               const factorization::elimination_forest<IndexType>& forest,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
+               array<int>& tmp_storage)
+{
+    if (full_fillin) {
+        factorize_impl<true>(exec, lookup_offsets, lookup_descs, lookup_storage,
+                             diag_idxs, transpose_idxs, forest, factors,
+                             tmp_storage);
+    } else {
+        factorize_impl<false>(exec, lookup_offsets, lookup_descs,
+                              lookup_storage, diag_idxs, transpose_idxs, forest,
+                              factors, tmp_storage);
+    }
+}
+
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index de6876f1487..4b13f9a352c 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -62,12 +62,15 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
-template <typename ValueType, typename IndexType>
-void factorize(std::shared_ptr<const DefaultExecutor> exec,
-               const IndexType* lookup_offsets, const int64* lookup_descs,
-               const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
-               array<int>& tmp_storage)
+namespace {
+
+
+template <bool full_fillin, typename ValueType, typename IndexType>
+void factorize_impl(std::shared_ptr<const DefaultExecutor> exec,
+                    const IndexType* lookup_offsets, const int64* lookup_descs,
+                    const int32* lookup_storage, const IndexType* diag_idxs,
+                    matrix::Csr<ValueType, IndexType>* factors,
+                    array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
     const auto row_ptrs = factors->get_const_row_ptrs();
@@ -89,20 +92,40 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (!full_fillin) {
+                if constexpr (full_fillin) {
+                    const auto nz = row_begin + lookup.lookup_unsafe(col);
+                    vals[nz] -= scale * val;
+                } else {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
                     }
-                } else {
-                    const auto nz = row_begin + lookup.lookup_unsafe(col);
-                    vals[nz] -= scale * val;
                 }
             }
         }
     }
 }
 
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void factorize(std::shared_ptr<const DefaultExecutor> exec,
+               const IndexType* lookup_offsets, const int64* lookup_descs,
+               const int32* lookup_storage, const IndexType* diag_idxs,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
+               array<int>& tmp_storage)
+{
+    if (full_fillin) {
+        factorize_impl<true>(exec, lookup_offsets, lookup_descs, lookup_storage,
+                             diag_idxs, factors, tmp_storage);
+    } else {
+        factorize_impl<false>(exec, lookup_offsets, lookup_descs,
+                              lookup_storage, diag_idxs, factors, tmp_storage);
+    }
+}
+
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp
index 882d10ebd72..e4d7112a15f 100644
--- a/reference/factorization/cholesky_kernels.cpp
+++ b/reference/factorization/cholesky_kernels.cpp
@@ -175,14 +175,17 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
-template <typename ValueType, typename IndexType>
-void factorize(std::shared_ptr<const DefaultExecutor> exec,
-               const IndexType* lookup_offsets, const int64* lookup_descs,
-               const int32* lookup_storage, const IndexType* diag_idxs,
-               const IndexType* transpose_idxs,
-               const factorization::elimination_forest<IndexType>& forest,
-               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
-               array<int>& tmp_storage)
+namespace {
+
+
+template <bool full_fillin, typename ValueType, typename IndexType>
+void factorize_impl(std::shared_ptr<const DefaultExecutor> exec,
+                    const IndexType* lookup_offsets, const int64* lookup_descs,
+                    const int32* lookup_storage, const IndexType* diag_idxs,
+                    const IndexType* transpose_idxs,
+                    const factorization::elimination_forest<IndexType>& forest,
+                    matrix::Csr<ValueType, IndexType>* factors,
+                    array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
     const auto row_ptrs = factors->get_const_row_ptrs();
@@ -204,14 +207,14 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                 const auto col = cols[dep_nz];
                 if (col < row) {
                     const auto val = vals[dep_nz];
-                    if (!full_fillin) {
+                    if constexpr (full_fillin) {
+                        const auto nz = row_begin + lookup.lookup_unsafe(col);
+                        vals[nz] -= scale * val;
+                    } else {
                         const auto idx = lookup[col];
                         if (idx != invalid_index<IndexType>()) {
                             vals[row_begin + idx] -= scale * val;
                         }
-                    } else {
-                        const auto nz = row_begin + lookup.lookup_unsafe(col);
-                        vals[nz] -= scale * val;
                     }
                 }
             }
@@ -227,6 +230,30 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void factorize(std::shared_ptr<const DefaultExecutor> exec,
+               const IndexType* lookup_offsets, const int64* lookup_descs,
+               const int32* lookup_storage, const IndexType* diag_idxs,
+               const IndexType* transpose_idxs,
+               const factorization::elimination_forest<IndexType>& forest,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
+               array<int>& tmp_storage)
+{
+    if (full_fillin) {
+        factorize_impl<true>(exec, lookup_offsets, lookup_descs, lookup_storage,
+                             diag_idxs, transpose_idxs, forest, factors,
+                             tmp_storage);
+    } else {
+        factorize_impl<false>(exec, lookup_offsets, lookup_descs,
+                              lookup_storage, diag_idxs, transpose_idxs, forest,
+                              factors, tmp_storage);
+    }
+}
+
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index 33fb2f94c4b..c72b14456e1 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -61,12 +61,15 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
-template <typename ValueType, typename IndexType>
-void factorize(std::shared_ptr<const DefaultExecutor> exec,
-               const IndexType* lookup_offsets, const int64* lookup_descs,
-               const int32* lookup_storage, const IndexType* diag_idxs,
-               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
-               array<int>& tmp_storage)
+namespace {
+
+
+template <bool full_fillin, typename ValueType, typename IndexType>
+void factorize_impl(std::shared_ptr<const DefaultExecutor> exec,
+                    const IndexType* lookup_offsets, const int64* lookup_descs,
+                    const int32* lookup_storage, const IndexType* diag_idxs,
+                    matrix::Csr<ValueType, IndexType>* factors,
+                    array<int>& tmp_storage)
 {
     const auto num_rows = factors->get_size()[0];
     const auto row_ptrs = factors->get_const_row_ptrs();
@@ -87,20 +90,40 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
             for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
                 const auto col = cols[dep_nz];
                 const auto val = vals[dep_nz];
-                if (!full_fillin) {
+                if constexpr (full_fillin) {
+                    const auto nz = row_begin + lookup.lookup_unsafe(col);
+                    vals[nz] -= scale * val;
+                } else {
                     const auto idx = lookup[col];
                     if (idx != invalid_index<IndexType>()) {
                         vals[row_begin + idx] -= scale * val;
                     }
-                } else {
-                    const auto nz = row_begin + lookup.lookup_unsafe(col);
-                    vals[nz] -= scale * val;
                 }
             }
         }
     }
 }
 
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void factorize(std::shared_ptr<const DefaultExecutor> exec,
+               const IndexType* lookup_offsets, const int64* lookup_descs,
+               const int32* lookup_storage, const IndexType* diag_idxs,
+               matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
+               array<int>& tmp_storage)
+{
+    if (full_fillin) {
+        factorize_impl<true>(exec, lookup_offsets, lookup_descs, lookup_storage,
+                             diag_idxs, factors, tmp_storage);
+    } else {
+        factorize_impl<false>(exec, lookup_offsets, lookup_descs,
+                              lookup_storage, diag_idxs, factors, tmp_storage);
+    }
+}
+
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 

From 9c072cc9a003072080296614952294b91ead821d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 22 Oct 2024 12:05:26 +0200
Subject: [PATCH 341/448] half base type

---
 core/base/extended_float.hpp      | 393 +-----------------
 core/test/base/extended_float.cpp |  41 +-
 include/ginkgo/core/base/half.hpp | 669 ++++++++++++++++++++++++++++++
 include/ginkgo/ginkgo.hpp         |   1 +
 4 files changed, 707 insertions(+), 397 deletions(-)
 create mode 100644 include/ginkgo/core/base/half.hpp

diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp
index c14b5d1bd39..dd7d46c363d 100644
--- a/core/base/extended_float.hpp
+++ b/core/base/extended_float.hpp
@@ -9,6 +9,7 @@
 #include <limits>
 #include <type_traits>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
@@ -30,341 +31,6 @@
 namespace gko {
 
 
-template <typename, size_type, size_type>
-class truncated;
-
-
-namespace detail {
-
-
-template <std::size_t, typename = void>
-struct uint_of_impl {};
-
-template <std::size_t Bits>
-struct uint_of_impl<Bits, std::enable_if_t<(Bits <= 16)>> {
-    using type = uint16;
-};
-
-template <std::size_t Bits>
-struct uint_of_impl<Bits, std::enable_if_t<(16 < Bits && Bits <= 32)>> {
-    using type = uint32;
-};
-
-template <std::size_t Bits>
-struct uint_of_impl<Bits, std::enable_if_t<(32 < Bits)>> {
-    using type = uint64;
-};
-
-template <std::size_t Bits>
-using uint_of = typename uint_of_impl<Bits>::type;
-
-
-template <typename T>
-struct basic_float_traits {};
-
-template <>
-struct basic_float_traits<float16> {
-    using type = float16;
-    static constexpr int sign_bits = 1;
-    static constexpr int significand_bits = 10;
-    static constexpr int exponent_bits = 5;
-    static constexpr bool rounds_to_nearest = true;
-};
-
-template <>
-struct basic_float_traits<float32> {
-    using type = float32;
-    static constexpr int sign_bits = 1;
-    static constexpr int significand_bits = 23;
-    static constexpr int exponent_bits = 8;
-    static constexpr bool rounds_to_nearest = true;
-};
-
-template <>
-struct basic_float_traits<float64> {
-    using type = float64;
-    static constexpr int sign_bits = 1;
-    static constexpr int significand_bits = 52;
-    static constexpr int exponent_bits = 11;
-    static constexpr bool rounds_to_nearest = true;
-};
-
-template <typename FloatType, size_type NumComponents, size_type ComponentId>
-struct basic_float_traits<truncated<FloatType, NumComponents, ComponentId>> {
-    using type = truncated<FloatType, NumComponents, ComponentId>;
-    static constexpr int sign_bits = ComponentId == 0 ? 1 : 0;
-    static constexpr int exponent_bits =
-        ComponentId == 0 ? basic_float_traits<FloatType>::exponent_bits : 0;
-    static constexpr int significand_bits =
-        ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1
-                         : sizeof(type) * byte_size;
-    static constexpr bool rounds_to_nearest = false;
-};
-
-
-template <typename UintType>
-constexpr UintType create_ones(int n)
-{
-    return (n == sizeof(UintType) * byte_size ? static_cast<UintType>(0)
-                                              : static_cast<UintType>(1) << n) -
-           static_cast<UintType>(1);
-}
-
-template <typename T>
-struct float_traits {
-    using type = typename basic_float_traits<T>::type;
-    using bits_type = uint_of<sizeof(type) * byte_size>;
-    static constexpr int sign_bits = basic_float_traits<T>::sign_bits;
-    static constexpr int significand_bits =
-        basic_float_traits<T>::significand_bits;
-    static constexpr int exponent_bits = basic_float_traits<T>::exponent_bits;
-    static constexpr bits_type significand_mask =
-        create_ones<bits_type>(significand_bits);
-    static constexpr bits_type exponent_mask =
-        create_ones<bits_type>(significand_bits + exponent_bits) -
-        significand_mask;
-    static constexpr bits_type bias_mask =
-        create_ones<bits_type>(significand_bits + exponent_bits - 1) -
-        significand_mask;
-    static constexpr bits_type sign_mask =
-        create_ones<bits_type>(sign_bits + significand_bits + exponent_bits) -
-        exponent_mask - significand_mask;
-    static constexpr bool rounds_to_nearest =
-        basic_float_traits<T>::rounds_to_nearest;
-
-    static constexpr auto eps =
-        1.0 / (1ll << (significand_bits + rounds_to_nearest));
-
-    static constexpr bool is_inf(bits_type data)
-    {
-        return (data & exponent_mask) == exponent_mask &&
-               (data & significand_mask) == bits_type{};
-    }
-
-    static constexpr bool is_nan(bits_type data)
-    {
-        return (data & exponent_mask) == exponent_mask &&
-               (data & significand_mask) != bits_type{};
-    }
-
-    static constexpr bool is_denom(bits_type data)
-    {
-        return (data & exponent_mask) == bits_type{};
-    }
-};
-
-
-template <typename SourceType, typename ResultType,
-          bool = (sizeof(SourceType) <= sizeof(ResultType))>
-struct precision_converter;
-
-// upcasting implementation details
-template <typename SourceType, typename ResultType>
-struct precision_converter<SourceType, ResultType, true> {
-    using source_traits = float_traits<SourceType>;
-    using result_traits = float_traits<ResultType>;
-    using source_bits = typename source_traits::bits_type;
-    using result_bits = typename result_traits::bits_type;
-
-    static_assert(source_traits::exponent_bits <=
-                          result_traits::exponent_bits &&
-                      source_traits::significand_bits <=
-                          result_traits::significand_bits,
-                  "SourceType has to have both lower range and precision or "
-                  "higher range and precision than ResultType");
-
-    static constexpr int significand_offset =
-        result_traits::significand_bits - source_traits::significand_bits;
-    static constexpr int exponent_offset = significand_offset;
-    static constexpr int sign_offset = result_traits::exponent_bits -
-                                       source_traits::exponent_bits +
-                                       exponent_offset;
-    static constexpr result_bits bias_change =
-        result_traits::bias_mask -
-        (static_cast<result_bits>(source_traits::bias_mask) << exponent_offset);
-
-    static constexpr result_bits shift_significand(source_bits data) noexcept
-    {
-        return static_cast<result_bits>(data & source_traits::significand_mask)
-               << significand_offset;
-    }
-
-    static constexpr result_bits shift_exponent(source_bits data) noexcept
-    {
-        return update_bias(
-            static_cast<result_bits>(data & source_traits::exponent_mask)
-            << exponent_offset);
-    }
-
-    static constexpr result_bits shift_sign(source_bits data) noexcept
-    {
-        return static_cast<result_bits>(data & source_traits::sign_mask)
-               << sign_offset;
-    }
-
-private:
-    static constexpr result_bits update_bias(result_bits data) noexcept
-    {
-        return data == typename result_traits::bits_type{} ? data
-                                                           : data + bias_change;
-    }
-};
-
-// downcasting implementation details
-template <typename SourceType, typename ResultType>
-struct precision_converter<SourceType, ResultType, false> {
-    using source_traits = float_traits<SourceType>;
-    using result_traits = float_traits<ResultType>;
-    using source_bits = typename source_traits::bits_type;
-    using result_bits = typename result_traits::bits_type;
-
-    static_assert(source_traits::exponent_bits >=
-                          result_traits::exponent_bits &&
-                      source_traits::significand_bits >=
-                          result_traits::significand_bits,
-                  "SourceType has to have both lower range and precision or "
-                  "higher range and precision than ResultType");
-
-    static constexpr int significand_offset =
-        source_traits::significand_bits - result_traits::significand_bits;
-    static constexpr int exponent_offset = significand_offset;
-    static constexpr int sign_offset = source_traits::exponent_bits -
-                                       result_traits::exponent_bits +
-                                       exponent_offset;
-    static constexpr source_bits bias_change =
-        (source_traits::bias_mask >> exponent_offset) -
-        static_cast<source_bits>(result_traits::bias_mask);
-
-    static constexpr result_bits shift_significand(source_bits data) noexcept
-    {
-        return static_cast<result_bits>(
-            (data & source_traits::significand_mask) >> significand_offset);
-    }
-
-    static constexpr result_bits shift_exponent(source_bits data) noexcept
-    {
-        return static_cast<result_bits>(update_bias(
-            (data & source_traits::exponent_mask) >> exponent_offset));
-    }
-
-    static constexpr result_bits shift_sign(source_bits data) noexcept
-    {
-        return static_cast<result_bits>((data & source_traits::sign_mask) >>
-                                        sign_offset);
-    }
-
-private:
-    static constexpr source_bits update_bias(source_bits data) noexcept
-    {
-        return data <= bias_change ? typename source_traits::bits_type{}
-                                   : limit_exponent(data - bias_change);
-    }
-
-    static constexpr source_bits limit_exponent(source_bits data) noexcept
-    {
-        return data >= static_cast<source_bits>(result_traits::exponent_mask)
-                   ? static_cast<source_bits>(result_traits::exponent_mask)
-                   : data;
-    }
-};
-
-
-}  // namespace detail
-
-
-/**
- * A class providing basic support for half precision floating point types.
- *
- * For now the only features are reduced storage compared to single precision
- * and conversions from and to single precision floating point type.
- */
-class half {
-public:
-    half() noexcept = default;
-
-    GKO_ATTRIBUTES half(float32 val) noexcept
-    {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-        const auto tmp = __float2half_rn(val);
-        data_ = reinterpret_cast<const uint16&>(tmp);
-#else   // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-        data_ = float2half(reinterpret_cast<const uint32&>(val));
-#endif  // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-    }
-
-    GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast<float32>(val))
-    {}
-
-    GKO_ATTRIBUTES operator float32() const noexcept
-    {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-        return __half2float(reinterpret_cast<const __half&>(data_));
-#else   // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-        const auto bits = half2float(data_);
-        return reinterpret_cast<const float32&>(bits);
-#endif  // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-    }
-
-    GKO_ATTRIBUTES operator float64() const noexcept
-    {
-        return static_cast<float64>(static_cast<float32>(*this));
-    }
-
-    GKO_ATTRIBUTES half operator-() const noexcept
-    {
-        auto res = *this;
-        // flip sign bit
-        res.data_ ^= f16_traits::sign_mask;
-        return res;
-    }
-
-private:
-    using f16_traits = detail::float_traits<float16>;
-    using f32_traits = detail::float_traits<float32>;
-
-    static uint16 float2half(uint32 data_) noexcept
-    {
-        using conv = detail::precision_converter<float32, float16>;
-        if (f32_traits::is_inf(data_)) {
-            return conv::shift_sign(data_) | f16_traits::exponent_mask;
-        } else if (f32_traits::is_nan(data_)) {
-            return conv::shift_sign(data_) | f16_traits::exponent_mask |
-                   f16_traits::significand_mask;
-        } else {
-            const auto exp = conv::shift_exponent(data_);
-            if (f16_traits::is_inf(exp)) {
-                return conv::shift_sign(data_) | exp;
-            } else if (f16_traits::is_denom(exp)) {
-                // TODO: handle denormals
-                return conv::shift_sign(data_);
-            } else {
-                return conv::shift_sign(data_) | exp |
-                       conv::shift_significand(data_);
-            }
-        }
-    }
-
-    static uint32 half2float(uint16 data_) noexcept
-    {
-        using conv = detail::precision_converter<float16, float32>;
-        if (f16_traits::is_inf(data_)) {
-            return conv::shift_sign(data_) | f32_traits::exponent_mask;
-        } else if (f16_traits::is_nan(data_)) {
-            return conv::shift_sign(data_) | f32_traits::exponent_mask |
-                   f32_traits::significand_mask;
-        } else if (f16_traits::is_denom(data_)) {
-            // TODO: handle denormals
-            return conv::shift_sign(data_);
-        } else {
-            return conv::shift_sign(data_) | conv::shift_exponent(data_) |
-                   conv::shift_significand(data_);
-        }
-    }
-
-    uint16 data_;
-};
-
-
 /**
  * This template implements the truncated (or split) storage of a floating point
  * type.
@@ -458,38 +124,6 @@ class truncated {
 namespace std {
 
 
-template <>
-class complex<gko::half> {
-public:
-    using value_type = gko::half;
-
-    complex(const value_type& real = 0.f, const value_type& imag = 0.f)
-        : real_(real), imag_(imag)
-    {}
-
-    template <typename U>
-    explicit complex(const complex<U>& other)
-        : complex(static_cast<value_type>(other.real()),
-                  static_cast<value_type>(other.imag()))
-    {}
-
-    value_type real() const noexcept { return real_; }
-
-    value_type imag() const noexcept { return imag_; }
-
-
-    operator std::complex<gko::float32>() const noexcept
-    {
-        return std::complex<gko::float32>(static_cast<gko::float32>(real_),
-                                          static_cast<gko::float32>(imag_));
-    }
-
-private:
-    value_type real_;
-    value_type imag_;
-};
-
-
 template <typename T, gko::size_type NumComponents>
 class complex<gko::truncated<T, NumComponents>> {
 public:
@@ -521,31 +155,6 @@ class complex<gko::truncated<T, NumComponents>> {
 };
 
 
-template <>
-struct is_scalar<gko::half> : std::true_type {};
-
-
-template <>
-struct numeric_limits<gko::half> {
-    static constexpr bool is_specialized{true};
-    static constexpr bool is_signed{true};
-    static constexpr bool is_integer{false};
-    static constexpr bool is_exact{false};
-    static constexpr bool is_bounded{true};
-    static constexpr bool is_modulo{false};
-    static constexpr int digits{
-        gko::detail::float_traits<gko::half>::significand_bits + 1};
-    // 3/10 is approx. log_10(2)
-    static constexpr int digits10{digits * 3 / 10};
-
-    // Note: gko::half can't return gko::half here because it does not have
-    //       a constexpr constructor.
-    static constexpr float epsilon()
-    {
-        return gko::detail::float_traits<gko::half>::eps;
-    }
-};
-
 }  // namespace std
 
 
diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp
index 6148c7c350a..818843baa38 100644
--- a/core/test/base/extended_float.cpp
+++ b/core/test/base/extended_float.cpp
@@ -9,6 +9,8 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/half.hpp>
+
 
 namespace {
 
@@ -110,7 +112,12 @@ TEST_F(FloatToHalf, ConvertsNan)
 {
     half x = create_from_bits("0" "11111111" "00000000000000000000001");
 
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // SYCL produces the significand 1000000000, but ours sets the full mask
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000"));
+    #else
     ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111"));
+    #endif
 }
 
 
@@ -118,7 +125,12 @@ TEST_F(FloatToHalf, ConvertsNegNan)
 {
     half x = create_from_bits("1" "11111111" "00010000000000000000000");
 
+    #if defined(SYCL_LANGUAGE_VERSION)
+    // SYCL produces the significand 1000000000, but ours sets the full mask
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000"));
+    #else
     ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111"));
+    #endif
 }
 
 
@@ -162,12 +174,21 @@ TEST_F(FloatToHalf, TruncatesSmallNumber)
 }
 
 
-TEST_F(FloatToHalf, TruncatesLargeNumber)
+TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven)
 {
-    half x = create_from_bits("1" "10001110" "10010011111000010000100");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111"));
-
+    half neg_x = create_from_bits("1" "10001110" "10010011111000010000100");
+    half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100");
+    half x = create_from_bits("0" "10001110" "10010011111000010000100");
+    half x2 = create_from_bits("0" "10001110" "10010011101000010000100");
+    half x3 = create_from_bits("0" "10001110" "10010011101000000000000");
+    half x4 = create_from_bits("0" "10001110" "10010011111000000000000");
+
+    EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111"));
+    EXPECT_EQ(get_bits(x3), get_bits("0" "11110" "1001001110"));
+    EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111"));
 }
 
 
@@ -216,7 +237,12 @@ TEST_F(HalfToFloat, ConvertsNan)
 {
     float x = create_from_bits("0" "11111" "0001001000");
 
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // SYCL keeps the significand of the NaN, but ours sets the full mask
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000"));
+    #else
     ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111"));
+    #endif
 }
 
 
@@ -224,7 +250,12 @@ TEST_F(HalfToFloat, ConvertsNegNan)
 {
     float x = create_from_bits("1" "11111" "0000000001");
 
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // SYCL keeps the significand of the NaN, but ours sets the full mask
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000"));
+    #else
     ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111"));
+    #endif
 }
 
 
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
new file mode 100644
index 00000000000..25a38abb6eb
--- /dev/null
+++ b/include/ginkgo/core/base/half.hpp
@@ -0,0 +1,669 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_BASE_HALF_HPP_
+#define GKO_PUBLIC_CORE_BASE_HALF_HPP_
+
+
+#include <complex>
+#include <cstring>
+#include <type_traits>
+
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+class __half;
+
+
+namespace gko {
+
+
+template <typename, size_type, size_type>
+class truncated;
+
+
+namespace detail {
+
+
+template <std::size_t, typename = void>
+struct uint_of_impl {};
+
+template <std::size_t Bits>
+struct uint_of_impl<Bits, std::enable_if_t<(Bits <= 16)>> {
+    using type = uint16;
+};
+
+template <std::size_t Bits>
+struct uint_of_impl<Bits, std::enable_if_t<(16 < Bits && Bits <= 32)>> {
+    using type = uint32;
+};
+
+template <std::size_t Bits>
+struct uint_of_impl<Bits, std::enable_if_t<(32 < Bits) && (Bits <= 64)>> {
+    using type = uint64;
+};
+
+template <std::size_t Bits>
+using uint_of = typename uint_of_impl<Bits>::type;
+
+
+template <typename T>
+struct basic_float_traits {};
+
+template <>
+struct basic_float_traits<float16> {
+    using type = float16;
+    static constexpr int sign_bits = 1;
+    static constexpr int significand_bits = 10;
+    static constexpr int exponent_bits = 5;
+    static constexpr bool rounds_to_nearest = true;
+};
+
+template <>
+struct basic_float_traits<__half> {
+    using type = __half;
+    static constexpr int sign_bits = 1;
+    static constexpr int significand_bits = 10;
+    static constexpr int exponent_bits = 5;
+    static constexpr bool rounds_to_nearest = true;
+};
+
+template <>
+struct basic_float_traits<float32> {
+    using type = float32;
+    static constexpr int sign_bits = 1;
+    static constexpr int significand_bits = 23;
+    static constexpr int exponent_bits = 8;
+    static constexpr bool rounds_to_nearest = true;
+};
+
+template <>
+struct basic_float_traits<float64> {
+    using type = float64;
+    static constexpr int sign_bits = 1;
+    static constexpr int significand_bits = 52;
+    static constexpr int exponent_bits = 11;
+    static constexpr bool rounds_to_nearest = true;
+};
+
+template <typename FloatType, size_type NumComponents, size_type ComponentId>
+struct basic_float_traits<truncated<FloatType, NumComponents, ComponentId>> {
+    using type = truncated<FloatType, NumComponents, ComponentId>;
+    static constexpr int sign_bits = ComponentId == 0 ? 1 : 0;
+    static constexpr int exponent_bits =
+        ComponentId == 0 ? basic_float_traits<FloatType>::exponent_bits : 0;
+    static constexpr int significand_bits =
+        ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1
+                         : sizeof(type) * byte_size;
+    static constexpr bool rounds_to_nearest = false;
+};
+
+
+template <typename UintType>
+constexpr UintType create_ones(int n)
+{
+    return (n == sizeof(UintType) * byte_size ? static_cast<UintType>(0)
+                                              : static_cast<UintType>(1) << n) -
+           static_cast<UintType>(1);
+}
+
+
+template <typename T>
+struct float_traits {
+    using type = typename basic_float_traits<T>::type;
+    using bits_type = uint_of<sizeof(type) * byte_size>;
+    static constexpr int sign_bits = basic_float_traits<T>::sign_bits;
+    static constexpr int significand_bits =
+        basic_float_traits<T>::significand_bits;
+    static constexpr int exponent_bits = basic_float_traits<T>::exponent_bits;
+    static constexpr bits_type significand_mask =
+        create_ones<bits_type>(significand_bits);
+    static constexpr bits_type exponent_mask =
+        create_ones<bits_type>(significand_bits + exponent_bits) -
+        significand_mask;
+    static constexpr bits_type bias_mask =
+        create_ones<bits_type>(significand_bits + exponent_bits - 1) -
+        significand_mask;
+    static constexpr bits_type sign_mask =
+        create_ones<bits_type>(sign_bits + significand_bits + exponent_bits) -
+        exponent_mask - significand_mask;
+    static constexpr bool rounds_to_nearest =
+        basic_float_traits<T>::rounds_to_nearest;
+
+    static constexpr auto eps =
+        1.0 / (1ll << (significand_bits + rounds_to_nearest));
+
+    static constexpr bool is_inf(bits_type data)
+    {
+        return (data & exponent_mask) == exponent_mask &&
+               (data & significand_mask) == bits_type{};
+    }
+
+    static constexpr bool is_nan(bits_type data)
+    {
+        return (data & exponent_mask) == exponent_mask &&
+               (data & significand_mask) != bits_type{};
+    }
+
+    static constexpr bool is_denom(bits_type data)
+    {
+        return (data & exponent_mask) == bits_type{};
+    }
+};
+
+
+template <typename SourceType, typename ResultType,
+          bool = (sizeof(SourceType) <= sizeof(ResultType))>
+struct precision_converter;
+
+// upcasting implementation details
+template <typename SourceType, typename ResultType>
+struct precision_converter<SourceType, ResultType, true> {
+    using source_traits = float_traits<SourceType>;
+    using result_traits = float_traits<ResultType>;
+    using source_bits = typename source_traits::bits_type;
+    using result_bits = typename result_traits::bits_type;
+
+    static_assert(source_traits::exponent_bits <=
+                          result_traits::exponent_bits &&
+                      source_traits::significand_bits <=
+                          result_traits::significand_bits,
+                  "SourceType has to have both lower range and precision or "
+                  "higher range and precision than ResultType");
+
+    static constexpr int significand_offset =
+        result_traits::significand_bits - source_traits::significand_bits;
+    static constexpr int exponent_offset = significand_offset;
+    static constexpr int sign_offset = result_traits::exponent_bits -
+                                       source_traits::exponent_bits +
+                                       exponent_offset;
+    static constexpr result_bits bias_change =
+        result_traits::bias_mask -
+        (static_cast<result_bits>(source_traits::bias_mask) << exponent_offset);
+
+    static constexpr result_bits shift_significand(source_bits data) noexcept
+    {
+        return static_cast<result_bits>(data & source_traits::significand_mask)
+               << significand_offset;
+    }
+
+    static constexpr result_bits shift_exponent(source_bits data) noexcept
+    {
+        return update_bias(
+            static_cast<result_bits>(data & source_traits::exponent_mask)
+            << exponent_offset);
+    }
+
+    static constexpr result_bits shift_sign(source_bits data) noexcept
+    {
+        return static_cast<result_bits>(data & source_traits::sign_mask)
+               << sign_offset;
+    }
+
+private:
+    static constexpr result_bits update_bias(result_bits data) noexcept
+    {
+        return data == typename result_traits::bits_type{} ? data
+                                                           : data + bias_change;
+    }
+};
+
+// downcasting implementation details
+template <typename SourceType, typename ResultType>
+struct precision_converter<SourceType, ResultType, false> {
+    using source_traits = float_traits<SourceType>;
+    using result_traits = float_traits<ResultType>;
+    using source_bits = typename source_traits::bits_type;
+    using result_bits = typename result_traits::bits_type;
+
+    static_assert(source_traits::exponent_bits >=
+                          result_traits::exponent_bits &&
+                      source_traits::significand_bits >=
+                          result_traits::significand_bits,
+                  "SourceType has to have both lower range and precision or "
+                  "higher range and precision than ResultType");
+
+    static constexpr int significand_offset =
+        source_traits::significand_bits - result_traits::significand_bits;
+    static constexpr int exponent_offset = significand_offset;
+    static constexpr int sign_offset = source_traits::exponent_bits -
+                                       result_traits::exponent_bits +
+                                       exponent_offset;
+    static constexpr source_bits bias_change =
+        (source_traits::bias_mask >> exponent_offset) -
+        static_cast<source_bits>(result_traits::bias_mask);
+
+    static constexpr result_bits shift_significand(source_bits data) noexcept
+    {
+        return static_cast<result_bits>(
+            (data & source_traits::significand_mask) >> significand_offset);
+    }
+
+    static constexpr result_bits shift_exponent(source_bits data) noexcept
+    {
+        return static_cast<result_bits>(update_bias(
+            (data & source_traits::exponent_mask) >> exponent_offset));
+    }
+
+    static constexpr result_bits shift_sign(source_bits data) noexcept
+    {
+        return static_cast<result_bits>((data & source_traits::sign_mask) >>
+                                        sign_offset);
+    }
+
+private:
+    static constexpr source_bits update_bias(source_bits data) noexcept
+    {
+        return data <= bias_change ? typename source_traits::bits_type{}
+                                   : limit_exponent(data - bias_change);
+    }
+
+    static constexpr source_bits limit_exponent(source_bits data) noexcept
+    {
+        return data >= static_cast<source_bits>(result_traits::exponent_mask)
+                   ? static_cast<source_bits>(result_traits::exponent_mask)
+                   : data;
+    }
+};
+
+
+}  // namespace detail
+
+
+/**
+ * A class providing basic support for half precision floating point types.
+ *
+ * It provides reduced storage compared to single precision, conversions from
+ * and to single precision, and arithmetic operators evaluated in float.
+ */
+class half {
+public:
+    // create half value from the bits directly.
+    static constexpr half create_from_bits(uint16 bits) noexcept
+    {
+        half result;
+        result.data_ = bits;
+        return result;
+    }
+
+    // TODO: NVHPC (host side) may not use zero initialization for the data
+    // member by default constructor in some cases. Not sure whether it is
+    // caused by something else in jacobi or isai.
+    constexpr half() noexcept : data_(0){};
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    half(const T val) : data_(0)
+    {
+        this->float2half(static_cast<float>(val));
+    }
+
+    template <typename V>
+    half& operator=(const V val)
+    {
+        this->float2half(static_cast<float>(val));
+        return *this;
+    }
+
+    operator float() const noexcept
+    {
+        const auto bits = half2float(data_);
+        float ans(0);
+        std::memcpy(&ans, &bits, sizeof(float));
+        return ans;
+    }
+
+    // We can not use `half operator _op(const half)` for half + half: the
+    // operation would cast the operands to float and then do the float
+    // operation, such that the result becomes float in the end.
+#define HALF_OPERATOR(_op, _opeq)                                  \
+    friend half operator _op(const half lhf, const half rhf)       \
+    {                                                              \
+        return static_cast<half>(static_cast<float>(lhf)           \
+                                     _op static_cast<float>(rhf)); \
+    }                                                              \
+    half& operator _opeq(const half& hf)                           \
+    {                                                              \
+        auto result = *this _op hf;                                \
+        data_ = result.data_;                                      \
+        return *this;                                              \
+    }
+    HALF_OPERATOR(+, +=)
+    HALF_OPERATOR(-, -=)
+    HALF_OPERATOR(*, *=)
+    HALF_OPERATOR(/, /=)
+
+    // Operations between half and a different scalar type:
+    // if the other type is a floating point type, the result has that type;
+    // otherwise (e.g. an integer), the result is half.
+#define HALF_FRIEND_OPERATOR(_op, _opeq)                                   \
+    template <typename T>                                                  \
+    friend std::enable_if_t<                                               \
+        !std::is_same<T, half>::value && std::is_scalar<T>::value,         \
+        std::conditional_t<std::is_floating_point<T>::value, T, half>>     \
+    operator _op(const half hf, const T val)                               \
+    {                                                                      \
+        using type =                                                       \
+            std::conditional_t<std::is_floating_point<T>::value, T, half>; \
+        auto result = static_cast<type>(hf);                               \
+        result _opeq static_cast<type>(val);                               \
+        return result;                                                     \
+    }                                                                      \
+    template <typename T>                                                  \
+    friend std::enable_if_t<                                               \
+        !std::is_same<T, half>::value && std::is_scalar<T>::value,         \
+        std::conditional_t<std::is_floating_point<T>::value, T, half>>     \
+    operator _op(const T val, const half hf)                               \
+    {                                                                      \
+        using type =                                                       \
+            std::conditional_t<std::is_floating_point<T>::value, T, half>; \
+        auto result = static_cast<type>(val);                              \
+        result _opeq static_cast<type>(hf);                                \
+        return result;                                                     \
+    }
+
+    HALF_FRIEND_OPERATOR(+, +=)
+    HALF_FRIEND_OPERATOR(-, -=)
+    HALF_FRIEND_OPERATOR(*, *=)
+    HALF_FRIEND_OPERATOR(/, /=)
+
+    // the negative: flip the sign bit, exact for +-0, subnormals, inf and NaN
+    half operator-() const
+    {
+        return create_from_bits(
+            static_cast<uint16>(data_ ^ f16_traits::sign_mask));
+    }
+
+private:
+    using f16_traits = detail::float_traits<float16>;
+    using f32_traits = detail::float_traits<float32>;
+
+    void float2half(float val) noexcept
+    {
+        uint32 bit_val(0);
+        std::memcpy(&bit_val, &val, sizeof(float));  // grab the raw bits of val
+        data_ = float2half(bit_val);  // store the converted 16-bit pattern
+    }
+
+    static constexpr uint16 float2half(uint32 data_) noexcept
+    {
+        using conv = detail::precision_converter<float32, float16>;
+        const auto sign = conv::shift_sign(data_);
+        if (f32_traits::is_inf(data_)) {
+            return sign | f16_traits::exponent_mask;
+        }
+        if (f32_traits::is_nan(data_)) {
+            return sign | f16_traits::exponent_mask |
+                   f16_traits::significand_mask;
+        }
+        const auto exp = conv::shift_exponent(data_);
+        if (f16_traits::is_inf(exp)) {
+            // too large for float16: the result is a signed infinity
+            return sign | exp;
+        }
+        if (f16_traits::is_denom(exp)) {
+            // TODO: handle denormals, they are flushed to signed zero for now
+            return sign;
+        }
+        // Round to nearest, ties to even; tail holds the bits that get dropped
+        const auto rounded_down = sign | exp | conv::shift_significand(data_);
+        const auto tail = data_ & static_cast<f32_traits::bits_type>(
+                                      (1 << conv::significand_offset) - 1);
+        constexpr auto midpoint = static_cast<f32_traits::bits_type>(
+            1 << (conv::significand_offset - 1));
+        const bool round_up =
+            tail > midpoint || (tail == midpoint && (rounded_down & 1));
+        return rounded_down + round_up;
+    }
+
+    static constexpr uint32 half2float(uint16 data_) noexcept
+    {
+        using conv = detail::precision_converter<float16, float32>;
+        if (f16_traits::is_inf(data_)) {
+            return conv::shift_sign(data_) | f32_traits::exponent_mask;
+        }
+        if (f16_traits::is_nan(data_)) {
+            return conv::shift_sign(data_) | f32_traits::exponent_mask |
+                   f32_traits::significand_mask;
+        }
+        if (f16_traits::is_denom(data_)) {
+            return conv::shift_sign(data_);  // TODO: handle denormals
+        }
+        return conv::shift_sign(data_) | conv::shift_exponent(data_) |
+               conv::shift_significand(data_);
+    }
+
+    uint16 data_;
+};
+
+
+}  // namespace gko
+
+
+namespace std {
+
+
+template <>
+class complex<gko::half> {
+public:
+    using value_type = gko::half;
+
+    complex(const value_type& real = value_type(0.f),
+            const value_type& imag = value_type(0.f))
+        : real_(real), imag_(imag)
+    {}
+
+    template <typename T, typename U,
+              typename = std::enable_if_t<std::is_scalar<T>::value &&
+                                          std::is_scalar<U>::value>>
+    explicit complex(const T& real, const U& imag)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(imag))
+    {}
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    complex(const T& real)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(0.f))
+    {}
+
+    // Note: when using complex(real, imag) here, MSVC with CUDA tries to treat
+    // `complex` as a member rather than a constructor.
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    explicit complex(const complex<T>& other)
+        : real_(static_cast<value_type>(other.real())),
+          imag_(static_cast<value_type>(other.imag()))
+    {}
+
+    value_type real() const noexcept { return real_; }
+
+    value_type imag() const noexcept { return imag_; }
+
+    operator std::complex<float>() const noexcept
+    {
+        return std::complex<float>(static_cast<float>(real_),
+                                   static_cast<float>(imag_));
+    }
+
+    template <typename V>
+    complex& operator=(const V& val)
+    {
+        real_ = val;
+        imag_ = value_type();
+        return *this;
+    }
+
+    template <typename V>
+    complex& operator=(const std::complex<V>& val)
+    {
+        real_ = val.real();
+        imag_ = val.imag();
+        return *this;
+    }
+
+    complex& operator+=(const value_type& real)
+    {
+        real_ += real;
+        return *this;
+    }
+
+    complex& operator-=(const value_type& real)
+    {
+        real_ -= real;
+        return *this;
+    }
+
+    complex& operator*=(const value_type& real)
+    {
+        real_ *= real;
+        imag_ *= real;
+        return *this;
+    }
+
+    complex& operator/=(const value_type& real)
+    {
+        real_ /= real;
+        imag_ /= real;
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator+=(const complex<T>& val)
+    {
+        real_ += val.real();
+        imag_ += val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator-=(const complex<T>& val)
+    {
+        real_ -= val.real();
+        imag_ -= val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator*=(const complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f *= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator/=(const complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f /= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+// These friend operators are needed on macOS. TODO: check whether the mac
+// compiler always picks the complex version even for real half operands.
+#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                \
+    friend complex<gko::half> operator _op(const complex<gko::half> lhf, \
+                                           const complex<gko::half> rhf) \
+    {                                                                    \
+        auto a = lhf;                                                    \
+        a _opeq rhf;                                                     \
+        return a;                                                        \
+    }
+
+    COMPLEX_HALF_OPERATOR(+, +=)
+    COMPLEX_HALF_OPERATOR(-, -=)
+    COMPLEX_HALF_OPERATOR(*, *=)
+    COMPLEX_HALF_OPERATOR(/, /=)
+#undef COMPLEX_HALF_OPERATOR  // keep the helper macro out of user code
+
+private:
+    value_type real_;
+    value_type imag_;
+};
+
+
+template <>
+struct numeric_limits<gko::half> {
+    static constexpr bool is_specialized{true};
+    static constexpr bool is_signed{true};
+    static constexpr bool is_integer{false};
+    static constexpr bool is_exact{false};
+    static constexpr bool is_bounded{true};
+    static constexpr bool is_modulo{false};
+    static constexpr int digits{
+        gko::detail::float_traits<gko::half>::significand_bits + 1};
+    // 3/10 is approx. log_10(2)
+    static constexpr int digits10{digits * 3 / 10};
+
+    static constexpr gko::half epsilon()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b0'00101'0000000000u);
+        return gko::half::create_from_bits(bits);
+    }
+
+    static constexpr gko::half infinity()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b0'11111'0000000000u);
+        return gko::half::create_from_bits(bits);
+    }
+
+    static constexpr gko::half min()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b0'00001'0000000000u);
+        return gko::half::create_from_bits(bits);
+    }
+
+    static constexpr gko::half max()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b0'11110'1111111111u);
+        return gko::half::create_from_bits(bits);
+    }
+
+    static constexpr gko::half lowest()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b1'11110'1111111111u);
+        return gko::half::create_from_bits(bits);  // max() with the sign bit set
+    }
+
+    static constexpr gko::half quiet_NaN()
+    {
+        constexpr auto bits = static_cast<std::uint16_t>(0b0'11111'1111111111u);
+        return gko::half::create_from_bits(bits);
+    }
+};
+
+
+// std::complex declares operator= as a template over any complex<T>, so we
+// can provide full specializations of it for complex<gko::half>.
+template <>
+inline complex<double>& complex<double>::operator=(
+    const std::complex<gko::half>& a)
+{
+    // widen both components first, then reuse the same-type assignment
+    const complex<double> widened(a.real(), a.imag());
+    return *this = widened;
+}
+
+
+// For MSVC
+template <>
+inline complex<float>& complex<float>::operator=(
+    const std::complex<gko::half>& a)
+{
+    // widen both components first, then reuse the same-type assignment
+    const complex<float> widened(a.real(), a.imag());
+    return *this = widened;
+}
+
+
+}  // namespace std
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_HALF_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index e9cda520a19..2234232905a 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -25,6 +25,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/fwd_decls.hpp>
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/intrinsics.hpp>
 #include <ginkgo/core/base/lin_op.hpp>

From 7a070527e357d0b1602134bc6c235f48ea98d5d4 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 00:31:30 +0200
Subject: [PATCH 342/448] half does not have constexpr constructor

---
 core/test/accessor/reduced_row_major_ginkgo.cpp | 17 ++++++++++-------
 core/test/utils.hpp                             | 11 ++++++-----
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp
index 7acad0b9638..41aaed54457 100644
--- a/core/test/accessor/reduced_row_major_ginkgo.cpp
+++ b/core/test/accessor/reduced_row_major_ginkgo.cpp
@@ -10,11 +10,12 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/half.hpp>  // necessary for gko::half
+
 #include "accessor/index_span.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/utils.hpp"
-#include "core/base/extended_float.hpp"  // necessary for gko::half
 #include "core/test/utils.hpp"
 
 
@@ -33,12 +34,7 @@ class ReducedStorage3d : public ::testing::Test {
     using st_type =
         typename std::tuple_element<1, decltype(ArithmeticStorageType{})>::type;
     using rcar_type = gko::acc::remove_complex_t<ar_type>;
-    static constexpr rcar_type delta{
-        std::is_same<ar_type, st_type>::value
-            ? 0
-            : std::numeric_limits<
-                  gko::acc::remove_complex_t<st_type>>::epsilon() *
-                  1e1};
+    static const rcar_type delta;
 
     // Type for `check_accessor_correctness` to forward the indices
     using t = std::tuple<int, int, int>;
@@ -119,6 +115,13 @@ class ReducedStorage3d : public ::testing::Test {
     }
 };
 
+template <typename T>
+const typename ReducedStorage3d<T>::rcar_type ReducedStorage3d<T>::delta =
+    std::is_same<ar_type, st_type>::value
+        ? 0
+        : std::numeric_limits<gko::acc::remove_complex_t<st_type>>::epsilon() *
+              1e1;
+
 using ReducedStorage3dTypes =
     ::testing::Types<std::tuple<double, double>, std::tuple<double, float>,
                      std::tuple<float, float>, std::tuple<double, gko::half>,
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index cacc7191bbf..eee2900d731 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -365,15 +365,16 @@ template <typename Precision, typename OutputType>
 struct reduction_factor {
     using nc_output = remove_complex<OutputType>;
     using nc_precision = remove_complex<Precision>;
-    static constexpr nc_output value{
-        std::numeric_limits<nc_precision>::epsilon() * nc_output{10} *
-        (gko::is_complex<Precision>() ? nc_output{1.4142} : one<nc_output>())};
+
+    static const nc_output value;
 };
 
 
 template <typename Precision, typename OutputType>
-constexpr remove_complex<OutputType>
-    reduction_factor<Precision, OutputType>::value;
+const remove_complex<OutputType>
+    reduction_factor<Precision, OutputType>::value =
+        std::numeric_limits<nc_precision>::epsilon() * nc_output{10} *
+        (gko::is_complex<Precision>() ? nc_output{1.4142} : one<nc_output>());
 
 
 }  // namespace test

From 82f79c13db8ebf7922ca392e55d6aab83b04b54e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 22 Oct 2024 15:14:06 +0200
Subject: [PATCH 343/448] fix the undefined behavior and the issue from
 big-endian, and extract half to another test

---
 core/test/base/CMakeLists.txt     |   1 +
 core/test/base/extended_float.cpp | 232 ++----------------------
 core/test/base/half.cpp           | 285 ++++++++++++++++++++++++++++++
 3 files changed, 304 insertions(+), 214 deletions(-)
 create mode 100644 core/test/base/half.cpp

diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index d7deeec6fb7..50306c61455 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -14,6 +14,7 @@ ginkgo_create_test(exception EXECUTABLE_NAME exception_test) # exception collide
 ginkgo_create_test(exception_helpers)
 ginkgo_create_test(extended_float)
 ginkgo_create_test(executor)
+ginkgo_create_test(half)
 ginkgo_create_test(index_range)
 ginkgo_create_test(iterator_factory)
 ginkgo_create_test(lin_op)
diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp
index 818843baa38..bdb7a58ed84 100644
--- a/core/test/base/extended_float.cpp
+++ b/core/test/base/extended_float.cpp
@@ -49,15 +49,28 @@ class ExtendedFloatTestBase : public ::testing::Test {
     static floating<N - 1> create_from_bits(const char (&s)[N])
     {
         auto bits = std::bitset<N - 1>(s).to_ullong();
-        return reinterpret_cast<floating<N - 1>&>(bits);
+        // Narrow to the integer type of matching size first. Copying (memcpy
+        // or reinterpret_cast) the leading bytes of the unsigned long long
+        // directly would pick up the wrong bytes on big-endian machines, where
+        // the low-order bytes are not stored first.
+        using bits_type =
+            typename gko::detail::float_traits<floating<N - 1>>::bits_type;
+        auto bits_val = static_cast<bits_type>(bits);
+        floating<N - 1> result;
+        static_assert(sizeof(floating<N - 1>) == sizeof(bits_type),
+                      "the type should have the same size as its bits_type");
+        std::memcpy(&result, &bits_val, sizeof(bits_type));
+        return result;
     }
 
     template <typename T>
     static std::bitset<sizeof(T) * byte_size> get_bits(T val)
     {
-        auto bits =
-            reinterpret_cast<typename gko::detail::float_traits<T>::bits_type&>(
-                val);
+        using bits_type = typename gko::detail::float_traits<T>::bits_type;
+        bits_type bits;
+        static_assert(sizeof(T) == sizeof(bits_type),
+                      "the type should have the same size as its bits_type");
+        std::memcpy(&bits, &val, sizeof(T));
         return std::bitset<sizeof(T) * byte_size>(bits);
     }
 
@@ -69,218 +82,9 @@ class ExtendedFloatTestBase : public ::testing::Test {
 };
 
 
-class FloatToHalf : public ExtendedFloatTestBase {};
-
-
-// clang-format does terrible formatting of string literal concatenation
-// clang-format off
-
-
-TEST_F(FloatToHalf, ConvertsOne)
-{
-    half x = create_from_bits("0" "01111111" "00000000000000000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "01111" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, ConvertsZero)
-{
-    half x = create_from_bits("0" "00000000" "00000000000000000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "00000" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, ConvertsInf)
-{
-    half x = create_from_bits("0" "11111111" "00000000000000000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, ConvertsNegInf)
-{
-    half x = create_from_bits("1" "11111111" "00000000000000000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, ConvertsNan)
-{
-    half x = create_from_bits("0" "11111111" "00000000000000000000001");
-
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // Sycl put the 1000000000, but ours put mask
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000"));
-    #else
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111"));
-    #endif
-}
-
-
-TEST_F(FloatToHalf, ConvertsNegNan)
-{
-    half x = create_from_bits("1" "11111111" "00010000000000000000000");
-
-    #if defined(SYCL_LANGUAGE_VERSION)
-    // Sycl put the 1000000000, but ours put mask
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000"));
-    #else
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111"));
-    #endif
-}
-
-
-TEST_F(FloatToHalf, FlushesToZero)
-{
-    half x = create_from_bits("0" "00000111" "00010001000100000001000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "00000" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, FlushesToNegZero)
-{
-    half x = create_from_bits("1" "00000010" "00010001000100000001000");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "00000" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, FlushesToInf)
-{
-    half x = create_from_bits("0" "10100000" "10010000000000010000100");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, FlushesToNegInf)
-{
-    half x = create_from_bits("1" "11000000" "10010000000000010000100");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "0000000000"));
-}
-
-
-TEST_F(FloatToHalf, TruncatesSmallNumber)
-{
-    half x = create_from_bits("0" "01110001" "10010000000000010000100");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "00001" "1001000000"));
-}
-
-
-TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven)
-{
-    half neg_x = create_from_bits("1" "10001110" "10010011111000010000100");
-    half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100");
-    half x = create_from_bits("0" "10001110" "10010011111000010000100");
-    half x2 = create_from_bits("0" "10001110" "10010011101000010000100");
-    half x3 = create_from_bits("0" "10001110" "10010011101000000000000");
-    half x4 = create_from_bits("0" "10001110" "10010011111000000000000");
-
-    EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000"));
-    EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111"));
-    EXPECT_EQ(get_bits(x3), get_bits("0" "11110" "1001001110"));
-    EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000"));
-    EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000"));
-    EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111"));
-}
-
-
-// clang-format on
-
-
-class HalfToFloat : public ExtendedFloatTestBase {};
-
-
-// clang-format off
-
-
-TEST_F(HalfToFloat, ConvertsOne)
-{
-    float x = create_from_bits("0" "01111" "0000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "01111111" "00000000000000000000000"));
-}
-
-
-TEST_F(HalfToFloat, ConvertsZero)
-{
-    float x = create_from_bits("0" "00000" "0000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "00000000000000000000000"));
-}
-
-
-TEST_F(HalfToFloat, ConvertsInf)
-{
-    float x = create_from_bits("0" "11111" "0000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00000000000000000000000"));
-}
-
-
-TEST_F(HalfToFloat, ConvertsNegInf)
-{
-    float x = create_from_bits("1" "11111" "0000000000");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000000000000000000"));
-}
-
-
-TEST_F(HalfToFloat, ConvertsNan)
-{
-    float x = create_from_bits("0" "11111" "0001001000");
-
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // sycl keeps significand
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000"));
-    #else
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111"));
-    #endif
-}
-
-
-TEST_F(HalfToFloat, ConvertsNegNan)
-{
-    float x = create_from_bits("1" "11111" "0000000001");
-
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // sycl keeps significand
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000"));
-    #else
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111"));
-    #endif
-}
-
-
-TEST_F(HalfToFloat, ExtendsSmallNumber)
-{
-    float x = create_from_bits("0" "00001" "1000010001");
-
-    ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "10000100010000000000000"));
-}
-
-
-TEST_F(HalfToFloat, ExtendsLargeNumber)
-{
-    float x = create_from_bits("1" "11110" "1001001111");
-
-    ASSERT_EQ(get_bits(x), get_bits("1" "10001110" "10010011110000000000000"));
-}
-
-
-// clang-format on
-
-
 class TruncatedDouble : public ExtendedFloatTestBase {};
 
-
+// clang-format does terrible formatting of string literal concatenation
 // clang-format off
 
 
diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
new file mode 100644
index 00000000000..f20bac0d47a
--- /dev/null
+++ b/core/test/base/half.cpp
@@ -0,0 +1,285 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <bitset>
+#include <cstring>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/half.hpp>
+
+
+template <std::size_t N>
+struct floating_impl;
+
+template <>
+struct floating_impl<16> {
+    using type = gko::half;
+};
+
+template <>
+struct floating_impl<32> {
+    using type = float;
+};
+
+template <>
+struct floating_impl<64> {
+    using type = double;
+};
+
+template <std::size_t N>
+using floating = typename floating_impl<N>::type;
+
+
+class ExtendedFloatTestBase : public ::testing::Test {
+protected:
+    using half = gko::half;
+
+    static constexpr auto byte_size = gko::byte_size;
+
+    // Builds the floating<N - 1> whose bit pattern is the binary string s.
+    template <std::size_t N>
+    static floating<N - 1> create_from_bits(const char (&s)[N])
+    {
+        auto bits = std::bitset<N - 1>(s).to_ullong();
+        // Narrow to the same-sized integer type first: copying the leading
+        // bytes of the unsigned long long directly (memcpy/reinterpret_cast)
+        // would pick up the wrong bytes on big-endian machines.
+        using bits_type =
+            typename gko::detail::float_traits<floating<N - 1>>::bits_type;
+        auto bits_val = static_cast<bits_type>(bits);
+        floating<N - 1> result;
+        static_assert(sizeof(floating<N - 1>) == sizeof(bits_type),
+                      "the type should have the same size as its bits_type");
+        std::memcpy(&result, &bits_val, sizeof(bits_type));
+        return result;
+    }
+
+    template <typename T>
+    static std::bitset<sizeof(T) * byte_size> get_bits(T val)
+    {
+        using bits_type = typename gko::detail::float_traits<T>::bits_type;
+        bits_type bits;
+        static_assert(sizeof(T) == sizeof(bits_type),
+                      "the type should have the same size as its bits_type");
+        std::memcpy(&bits, &val, sizeof(T));
+        return std::bitset<sizeof(T) * byte_size>(bits);
+    }
+
+    template <std::size_t N>
+    static std::bitset<N - 1> get_bits(const char (&s)[N])
+    {
+        return std::bitset<N - 1>(s);
+    }
+};
+
+
+class FloatToHalf : public ExtendedFloatTestBase {};
+
+
+// clang-format does terrible formatting of string literal concatenation
+// clang-format off
+
+
+TEST_F(FloatToHalf, ConvertsOne)
+{
+    half x = create_from_bits("0" "01111111" "00000000000000000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "01111" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, ConvertsZero)
+{
+    half x = create_from_bits("0" "00000000" "00000000000000000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "00000" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, ConvertsInf)
+{
+    half x = create_from_bits("0" "11111111" "00000000000000000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, ConvertsNegInf)
+{
+    half x = create_from_bits("1" "11111111" "00000000000000000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, ConvertsNan)
+{
+    half x = create_from_bits("0" "11111111" "00000000000000000000001");
+
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // SYCL sets only the leading significand bit, ours sets the whole mask
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000"));
+    #else
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111"));
+    #endif
+}
+
+
+TEST_F(FloatToHalf, ConvertsNegNan)
+{
+    half x = create_from_bits("1" "11111111" "00010000000000000000000");
+
+    #if defined(SYCL_LANGUAGE_VERSION)
+    // SYCL sets only the leading significand bit, ours sets the whole mask
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000"));
+    #else
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111"));
+    #endif
+}
+
+
+TEST_F(FloatToHalf, FlushesToZero)
+{
+    half x = create_from_bits("0" "00000111" "00010001000100000001000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "00000" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, FlushesToNegZero)
+{
+    half x = create_from_bits("1" "00000010" "00010001000100000001000");
+
+    ASSERT_EQ(get_bits(x), get_bits("1" "00000" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, FlushesToInf)
+{
+    half x = create_from_bits("0" "10100000" "10010000000000010000100");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, FlushesToNegInf)
+{
+    half x = create_from_bits("1" "11000000" "10010000000000010000100");
+
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "0000000000"));
+}
+
+
+TEST_F(FloatToHalf, TruncatesSmallNumber)
+{
+    half x = create_from_bits("0" "01110001" "10010000000000010000100");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "00001" "1001000000"));
+}
+
+
+TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven)
+{
+    half neg_x = create_from_bits("1" "10001110" "10010011111000010000100");
+    half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100");
+    half x = create_from_bits("0" "10001110" "10010011111000010000100");
+    half x2 = create_from_bits("0" "10001110" "10010011101000010000100");
+    half x3 = create_from_bits("0" "10001110" "10010011101000000000000");
+    half x4 = create_from_bits("0" "10001110" "10010011111000000000000");
+
+    EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111"));
+    EXPECT_EQ(get_bits(x3), get_bits("0" "11110" "1001001110"));
+    EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000"));
+    EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111"));
+}
+
+
+// clang-format on
+
+
+class HalfToFloat : public ExtendedFloatTestBase {};
+
+
+// clang-format off
+
+
+TEST_F(HalfToFloat, ConvertsOne)
+{
+    float x = create_from_bits("0" "01111" "0000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "01111111" "00000000000000000000000"));
+}
+
+
+TEST_F(HalfToFloat, ConvertsZero)
+{
+    float x = create_from_bits("0" "00000" "0000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "00000000000000000000000"));
+}
+
+
+TEST_F(HalfToFloat, ConvertsInf)
+{
+    float x = create_from_bits("0" "11111" "0000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00000000000000000000000"));
+}
+
+
+TEST_F(HalfToFloat, ConvertsNegInf)
+{
+    float x = create_from_bits("1" "11111" "0000000000");
+
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000000000000000000"));
+}
+
+
+TEST_F(HalfToFloat, ConvertsNan)
+{
+    float x = create_from_bits("0" "11111" "0001001000");
+
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // sycl keeps significand
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000"));
+    #else
+    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111"));
+    #endif
+}
+
+
+TEST_F(HalfToFloat, ConvertsNegNan)
+{
+    float x = create_from_bits("1" "11111" "0000000001");
+
+    #if defined(SYCL_LANGUAGE_VERSION) 
+    // sycl keeps significand
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000"));
+    #else
+    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111"));
+    #endif
+}
+
+
+TEST_F(HalfToFloat, ExtendsSmallNumber)
+{
+    float x = create_from_bits("0" "00001" "1000010001");
+
+    ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "10000100010000000000000"));
+}
+
+
+TEST_F(HalfToFloat, ExtendsLargeNumber)
+{
+    float x = create_from_bits("1" "11110" "1001001111");
+
+    ASSERT_EQ(get_bits(x), get_bits("1" "10001110" "10010011110000000000000"));
+}
+
+
+// clang-format on

From d8352271d71a32f3b4c019b1432253c75ded96ac Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 22 Oct 2024 17:34:34 +0200
Subject: [PATCH 344/448] jacobi use __half in device not gko::half now

---
 .../jacobi_advanced_apply_kernels.instantiate.cpp      |  2 +-
 .../jacobi_generate_kernels.instantiate.cpp            | 10 +++++-----
 common/cuda_hip/preconditioner/jacobi_kernels.cpp      |  4 ++--
 .../jacobi_simple_apply_kernels.instantiate.cpp        |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
index 0ecc3d0d44b..131c530d2ee 100644
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
@@ -90,7 +90,7 @@ __launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply(
         ValueType, block_precisions[block_id],
         multiply_vec<max_block_size>(
             subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision*>(
+            reinterpret_cast<const device_type<resolved_precision>*>(
                 blocks + storage_scheme.get_group_offset(block_id)) +
                 storage_scheme.get_block_offset(block_id) +
                 subwarp.thread_rank(),
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
index d004309c622..fdb0ad11e9e 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
@@ -68,7 +68,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility(
         }
     }
 
-    return succeeded && block_cond >= 1.0 &&
+    return succeeded && block_cond >= remove_complex<ValueType>{1.0} &&
            block_cond * static_cast<remove_complex<ValueType>>(
                             float_traits<remove_complex<ValueType>>::eps) <
                remove_complex<ValueType>{1e-3};
@@ -160,7 +160,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
                 accuracy, block_cond,
                 [&subwarp, &block_size, &row, &block_data, &storage_scheme,
                  &block_id] {
-                    using target = reduce_precision<ValueType>;
+                    using target = device_type<reduce_precision<ValueType>>;
                     return validate_precision_reduction_feasibility<
                         max_block_size, target>(
                         subwarp, block_size, row,
@@ -170,8 +170,8 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
                 },
                 [&subwarp, &block_size, &row, &block_data, &storage_scheme,
                  &block_id] {
-                    using target =
-                        reduce_precision<reduce_precision<ValueType>>;
+                    using target = device_type<
+                        reduce_precision<reduce_precision<ValueType>>>;
                     return validate_precision_reduction_feasibility<
                         max_block_size, target>(
                         subwarp, block_size, row,
@@ -195,7 +195,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
             ValueType, prec,
             copy_matrix<max_block_size, and_transpose>(
                 subwarp, block_size, row, 1, perm, trans_perm,
-                reinterpret_cast<resolved_precision*>(
+                reinterpret_cast<device_type<resolved_precision>*>(
                     block_data + storage_scheme.get_group_offset(block_id)) +
                     storage_scheme.get_block_offset(block_id),
                 storage_scheme.get_stride()));
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index f3b099e7c18..6f2d4ae3974 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -206,11 +206,11 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi(
     GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
         ValueType, block_precisions[block_id],
         auto local_block =
-            reinterpret_cast<const resolved_precision*>(
+            reinterpret_cast<const device_type<resolved_precision>*>(
                 blocks + storage_scheme.get_group_offset(block_id)) +
             storage_scheme.get_block_offset(block_id);
         auto local_out_block =
-            reinterpret_cast<resolved_precision*>(
+            reinterpret_cast<device_type<resolved_precision>*>(
                 out_blocks + storage_scheme.get_group_offset(block_id)) +
             storage_scheme.get_block_offset(block_id);
         for (int i = rank; i < block_size * block_size; i += subwarp_size) {
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
index 734385970e3..faf869718a6 100644
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
@@ -84,7 +84,7 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size)
         ValueType, block_precisions[block_id],
         multiply_vec<max_block_size>(
             subwarp, block_size, v,
-            reinterpret_cast<const resolved_precision*>(
+            reinterpret_cast<const device_type<resolved_precision>*>(
                 blocks + storage_scheme.get_group_offset(block_id)) +
                 storage_scheme.get_block_offset(block_id) +
                 subwarp.thread_rank(),

From be0c1920335e3688ef4797a79429dfa43236f71a Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 22 Oct 2024 18:02:57 +0200
Subject: [PATCH 345/448] add gko::half device type mappings for CUDA and HIP

---
 accessor/cuda_helper.hpp          | 15 ++++++++++++-
 accessor/hip_helper.hpp           | 14 +++++++++++-
 cuda/base/types.hpp               | 36 +++++++++++++++++++++++++------
 hip/base/types.hip.hpp            | 33 ++++++++++++++++++++++++----
 include/ginkgo/core/base/math.hpp |  9 +++++++-
 5 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp
index 31d3599516d..3efc6eb22b7 100644
--- a/accessor/cuda_helper.hpp
+++ b/accessor/cuda_helper.hpp
@@ -17,7 +17,15 @@
 #include "utils.hpp"
 
 
+struct __half;
+
+
 namespace gko {
+
+
+class half;
+
+
 namespace acc {
 namespace detail {
 
@@ -27,6 +35,11 @@ struct cuda_type {
     using type = T;
 };
 
+template <>
+struct cuda_type<gko::half> {
+    using type = __half;
+};
+
 // Unpack cv and reference / pointer qualifiers
 template <typename T>
 struct cuda_type<const T> {
@@ -57,7 +70,7 @@ struct cuda_type<T&&> {
 // Transform std::complex to thrust::complex
 template <typename T>
 struct cuda_type<std::complex<T>> {
-    using type = thrust::complex<T>;
+    using type = thrust::complex<typename cuda_type<T>::type>;
 };
 
 
diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp
index 6b76b726c10..8827fd6eb11 100644
--- a/accessor/hip_helper.hpp
+++ b/accessor/hip_helper.hpp
@@ -17,7 +17,15 @@
 #include "utils.hpp"
 
 
+struct __half;
+
+
 namespace gko {
+
+
+class half;
+
+
 namespace acc {
 namespace detail {
 
@@ -53,11 +61,15 @@ struct hip_type<T&&> {
     using type = typename hip_type<T>::type&&;
 };
 
+template <>
+struct hip_type<gko::half> {
+    using type = __half;
+};
 
 // Transform std::complex to thrust::complex
 template <typename T>
 struct hip_type<std::complex<T>> {
-    using type = thrust::complex<T>;
+    using type = thrust::complex<typename hip_type<T>::type>;
 };
 
 
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index a4a2b877c28..05f07ceb8dd 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -14,20 +14,17 @@
 #include <cusparse.h>
 #include <thrust/complex.h>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
 namespace gko {
 
-
 namespace kernels {
 namespace cuda {
-
-
 namespace detail {
 
-
 /**
  * @internal
  *
@@ -124,6 +121,17 @@ struct culibs_type_impl<std::complex<double>> {
     using type = cuDoubleComplex;
 };
 
+
+template <>
+struct culibs_type_impl<half> {
+    using type = __half;
+};
+
+template <>
+struct culibs_type_impl<std::complex<half>> {
+    using type = __half2;
+};
+
 template <typename T>
 struct culibs_type_impl<thrust::complex<T>> {
     using type = typename culibs_type_impl<std::complex<T>>::type;
@@ -154,9 +162,14 @@ struct cuda_type_impl<volatile T> {
     using type = volatile typename cuda_type_impl<T>::type;
 };
 
+template <>
+struct cuda_type_impl<half> {
+    using type = __half;
+};
+
 template <typename T>
 struct cuda_type_impl<std::complex<T>> {
-    using type = thrust::complex<T>;
+    using type = thrust::complex<typename cuda_type_impl<T>::type>;
 };
 
 template <>
@@ -169,6 +182,11 @@ struct cuda_type_impl<cuComplex> {
     using type = thrust::complex<float>;
 };
 
+template <>
+struct cuda_type_impl<__half2> {
+    using type = thrust::complex<__half>;
+};
+
 template <typename T>
 struct cuda_struct_member_type_impl {
     using type = T;
@@ -176,7 +194,12 @@ struct cuda_struct_member_type_impl {
 
 template <typename T>
 struct cuda_struct_member_type_impl<std::complex<T>> {
-    using type = fake_complex<T>;
+    using type = fake_complex<typename cuda_struct_member_type_impl<T>::type>;
+};
+
+template <>
+struct cuda_struct_member_type_impl<gko::half> {
+    using type = __half;
 };
 
 template <typename ValueType, typename IndexType>
@@ -200,6 +223,7 @@ GKO_CUDA_DATA_TYPE(float, CUDA_R_32F);
 GKO_CUDA_DATA_TYPE(double, CUDA_R_64F);
 GKO_CUDA_DATA_TYPE(std::complex<float>, CUDA_C_32F);
 GKO_CUDA_DATA_TYPE(std::complex<double>, CUDA_C_64F);
+GKO_CUDA_DATA_TYPE(std::complex<float16>, CUDA_C_16F);
 GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I);
 GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I);
 
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index bb0d4a2d0c9..c3982b7562e 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -21,14 +21,13 @@
 #endif
 #include <thrust/complex.h>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
 
 #include "common/cuda_hip/base/runtime.hpp"
 
 
 namespace gko {
-
-
 namespace kernels {
 namespace hip {
 namespace detail {
@@ -130,6 +129,17 @@ struct hiplibs_type_impl<std::complex<double>> {
     using type = hipDoubleComplex;
 };
 
+template <>
+struct hiplibs_type_impl<half> {
+    using type = __half;
+};
+
+template <>
+struct hiplibs_type_impl<std::complex<half>> {
+    using type = __half2;
+};
+
+
 template <typename T>
 struct hiplibs_type_impl<thrust::complex<T>> {
     using type = typename hiplibs_type_impl<std::complex<T>>::type;
@@ -202,9 +212,14 @@ struct hip_type_impl<volatile T> {
     using type = volatile typename hip_type_impl<T>::type;
 };
 
+template <>
+struct hip_type_impl<gko::half> {
+    using type = __half;
+};
+
 template <typename T>
 struct hip_type_impl<std::complex<T>> {
-    using type = thrust::complex<T>;
+    using type = thrust::complex<typename hip_type_impl<T>::type>;
 };
 
 template <>
@@ -217,6 +232,11 @@ struct hip_type_impl<hipComplex> {
     using type = thrust::complex<float>;
 };
 
+template <>
+struct hip_type_impl<__half2> {
+    using type = thrust::complex<__half>;
+};
+
 template <typename T>
 struct hip_struct_member_type_impl {
     using type = T;
@@ -224,7 +244,12 @@ struct hip_struct_member_type_impl {
 
 template <typename T>
 struct hip_struct_member_type_impl<std::complex<T>> {
-    using type = fake_complex<T>;
+    using type = fake_complex<typename hip_struct_member_type_impl<T>::type>;
+};
+
+template <>
+struct hip_struct_member_type_impl<gko::half> {
+    using type = __half;
 };
 
 template <typename ValueType, typename IndexType>
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index cd5e489b95d..5e15bb05d6a 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -21,6 +21,9 @@
 namespace gko {
 
 
+class half;
+
+
 // HIP should not see std::abs or std::sqrt, we want the custom implementation.
 // Hence, provide the using declaration only for some cases
 namespace kernels {
@@ -151,8 +154,12 @@ struct is_complex_impl<std::complex<T>>
 template <typename T>
 struct is_complex_or_scalar_impl : std::is_scalar<T> {};
 
+template <>
+struct is_complex_or_scalar_impl<half> : std::true_type {};
+
 template <typename T>
-struct is_complex_or_scalar_impl<std::complex<T>> : std::is_scalar<T> {};
+struct is_complex_or_scalar_impl<std::complex<T>>
+    : is_complex_or_scalar_impl<T> {};
 
 
 /**

From 5033387f5f0bb3f6fdba501a68f74d24c2a1d6a8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 13:50:49 +0200
Subject: [PATCH 346/448] fix error: non-constant-expression cannot be narrowed

---
 reference/test/base/batch_multi_vector_kernels.cpp |  4 ++--
 reference/test/matrix/coo_kernels.cpp              | 14 ++++++++------
 reference/test/matrix/csr_kernels.cpp              | 14 ++++++++------
 reference/test/matrix/dense_kernels.cpp            |  4 ++--
 reference/test/matrix/diagonal_kernels.cpp         | 14 ++++++++------
 reference/test/matrix/ell_kernels.cpp              | 14 ++++++++------
 reference/test/matrix/fbcsr_kernels.cpp            | 14 ++++++++------
 reference/test/matrix/hybrid_kernels.cpp           | 14 ++++++++------
 reference/test/matrix/sellp_kernels.cpp            | 14 ++++++++------
 test/mpi/matrix.cpp                                |  4 ++--
 10 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
index e673046a490..694ae491ef4 100644
--- a/reference/test/base/batch_multi_vector_kernels.cpp
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -349,7 +349,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->mtx_1->convert_to(tmp.get());
     tmp->convert_to(res.get());
@@ -373,7 +373,7 @@ TYPED_TEST(MultiVector, MovesToPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->mtx_1->move_to(tmp.get());
     tmp->move_to(res.get());
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index 42b68d1cb4c..fcca61a33d4 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -85,9 +85,10 @@ TYPED_TEST(Coo, ConvertsToPrecision)
     auto tmp = OtherCoo::create(this->exec);
     auto res = Coo::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx->convert_to(tmp);
     tmp->convert_to(res);
@@ -106,9 +107,10 @@ TYPED_TEST(Coo, MovesToPrecision)
     auto tmp = OtherCoo::create(this->exec);
     auto res = Coo::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 2d4c61786ad..2dd68bd9239 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -794,9 +794,10 @@ TYPED_TEST(Csr, ConvertsToPrecision)
     auto tmp = OtherCsr::create(this->exec);
     auto res = Csr::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     // use mtx2 as mtx's strategy would involve creating a CudaExecutor
     this->mtx2->convert_to(tmp);
@@ -819,9 +820,10 @@ TYPED_TEST(Csr, MovesToPrecision)
     auto tmp = OtherCsr::create(this->exec);
     auto res = Csr::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     // use mtx2 as mtx's strategy would involve creating a CudaExecutor
     this->mtx2->move_to(tmp);
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 41294c89d49..51b0aa148fd 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -753,7 +753,7 @@ TYPED_TEST(Dense, ConvertsToPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -773,7 +773,7 @@ TYPED_TEST(Dense, MovesToPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index 208c9d98639..b0932c7eb66 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -91,9 +91,10 @@ TYPED_TEST(Diagonal, ConvertsToPrecision)
     auto tmp = OtherDiagonal::create(this->exec);
     auto res = Diagonal::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->diag1->convert_to(tmp);
     tmp->convert_to(res);
@@ -111,9 +112,10 @@ TYPED_TEST(Diagonal, MovesToPrecision)
     auto tmp = OtherDiagonal::create(this->exec);
     auto res = Diagonal::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->diag1->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index c96dcae773a..e1eef9f087c 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -449,9 +449,10 @@ TYPED_TEST(Ell, ConvertsToPrecision)
     auto tmp = OtherEll::create(this->exec);
     auto res = Ell::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -470,9 +471,10 @@ TYPED_TEST(Ell, MovesToPrecision)
     auto tmp = OtherEll::create(this->exec);
     auto res = Ell::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index cd82bade8b7..f7c6d2197ef 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -277,9 +277,10 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision)
     auto tmp = OtherFbcsr::create(this->exec);
     auto res = Fbcsr::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx->convert_to(tmp);
     tmp->convert_to(res);
@@ -298,9 +299,10 @@ TYPED_TEST(Fbcsr, MovesToPrecision)
     auto tmp = OtherFbcsr::create(this->exec);
     auto res = Fbcsr::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index 014b5bb1024..754e599b8fe 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -239,9 +239,10 @@ TYPED_TEST(Hybrid, ConvertsToPrecision)
     auto tmp = OtherHybrid::create(this->exec);
     auto res = Hybrid::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -260,9 +261,10 @@ TYPED_TEST(Hybrid, MovesToPrecision)
     auto tmp = OtherHybrid::create(this->exec);
     auto res = Hybrid::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 18cf793c7f3..a39d8e16832 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -195,9 +195,10 @@ TYPED_TEST(Sellp, ConvertsToPrecision)
     auto tmp = OtherSellp::create(this->exec);
     auto res = Sellp::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -216,9 +217,10 @@ TYPED_TEST(Sellp, MovesToPrecision)
     auto tmp = OtherSellp::create(this->exec);
     auto res = Sellp::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{r<OtherType>::value};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 0cfb3aca477..88fe4092668 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -741,7 +741,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->dist_mat->convert_to(tmp);
     tmp->convert_to(res);
@@ -768,7 +768,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision)
     // If OtherT is more precise: 0, otherwise r
     auto residual = r<OtherT>::value < r<T>::value
                         ? gko::remove_complex<T>{0}
-                        : gko::remove_complex<T>{r<OtherT>::value};
+                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
 
     this->dist_mat->move_to(tmp);
     tmp->convert_to(res);

From cdde2f978524c86bfc000d1848f15d5399a57129 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 13:57:12 +0200
Subject: [PATCH 347/448] add gko::half pretty-printer to gdb-ginkgo

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 dev_tools/scripts/gdb-ginkgo.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py
index d3de8f09a25..122d177031f 100644
--- a/dev_tools/scripts/gdb-ginkgo.py
+++ b/dev_tools/scripts/gdb-ginkgo.py
@@ -51,6 +51,7 @@ def next(self):
 
 _versioned_namespace = '__8::'
 
+
 # new version adapted from https://gcc.gnu.org/pipermail/gcc-cvs/2021-November/356230.html
 # necessary due to empty class optimization
 def is_specialization_of(x, template_name):
@@ -64,6 +65,7 @@ def is_specialization_of(x, template_name):
         expr = '^std::{}<.*>$'.format(template_name)
     return re.match(expr, x) is not None
 
+
 def get_template_arg_list(type_obj):
     "Return a type's template arguments as a list"
     n = 0
@@ -75,6 +77,7 @@ def get_template_arg_list(type_obj):
             return template_args
         n += 1
 
+
 def _tuple_impl_get(val):
     "Return the tuple element stored in a _Tuple_impl<N, T> base class."
     bases = val.type.fields()
@@ -95,6 +98,7 @@ def _tuple_impl_get(val):
     else:
         raise ValueError("Unsupported implementation for std::tuple: %s" % str(val.type))
 
+
 def tuple_get(n, val):
     "Return the result of std::get<n>(val) on a std::tuple"
     tuple_size = len(get_template_arg_list(val.type))
@@ -108,6 +112,7 @@ def tuple_get(n, val):
         n -= 1
     return _tuple_impl_get(node)
 
+
 def get_unique_ptr_data_ptr(val):
     "Return the result of val.get() on a std::unique_ptr"
     # std::unique_ptr<T, D> contains a std::tuple<D::pointer, D>,
@@ -220,12 +225,28 @@ def display_hint(self):
         return 'array'
 
 
+class GkoHalfPrinter:
+    "Print a gko::half"
+
+    def __init__(self, val):
+        # GDB doesn't seem to consider the user-defined conversion in its Value.cast,
+        # so we need to call the conversion operator explicitly
+        address = hex(val.address)
+        self.float_val = gdb.parse_and_eval(f"reinterpret_cast<gko::half*>({address})->operator float()")
+
+    def to_string(self):
+        self.float_val.fetch_lazy()
+        return self.float_val
+
+
 def lookup_type(val):
     if not str(val.type.unqualified()).startswith('gko::'):
         return None
     suffix = str(val.type.unqualified())[5:]
     if suffix.startswith('array<') and val.type.code == gdb.TYPE_CODE_STRUCT:
         return GkoArrayPrinter(val)
+    if suffix.startswith("half") and val.type.code == gdb.TYPE_CODE_STRUCT:
+        return GkoHalfPrinter(val)
     return None
 
 

From a18e836a38f89771d2ad122df9e5edf3744e2d50 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 02:28:45 +0200
Subject: [PATCH 348/448] make half.hpp not rely on types.hpp

---
 core/test/base/half.cpp            |  2 +-
 include/ginkgo/core/base/half.hpp  | 51 ++++++++++++++++--------------
 include/ginkgo/core/base/types.hpp |  5 ++-
 3 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
index f20bac0d47a..51d3e60ce40 100644
--- a/core/test/base/half.cpp
+++ b/core/test/base/half.cpp
@@ -37,7 +37,7 @@ class ExtendedFloatTestBase : public ::testing::Test {
 protected:
     using half = gko::half;
 
-    static constexpr auto byte_size = gko::byte_size;
+    static constexpr auto byte_size = gko::detail::byte_size;
 
     template <std::size_t N>
     static floating<N - 1> create_from_bits(const char (&s)[N])
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
index 25a38abb6eb..fb5761c51fb 100644
--- a/include/ginkgo/core/base/half.hpp
+++ b/include/ginkgo/core/base/half.hpp
@@ -6,13 +6,12 @@
 #define GKO_PUBLIC_CORE_BASE_HALF_HPP_
 
 
+#include <climits>
 #include <complex>
+#include <cstdint>
 #include <cstring>
 #include <type_traits>
 
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-
 
 class __half;
 
@@ -20,29 +19,34 @@ class __half;
 namespace gko {
 
 
-template <typename, size_type, size_type>
+template <typename, std::size_t, std::size_t>
 class truncated;
 
 
+class half;
+
+
 namespace detail {
 
 
+constexpr std::size_t byte_size = CHAR_BIT;
+
 template <std::size_t, typename = void>
 struct uint_of_impl {};
 
 template <std::size_t Bits>
 struct uint_of_impl<Bits, std::enable_if_t<(Bits <= 16)>> {
-    using type = uint16;
+    using type = std::uint16_t;
 };
 
 template <std::size_t Bits>
 struct uint_of_impl<Bits, std::enable_if_t<(16 < Bits && Bits <= 32)>> {
-    using type = uint32;
+    using type = std::uint32_t;
 };
 
 template <std::size_t Bits>
 struct uint_of_impl<Bits, std::enable_if_t<(32 < Bits) && (Bits <= 64)>> {
-    using type = uint64;
+    using type = std::uint64_t;
 };
 
 template <std::size_t Bits>
@@ -53,8 +57,8 @@ template <typename T>
 struct basic_float_traits {};
 
 template <>
-struct basic_float_traits<float16> {
-    using type = float16;
+struct basic_float_traits<half> {
+    using type = half;
     static constexpr int sign_bits = 1;
     static constexpr int significand_bits = 10;
     static constexpr int exponent_bits = 5;
@@ -71,8 +75,8 @@ struct basic_float_traits<__half> {
 };
 
 template <>
-struct basic_float_traits<float32> {
-    using type = float32;
+struct basic_float_traits<float> {
+    using type = float;
     static constexpr int sign_bits = 1;
     static constexpr int significand_bits = 23;
     static constexpr int exponent_bits = 8;
@@ -80,15 +84,16 @@ struct basic_float_traits<float32> {
 };
 
 template <>
-struct basic_float_traits<float64> {
-    using type = float64;
+struct basic_float_traits<double> {
+    using type = double;
     static constexpr int sign_bits = 1;
     static constexpr int significand_bits = 52;
     static constexpr int exponent_bits = 11;
     static constexpr bool rounds_to_nearest = true;
 };
 
-template <typename FloatType, size_type NumComponents, size_type ComponentId>
+template <typename FloatType, std::size_t NumComponents,
+          std::size_t ComponentId>
 struct basic_float_traits<truncated<FloatType, NumComponents, ComponentId>> {
     using type = truncated<FloatType, NumComponents, ComponentId>;
     static constexpr int sign_bits = ComponentId == 0 ? 1 : 0;
@@ -281,7 +286,7 @@ struct precision_converter<SourceType, ResultType, false> {
 class half {
 public:
     // create half value from the bits directly.
-    static constexpr half create_from_bits(uint16 bits) noexcept
+    static constexpr half create_from_bits(std::uint16_t bits) noexcept
     {
         half result;
         result.data_ = bits;
@@ -376,19 +381,19 @@ class half {
     }
 
 private:
-    using f16_traits = detail::float_traits<float16>;
-    using f32_traits = detail::float_traits<float32>;
+    using f16_traits = detail::float_traits<half>;
+    using f32_traits = detail::float_traits<float>;
 
     void float2half(float val) noexcept
     {
-        uint32 bit_val(0);
+        std::uint32_t bit_val(0);
         std::memcpy(&bit_val, &val, sizeof(float));
         data_ = float2half(bit_val);
     }
 
-    static constexpr uint16 float2half(uint32 data_) noexcept
+    static constexpr std::uint16_t float2half(std::uint32_t data_) noexcept
     {
-        using conv = detail::precision_converter<float32, float16>;
+        using conv = detail::precision_converter<float, half>;
         if (f32_traits::is_inf(data_)) {
             return conv::shift_sign(data_) | f16_traits::exponent_mask;
         } else if (f32_traits::is_nan(data_)) {
@@ -417,9 +422,9 @@ class half {
         }
     }
 
-    static constexpr uint32 half2float(uint16 data_) noexcept
+    static constexpr std::uint32_t half2float(std::uint16_t data_) noexcept
     {
-        using conv = detail::precision_converter<float16, float32>;
+        using conv = detail::precision_converter<half, float>;
         if (f16_traits::is_inf(data_)) {
             return conv::shift_sign(data_) | f32_traits::exponent_mask;
         } else if (f16_traits::is_nan(data_)) {
@@ -434,7 +439,7 @@ class half {
         }
     }
 
-    uint16 data_;
+    std::uint16_t data_;
 };
 
 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 72dd8a93584..1d5963c0fe8 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -17,6 +17,8 @@
 #include <string>
 #include <type_traits>
 
+#include <ginkgo/core/base/half.hpp>
+
 
 #ifdef __HIPCC__
 #include <hip/hip_runtime.h>
@@ -138,9 +140,6 @@ using uint64 = std::uint64_t;
 using uintptr = std::uintptr_t;
 
 
-class half;
-
-
 /**
  * Half precision floating point type.
  */

From 34a5ef5af33733719ccda57caf30b230dae523c8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 14:02:22 +0200
Subject: [PATCH 349/448] collect the reused part and undef after usage

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 .pre-commit-config.yaml                |   4 +
 core/test/base/extended_float.cpp      | 108 ++++------------------
 core/test/base/floating_bit_helper.hpp |  82 +++++++++++++++++
 core/test/base/half.cpp                | 123 +++++--------------------
 include/ginkgo/core/base/half.hpp      |   7 ++
 5 files changed, 133 insertions(+), 191 deletions(-)
 create mode 100644 core/test/base/floating_bit_helper.hpp

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fca3a1ef28f..8eccb113759 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,8 @@
 repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+    -  id: end-of-file-fixer
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: 'v14.0.0'  # The default in Ubuntu 22.04, which is used in our CI
   hooks:
diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp
index bdb7a58ed84..764f5fc0c8d 100644
--- a/core/test/base/extended_float.cpp
+++ b/core/test/base/extended_float.cpp
@@ -9,86 +9,22 @@
 
 #include <gtest/gtest.h>
 
-#include <ginkgo/core/base/half.hpp>
-
-
-namespace {
-
-
-template <std::size_t N>
-struct floating_impl;
-
-template <>
-struct floating_impl<16> {
-    using type = gko::half;
-};
-
-template <>
-struct floating_impl<32> {
-    using type = float;
-};
-
-template <>
-struct floating_impl<64> {
-    using type = double;
-};
-
-template <std::size_t N>
-using floating = typename floating_impl<N>::type;
-
-
-class ExtendedFloatTestBase : public ::testing::Test {
-protected:
-    using half = gko::half;
-    template <typename T, std::size_t NumComponents, std::size_t ComponentId>
-    using truncated = gko::truncated<T, NumComponents, ComponentId>;
-
-    static constexpr auto byte_size = gko::byte_size;
-
-    template <std::size_t N>
-    static floating<N - 1> create_from_bits(const char (&s)[N])
-    {
-        auto bits = std::bitset<N - 1>(s).to_ullong();
-        // We cast to the same size of integer type first.
-        // Otherwise, the first memory chunk is different when we use
-        // reinterpret_cast or memcpy to get the smaller type out of unsigned
-        // long long.
-        using bits_type =
-            typename gko::detail::float_traits<floating<N - 1>>::bits_type;
-        auto bits_val = static_cast<bits_type>(bits);
-        floating<N - 1> result;
-        static_assert(sizeof(floating<N - 1>) == sizeof(bits_type),
-                      "the type should have the same size as its bits_type");
-        std::memcpy(&result, &bits_val, sizeof(bits_type));
-        return result;
-    }
-
-    template <typename T>
-    static std::bitset<sizeof(T) * byte_size> get_bits(T val)
-    {
-        using bits_type = typename gko::detail::float_traits<T>::bits_type;
-        bits_type bits;
-        static_assert(sizeof(T) == sizeof(bits_type),
-                      "the type should have the same size as its bits_type");
-        std::memcpy(&bits, &val, sizeof(T));
-        return std::bitset<sizeof(T) * byte_size>(bits);
-    }
-
-    template <std::size_t N>
-    static std::bitset<N - 1> get_bits(const char (&s)[N])
-    {
-        return std::bitset<N - 1>(s);
-    }
-};
-
-
-class TruncatedDouble : public ExtendedFloatTestBase {};
+#include "core/test/base/floating_bit_helper.hpp"
+
+
+using namespace floating_bit_helper;
+
+using half = gko::half;
+
+template <typename T, std::size_t NumComponents, std::size_t ComponentId>
+using truncated = gko::truncated<T, NumComponents, ComponentId>;
+
 
 // clang-format does terrible formatting of string literal concatenation
 // clang-format off
 
 
-TEST_F(TruncatedDouble, SplitsDoubleToHalves)
+TEST(TruncatedDouble, SplitsDoubleToHalves)
 {
     double x = create_from_bits("1" "11110100100" "1111" "1000110110110101"
                                 "1100101011010101" "1001011101110111");
@@ -102,7 +38,7 @@ TEST_F(TruncatedDouble, SplitsDoubleToHalves)
 }
 
 
-TEST_F(TruncatedDouble, AssemblesDoubleFromHalves)
+TEST(TruncatedDouble, AssemblesDoubleFromHalves)
 {
     double x = create_from_bits("1" "11110100100" "1111" "1000110110110101"
                                 "1100101011010101" "1001011101110111");
@@ -121,7 +57,7 @@ TEST_F(TruncatedDouble, AssemblesDoubleFromHalves)
 }
 
 
-TEST_F(TruncatedDouble, SplitsDoubleToQuarters)
+TEST(TruncatedDouble, SplitsDoubleToQuarters)
 {
     double x = create_from_bits("1" "11110100100" "1111" "1000110110110101"
                                 "1100101011010101" "1001011101110111");
@@ -138,7 +74,7 @@ TEST_F(TruncatedDouble, SplitsDoubleToQuarters)
 }
 
 
-TEST_F(TruncatedDouble, AssemblesDoubleFromQuarters)
+TEST(TruncatedDouble, AssemblesDoubleFromQuarters)
 {
     double x = create_from_bits("1" "11110100100" "1111" "1000110110110101"
                                 "1100101011010101" "1001011101110111");
@@ -167,16 +103,7 @@ TEST_F(TruncatedDouble, AssemblesDoubleFromQuarters)
 }
 
 
-// clang-format on
-
-
-class TruncatedFloat : public ExtendedFloatTestBase {};
-
-
-// clang-format off
-
-
-TEST_F(TruncatedFloat, SplitsFloatToHalves)
+TEST(TruncatedFloat, SplitsFloatToHalves)
 {
     float x = create_from_bits("1" "11110100" "1001111" "1000110110110101");
 
@@ -188,7 +115,7 @@ TEST_F(TruncatedFloat, SplitsFloatToHalves)
 }
 
 
-TEST_F(TruncatedFloat, AssemblesFloatFromHalves)
+TEST(TruncatedFloat, AssemblesFloatFromHalves)
 {
     float x = create_from_bits("1" "11110100" "1001111" "1000110110110101");
     auto p1 = static_cast<truncated<float, 2, 0>>(x);
@@ -205,6 +132,3 @@ TEST_F(TruncatedFloat, AssemblesFloatFromHalves)
 
 
 // clang-format on
-
-
-}  // namespace
diff --git a/core/test/base/floating_bit_helper.hpp b/core/test/base/floating_bit_helper.hpp
new file mode 100644
index 00000000000..bbdc76ee9c2
--- /dev/null
+++ b/core/test/base/floating_bit_helper.hpp
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_TEST_BASE_FLOATING_BIT_HELPER_HPP_
+#define GKO_CORE_TEST_BASE_FLOATING_BIT_HELPER_HPP_
+
+
+#include <bitset>
+#include <cstring>
+
+#include <ginkgo/core/base/half.hpp>
+
+namespace floating_bit_helper {
+
+
+constexpr auto byte_size = gko::detail::byte_size;
+
+
+template <std::size_t N>
+struct floating_impl;
+
+template <>
+struct floating_impl<16> {
+    using type = gko::half;
+};
+
+template <>
+struct floating_impl<32> {
+    using type = float;
+};
+
+template <>
+struct floating_impl<64> {
+    using type = double;
+};
+
+
+template <std::size_t N>
+using floating = typename floating_impl<N>::type;
+
+
+template <std::size_t N>
+floating<N - 1> create_from_bits(const char (&s)[N])
+{
+    auto bits = std::bitset<N - 1>(s).to_ullong();
+    // Narrow the value to an integer of the same size as the result first.
+    // Copying sizeof(bits_type) bytes directly out of the unsigned long long
+    // (via memcpy or reinterpret_cast) could read a different memory chunk
+    // than the one holding the requested bits.
+    using bits_type =
+        typename gko::detail::float_traits<floating<N - 1>>::bits_type;
+    auto bits_val = static_cast<bits_type>(bits);
+    floating<N - 1> result;
+    static_assert(sizeof(floating<N - 1>) == sizeof(bits_type),
+                  "the type should have the same size as its bits_type");
+    std::memcpy(&result, &bits_val, sizeof(bits_type));
+    return result;
+}
+
+
+template <typename T>
+std::bitset<sizeof(T) * byte_size> get_bits(T val)
+{
+    using bits_type = typename gko::detail::float_traits<T>::bits_type;
+    bits_type bits;
+    static_assert(sizeof(T) == sizeof(bits_type),
+                  "the type should have the same size as its bits_type");
+    std::memcpy(&bits, &val, sizeof(T));
+    return std::bitset<sizeof(T) * byte_size>(bits);
+}
+
+template <std::size_t N>
+std::bitset<N - 1> get_bits(const char (&s)[N])
+{
+    return std::bitset<N - 1>(s);
+}
+
+
+}  // namespace floating_bit_helper
+
+#endif  // GKO_CORE_TEST_BASE_FLOATING_BIT_HELPER_HPP_
diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
index 51d3e60ce40..39c47c49e15 100644
--- a/core/test/base/half.cpp
+++ b/core/test/base/half.cpp
@@ -2,88 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <bitset>
-#include <cstring>
-#include <string>
-
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/half.hpp>
 
+#include "core/test/base/floating_bit_helper.hpp"
+
 
-template <std::size_t N>
-struct floating_impl;
-
-template <>
-struct floating_impl<16> {
-    using type = gko::half;
-};
-
-template <>
-struct floating_impl<32> {
-    using type = float;
-};
-
-template <>
-struct floating_impl<64> {
-    using type = double;
-};
-
-template <std::size_t N>
-using floating = typename floating_impl<N>::type;
-
-
-class ExtendedFloatTestBase : public ::testing::Test {
-protected:
-    using half = gko::half;
-
-    static constexpr auto byte_size = gko::detail::byte_size;
-
-    template <std::size_t N>
-    static floating<N - 1> create_from_bits(const char (&s)[N])
-    {
-        auto bits = std::bitset<N - 1>(s).to_ullong();
-        // We cast to the same size of integer type first.
-        // Otherwise, the first memory chunk is different when we use
-        // reinterpret_cast or memcpy to get the smaller type out of unsigned
-        // long long.
-        using bits_type =
-            typename gko::detail::float_traits<floating<N - 1>>::bits_type;
-        auto bits_val = static_cast<bits_type>(bits);
-        floating<N - 1> result;
-        static_assert(sizeof(floating<N - 1>) == sizeof(bits_type),
-                      "the type should have the same size as its bits_type");
-        std::memcpy(&result, &bits_val, sizeof(bits_type));
-        return result;
-    }
-
-    template <typename T>
-    static std::bitset<sizeof(T) * byte_size> get_bits(T val)
-    {
-        using bits_type = typename gko::detail::float_traits<T>::bits_type;
-        bits_type bits;
-        static_assert(sizeof(T) == sizeof(bits_type),
-                      "the type should have the same size as its bits_type");
-        std::memcpy(&bits, &val, sizeof(T));
-        return std::bitset<sizeof(T) * byte_size>(bits);
-    }
-
-    template <std::size_t N>
-    static std::bitset<N - 1> get_bits(const char (&s)[N])
-    {
-        return std::bitset<N - 1>(s);
-    }
-};
-
-
-class FloatToHalf : public ExtendedFloatTestBase {};
+using half = gko::half;
+using namespace floating_bit_helper;
 
 
 // clang-format does terrible formatting of string literal concatenation
 // clang-format off
 
 
-TEST_F(FloatToHalf, ConvertsOne)
+TEST(FloatToHalf, ConvertsOne)
 {
     half x = create_from_bits("0" "01111111" "00000000000000000000000");
 
@@ -91,7 +25,7 @@ TEST_F(FloatToHalf, ConvertsOne)
 }
 
 
-TEST_F(FloatToHalf, ConvertsZero)
+TEST(FloatToHalf, ConvertsZero)
 {
     half x = create_from_bits("0" "00000000" "00000000000000000000000");
 
@@ -99,7 +33,7 @@ TEST_F(FloatToHalf, ConvertsZero)
 }
 
 
-TEST_F(FloatToHalf, ConvertsInf)
+TEST(FloatToHalf, ConvertsInf)
 {
     half x = create_from_bits("0" "11111111" "00000000000000000000000");
 
@@ -107,7 +41,7 @@ TEST_F(FloatToHalf, ConvertsInf)
 }
 
 
-TEST_F(FloatToHalf, ConvertsNegInf)
+TEST(FloatToHalf, ConvertsNegInf)
 {
     half x = create_from_bits("1" "11111111" "00000000000000000000000");
 
@@ -115,7 +49,7 @@ TEST_F(FloatToHalf, ConvertsNegInf)
 }
 
 
-TEST_F(FloatToHalf, ConvertsNan)
+TEST(FloatToHalf, ConvertsNan)
 {
     half x = create_from_bits("0" "11111111" "00000000000000000000001");
 
@@ -128,7 +62,7 @@ TEST_F(FloatToHalf, ConvertsNan)
 }
 
 
-TEST_F(FloatToHalf, ConvertsNegNan)
+TEST(FloatToHalf, ConvertsNegNan)
 {
     half x = create_from_bits("1" "11111111" "00010000000000000000000");
 
@@ -141,7 +75,7 @@ TEST_F(FloatToHalf, ConvertsNegNan)
 }
 
 
-TEST_F(FloatToHalf, FlushesToZero)
+TEST(FloatToHalf, FlushesToZero)
 {
     half x = create_from_bits("0" "00000111" "00010001000100000001000");
 
@@ -149,7 +83,7 @@ TEST_F(FloatToHalf, FlushesToZero)
 }
 
 
-TEST_F(FloatToHalf, FlushesToNegZero)
+TEST(FloatToHalf, FlushesToNegZero)
 {
     half x = create_from_bits("1" "00000010" "00010001000100000001000");
 
@@ -157,7 +91,7 @@ TEST_F(FloatToHalf, FlushesToNegZero)
 }
 
 
-TEST_F(FloatToHalf, FlushesToInf)
+TEST(FloatToHalf, FlushesToInf)
 {
     half x = create_from_bits("0" "10100000" "10010000000000010000100");
 
@@ -165,7 +99,7 @@ TEST_F(FloatToHalf, FlushesToInf)
 }
 
 
-TEST_F(FloatToHalf, FlushesToNegInf)
+TEST(FloatToHalf, FlushesToNegInf)
 {
     half x = create_from_bits("1" "11000000" "10010000000000010000100");
 
@@ -173,7 +107,7 @@ TEST_F(FloatToHalf, FlushesToNegInf)
 }
 
 
-TEST_F(FloatToHalf, TruncatesSmallNumber)
+TEST(FloatToHalf, TruncatesSmallNumber)
 {
     half x = create_from_bits("0" "01110001" "10010000000000010000100");
 
@@ -181,7 +115,7 @@ TEST_F(FloatToHalf, TruncatesSmallNumber)
 }
 
 
-TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven)
+TEST(FloatToHalf, TruncatesLargeNumberRoundToEven)
 {
     half neg_x = create_from_bits("1" "10001110" "10010011111000010000100");
     half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100");
@@ -199,16 +133,7 @@ TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven)
 }
 
 
-// clang-format on
-
-
-class HalfToFloat : public ExtendedFloatTestBase {};
-
-
-// clang-format off
-
-
-TEST_F(HalfToFloat, ConvertsOne)
+TEST(HalfToFloat, ConvertsOne)
 {
     float x = create_from_bits("0" "01111" "0000000000");
 
@@ -216,7 +141,7 @@ TEST_F(HalfToFloat, ConvertsOne)
 }
 
 
-TEST_F(HalfToFloat, ConvertsZero)
+TEST(HalfToFloat, ConvertsZero)
 {
     float x = create_from_bits("0" "00000" "0000000000");
 
@@ -224,7 +149,7 @@ TEST_F(HalfToFloat, ConvertsZero)
 }
 
 
-TEST_F(HalfToFloat, ConvertsInf)
+TEST(HalfToFloat, ConvertsInf)
 {
     float x = create_from_bits("0" "11111" "0000000000");
 
@@ -232,7 +157,7 @@ TEST_F(HalfToFloat, ConvertsInf)
 }
 
 
-TEST_F(HalfToFloat, ConvertsNegInf)
+TEST(HalfToFloat, ConvertsNegInf)
 {
     float x = create_from_bits("1" "11111" "0000000000");
 
@@ -240,7 +165,7 @@ TEST_F(HalfToFloat, ConvertsNegInf)
 }
 
 
-TEST_F(HalfToFloat, ConvertsNan)
+TEST(HalfToFloat, ConvertsNan)
 {
     float x = create_from_bits("0" "11111" "0001001000");
 
@@ -253,7 +178,7 @@ TEST_F(HalfToFloat, ConvertsNan)
 }
 
 
-TEST_F(HalfToFloat, ConvertsNegNan)
+TEST(HalfToFloat, ConvertsNegNan)
 {
     float x = create_from_bits("1" "11111" "0000000001");
 
@@ -266,7 +191,7 @@ TEST_F(HalfToFloat, ConvertsNegNan)
 }
 
 
-TEST_F(HalfToFloat, ExtendsSmallNumber)
+TEST(HalfToFloat, ExtendsSmallNumber)
 {
     float x = create_from_bits("0" "00001" "1000010001");
 
@@ -274,7 +199,7 @@ TEST_F(HalfToFloat, ExtendsSmallNumber)
 }
 
 
-TEST_F(HalfToFloat, ExtendsLargeNumber)
+TEST(HalfToFloat, ExtendsLargeNumber)
 {
     float x = create_from_bits("1" "11110" "1001001111");
 
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
index fb5761c51fb..b559ad2cfc5 100644
--- a/include/ginkgo/core/base/half.hpp
+++ b/include/ginkgo/core/base/half.hpp
@@ -334,11 +334,14 @@ class half {
         data_ = result.data_;                                      \
         return *this;                                              \
     }
+
     HALF_OPERATOR(+, +=)
     HALF_OPERATOR(-, -=)
     HALF_OPERATOR(*, *=)
     HALF_OPERATOR(/, /=)
 
+#undef HALF_OPERATOR
+
     // Do operation with different type
     // If it is floating point, using floating point as type.
     // If it is integer, using half as type
@@ -373,6 +376,8 @@ class half {
     HALF_FRIEND_OPERATOR(*, *=)
     HALF_FRIEND_OPERATOR(/, /=)
 
+#undef HALF_FRIEND_OPERATOR
+
     // the negative
     half operator-() const
     {
@@ -588,6 +593,8 @@ class complex<gko::half> {
     COMPLEX_HALF_OPERATOR(*, *=)
     COMPLEX_HALF_OPERATOR(/, /=)
 
+#undef COMPLEX_HALF_OPERATOR
+
 private:
     value_type real_;
     value_type imag_;

From 14ef89c4253c9aad87bf33c18cb55eaab2a490e1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 12:14:54 +0200
Subject: [PATCH 350/448] use memcpy not std::memcpy in hip

---
 hip/components/memory.hip.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp
index d8238c11795..8a98ee822b8 100644
--- a/hip/components/memory.hip.hpp
+++ b/hip/components/memory.hip.hpp
@@ -99,7 +99,7 @@ __device__ __forceinline__ ValueType load_generic(const ValueType* ptr)
     auto cast_value = HIP_ATOMIC_LOAD(reinterpret_cast<const atomic_type*>(ptr),
                                       memorder, scope);
     ValueType result{};
-    std::memcpy(&result, &cast_value, sizeof(ValueType));
+    memcpy(&result, &cast_value, sizeof(ValueType));
     return result;
 }
 
@@ -122,7 +122,7 @@ __device__ __forceinline__ void store_generic(ValueType* ptr, ValueType value)
     static_assert(sizeof(atomic_type) == sizeof(ValueType), "invalid map");
     static_assert(alignof(atomic_type) == alignof(ValueType), "invalid map");
     atomic_type cast_value{};
-    std::memcpy(&cast_value, &value, sizeof(ValueType));
+    memcpy(&cast_value, &value, sizeof(ValueType));
     HIP_ATOMIC_STORE(reinterpret_cast<atomic_type*>(ptr), cast_value, memorder,
                      scope);
 }

From 368cd08da042d0d2a6a433fb69eb3e2901022568 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 18 Nov 2024 11:10:13 +0100
Subject: [PATCH 351/448] add alignment

---
 core/test/base/half.cpp           | 8 ++++++++
 include/ginkgo/core/base/half.hpp | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
index 39c47c49e15..7fcd0ffa70f 100644
--- a/core/test/base/half.cpp
+++ b/core/test/base/half.cpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <cstdint>
+
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/half.hpp>
@@ -13,6 +15,12 @@ using half = gko::half;
 using namespace floating_bit_helper;
 
 
+TEST(Half, SizeAndAlign)
+{
+    ASSERT_EQ(sizeof(half), sizeof(std::uint16_t));
+    ASSERT_EQ(alignof(half), alignof(std::uint16_t));
+}
+
 // clang-format does terrible formatting of string literal concatenation
 // clang-format off
 
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
index b559ad2cfc5..bd04d2da832 100644
--- a/include/ginkgo/core/base/half.hpp
+++ b/include/ginkgo/core/base/half.hpp
@@ -283,7 +283,7 @@ struct precision_converter<SourceType, ResultType, false> {
  * For now the only features are reduced storage compared to single precision
  * and conversions from and to single precision floating point type.
  */
-class half {
+class alignas(std::uint16_t) half {
 public:
     // create half value from the bits directly.
     static constexpr half create_from_bits(std::uint16_t bits) noexcept

From f198aac1d76a52a118cbcda417ef88af2a1e5ca1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 18 Nov 2024 13:45:44 +0100
Subject: [PATCH 352/448] delete the sycl half test as we do not enable it
 directly

---
 core/test/base/half.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
index 7fcd0ffa70f..82732c62d16 100644
--- a/core/test/base/half.cpp
+++ b/core/test/base/half.cpp
@@ -61,12 +61,7 @@ TEST(FloatToHalf, ConvertsNan)
 {
     half x = create_from_bits("0" "11111111" "00000000000000000000001");
 
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // Sycl put the 1000000000, but ours put mask
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000"));
-    #else
     ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111"));
-    #endif
 }
 
 
@@ -74,12 +69,7 @@ TEST(FloatToHalf, ConvertsNegNan)
 {
     half x = create_from_bits("1" "11111111" "00010000000000000000000");
 
-    #if defined(SYCL_LANGUAGE_VERSION)
-    // Sycl put the 1000000000, but ours put mask
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000"));
-    #else
     ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111"));
-    #endif
 }
 
 
@@ -177,12 +167,7 @@ TEST(HalfToFloat, ConvertsNan)
 {
     float x = create_from_bits("0" "11111" "0001001000");
 
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // sycl keeps significand
-    ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000"));
-    #else
     ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111"));
-    #endif
 }
 
 
@@ -190,12 +175,7 @@ TEST(HalfToFloat, ConvertsNegNan)
 {
     float x = create_from_bits("1" "11111" "0000000001");
 
-    #if defined(SYCL_LANGUAGE_VERSION) 
-    // sycl keeps significand
-    ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000"));
-    #else
     ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111"));
-    #endif
 }
 
 

From a3982f22dde08d7dfc11415edcec0191ce749267 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sat, 30 Nov 2024 00:25:05 +0100
Subject: [PATCH 353/448] use reference for half when it is possible

---
 include/ginkgo/core/base/half.hpp | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
index bd04d2da832..b6d11dd7c64 100644
--- a/include/ginkgo/core/base/half.hpp
+++ b/include/ginkgo/core/base/half.hpp
@@ -286,7 +286,7 @@ struct precision_converter<SourceType, ResultType, false> {
 class alignas(std::uint16_t) half {
 public:
     // create half value from the bits directly.
-    static constexpr half create_from_bits(std::uint16_t bits) noexcept
+    static constexpr half create_from_bits(const std::uint16_t& bits) noexcept
     {
         half result;
         result.data_ = bits;
@@ -299,13 +299,13 @@ class alignas(std::uint16_t) half {
     constexpr half() noexcept : data_(0){};
 
     template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
-    half(const T val) : data_(0)
+    half(const T& val) : data_(0)
     {
         this->float2half(static_cast<float>(val));
     }
 
     template <typename V>
-    half& operator=(const V val)
+    half& operator=(const V& val)
     {
         this->float2half(static_cast<float>(val));
         return *this;
@@ -323,7 +323,7 @@ class alignas(std::uint16_t) half {
     // operation will cast it to float and then do float operation such that it
     // becomes float in the end.
 #define HALF_OPERATOR(_op, _opeq)                                  \
-    friend half operator _op(const half lhf, const half rhf)       \
+    friend half operator _op(const half& lhf, const half& rhf)     \
     {                                                              \
         return static_cast<half>(static_cast<float>(lhf)           \
                                      _op static_cast<float>(rhf)); \
@@ -350,7 +350,7 @@ class alignas(std::uint16_t) half {
     friend std::enable_if_t<                                               \
         !std::is_same<T, half>::value && std::is_scalar<T>::value,         \
         std::conditional_t<std::is_floating_point<T>::value, T, half>>     \
-    operator _op(const half hf, const T val)                               \
+    operator _op(const half& hf, const T& val)                             \
     {                                                                      \
         using type =                                                       \
             std::conditional_t<std::is_floating_point<T>::value, T, half>; \
@@ -362,7 +362,7 @@ class alignas(std::uint16_t) half {
     friend std::enable_if_t<                                               \
         !std::is_same<T, half>::value && std::is_scalar<T>::value,         \
         std::conditional_t<std::is_floating_point<T>::value, T, half>>     \
-    operator _op(const T val, const half hf)                               \
+    operator _op(const T& val, const half& hf)                             \
     {                                                                      \
         using type =                                                       \
             std::conditional_t<std::is_floating_point<T>::value, T, half>; \
@@ -389,7 +389,7 @@ class alignas(std::uint16_t) half {
     using f16_traits = detail::float_traits<half>;
     using f32_traits = detail::float_traits<float>;
 
-    void float2half(float val) noexcept
+    void float2half(const float& val) noexcept
     {
         std::uint32_t bit_val(0);
         std::memcpy(&bit_val, &val, sizeof(float));
@@ -576,16 +576,12 @@ class complex<gko::half> {
         return *this;
     }
 
-// It's for MacOS.
-// TODO: check whether mac compiler always use complex version even when real
-// half
-#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                \
-    friend complex<gko::half> operator _op(const complex<gko::half> lhf, \
-                                           const complex<gko::half> rhf) \
-    {                                                                    \
-        auto a = lhf;                                                    \
-        a _opeq rhf;                                                     \
-        return a;                                                        \
+#define COMPLEX_HALF_OPERATOR(_op, _opeq)                               \
+    friend complex operator _op(const complex& lhf, const complex& rhf) \
+    {                                                                   \
+        auto a = lhf;                                                   \
+        a _opeq rhf;                                                    \
+        return a;                                                       \
     }
 
     COMPLEX_HALF_OPERATOR(+, +=)

From 92edd8f705d23571aeaaf2b40bd6bfcba2eb68ea Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 18:02:36 +0200
Subject: [PATCH 354/448] instantiation/testing/next/prev/stub type definition

---
 core/base/mixed_precision_types.hpp      | 151 +++++++++++++++++++++++
 core/device_hooks/common_kernels.inc.cpp |  63 +++++++++-
 core/test/utils.hpp                      |  48 ++++++-
 include/ginkgo/core/base/math.hpp        |  45 +++++++
 include/ginkgo/core/base/types.hpp       | 116 +++++++++++++++++
 5 files changed, 418 insertions(+), 5 deletions(-)

diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
index d9747e5cad8..5ef5de94e34 100644
--- a/core/base/mixed_precision_types.hpp
+++ b/core/base/mixed_precision_types.hpp
@@ -7,23 +7,44 @@
 
 
 #include <ginkgo/config.hpp>
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
 #ifdef GINKGO_MIXED_PRECISION
 
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
     template _macro(float, float, float, __VA_ARGS__);                \
     template _macro(float, float, double, __VA_ARGS__);               \
     template _macro(float, double, float, __VA_ARGS__);               \
     template _macro(float, double, double, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \
+    GKO_ADAPT_HF(template _macro(float, half, half, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(float, half, float, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(float, half, double, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(float, float, half, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(float, double, half, __VA_ARGS__))
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
     template _macro(double, float, float, __VA_ARGS__);               \
     template _macro(double, float, double, __VA_ARGS__);              \
     template _macro(double, double, float, __VA_ARGS__);              \
     template _macro(double, double, double, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \
+    GKO_ADAPT_HF(template _macro(double, half, half, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(double, half, float, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(double, half, double, __VA_ARGS__));      \
+    GKO_ADAPT_HF(template _macro(double, float, half, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(double, double, half, __VA_ARGS__))
+
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
     template _macro(std::complex<float>, std::complex<float>,         \
                     std::complex<float>, __VA_ARGS__);                \
@@ -33,6 +54,19 @@
                     std::complex<float>, __VA_ARGS__);                \
     template _macro(std::complex<float>, std::complex<double>,        \
                     std::complex<double>, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro,  \
+                                                                   ...)     \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__);  \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,   \
+                                 std::complex<half>, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,   \
+                                 std::complex<float>, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,   \
+                                 std::complex<double>, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<float>,  \
+                                 std::complex<half>, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<double>, \
+                                 std::complex<half>, __VA_ARGS__))
 
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
     template _macro(std::complex<double>, std::complex<float>,        \
@@ -44,22 +78,95 @@
     template _macro(std::complex<double>, std::complex<double>,       \
                     std::complex<double>, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro,   \
+                                                                   ...)      \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__);   \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,   \
+                                 std::complex<half>, __VA_ARGS__));          \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,   \
+                                 std::complex<float>, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,   \
+                                 std::complex<double>, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<float>,  \
+                                 std::complex<half>, __VA_ARGS__));          \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<double>, \
+                                 std::complex<half>, __VA_ARGS__))
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__));          \
+    GKO_ADAPT_HF(template _macro(half, half, float, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(half, half, double, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(half, float, half, __VA_ARGS__));         \
+    GKO_ADAPT_HF(template _macro(half, float, float, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(half, float, double, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(half, double, half, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(half, double, float, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(half, double, double, __VA_ARGS__))
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
+                                 std::complex<half>, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
+                                 std::complex<float>, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
+                                 std::complex<double>, __VA_ARGS__));      \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>,  \
+                                 std::complex<half>, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>,  \
+                                 std::complex<float>, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>,  \
+                                 std::complex<double>, __VA_ARGS__));      \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>, \
+                                 std::complex<half>, __VA_ARGS__));        \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>, \
+                                 std::complex<float>, __VA_ARGS__));       \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>, \
+                                 std::complex<double>, __VA_ARGS__))
+
 #else
 
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
     template _macro(float, float, float, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__)
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
     template _macro(double, double, double, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__)
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
     template _macro(std::complex<float>, std::complex<float>,         \
                     std::complex<float>, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__)
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
     template _macro(std::complex<double>, std::complex<double>,       \
                     std::complex<double>, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__))
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \
+                                                                   ...)    \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
+                                 std::complex<half>, __VA_ARGS__))
+
+
 #endif
 
 
@@ -69,11 +176,27 @@
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, ...)     \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro,       \
+                                                               __VA_ARGS__)
 
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32);       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int32);       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int64)
 
 #ifdef GINKGO_MIXED_PRECISION
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)             \
@@ -85,12 +208,36 @@
     template _macro(std::complex<float>, std::complex<double>, __VA_ARGS__); \
     template _macro(std::complex<double>, std::complex<float>, __VA_ARGS__); \
     template _macro(std::complex<double>, std::complex<double>, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...)     \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__);          \
+    GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__));                    \
+    GKO_ADAPT_HF(template _macro(half, float, __VA_ARGS__));                   \
+    GKO_ADAPT_HF(template _macro(half, double, __VA_ARGS__));                  \
+    GKO_ADAPT_HF(template _macro(float, half, __VA_ARGS__));                   \
+    GKO_ADAPT_HF(template _macro(double, half, __VA_ARGS__));                  \
+    GKO_ADAPT_HF(                                                              \
+        template _macro(std::complex<half>, std::complex<half>, __VA_ARGS__)); \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>,      \
+                                 __VA_ARGS__));                                \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>,     \
+                                 __VA_ARGS__));                                \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,      \
+                                 __VA_ARGS__));                                \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,     \
+                                 __VA_ARGS__))
 #else
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)            \
     template _macro(float, float, __VA_ARGS__);                             \
     template _macro(double, double, __VA_ARGS__);                           \
     template _macro(std::complex<float>, std::complex<float>, __VA_ARGS__); \
     template _macro(std::complex<double>, std::complex<double>, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__);      \
+    GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__));                \
+    GKO_ADAPT_HF(                                                          \
+        template _macro(std::complex<half>, std::complex<half>, __VA_ARGS__))
 #endif
 
 
@@ -98,5 +245,9 @@
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int32);       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int64)
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(  \
+    _macro)                                                               \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int32); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int64)
 
 #endif  // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 98d85b2b6d2..6ffeb1c5f71 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -79,26 +79,37 @@
 
 #define GKO_STUB(_macro) _macro GKO_NOT_COMPILED(GKO_HOOK_MODULE)
 
-#define GKO_STUB_VALUE_CONVERSION(_macro)                             \
-    template <typename SourceType, typename TargetType>               \
-    _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
 
 #define GKO_STUB_NON_COMPLEX_VALUE_TYPE(_macro)          \
     template <typename ValueType>                        \
     _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro)
 
+#define GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \
+    template <typename ValueType>                         \
+    _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE);  \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_TYPE(_macro)                      \
     template <typename ValueType>                        \
     _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)
 
+#define GKO_STUB_VALUE_TYPE_WITH_HALF(_macro)            \
+    template <typename ValueType>                        \
+    _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_AND_SCALAR_TYPE(_macro)                       \
     template <typename ValueType, typename ScalarType>               \
     _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)
 
+#define GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)             \
+    template <typename ValueType, typename ScalarType>               \
+    _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_INDEX_TYPE(_macro)                      \
     template <typename IndexType>                        \
     _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
@@ -114,16 +125,31 @@
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro)
 
+#define GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_AND_INDEX_TYPE(_macro)                       \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
 
+#define GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)             \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_AND_INT32_TYPE(_macro)                       \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)
 
+#define GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(_macro)             \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro)                     \
     template <typename InputValueType, typename MatrixValueType,        \
               typename OutputValueType, typename IndexType>             \
@@ -131,6 +157,13 @@
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                              \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro)
 
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)           \
+    template <typename InputValueType, typename MatrixValueType,        \
+              typename OutputValueType, typename IndexType>             \
+    _macro(InputValueType, MatrixValueType, OutputValueType, IndexType) \
+        GKO_NOT_COMPILED(GKO_HOOK_MODULE);                              \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)            \
     template <typename InputValueType, typename OutputValueType, \
               typename IndexType>                                \
@@ -138,6 +171,13 @@
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)
 
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro)  \
+    template <typename InputValueType, typename OutputValueType, \
+              typename IndexType>                                \
+    _macro(InputValueType, OutputValueType, IndexType)           \
+        GKO_NOT_COMPILED(GKO_HOOK_MODULE);                       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \
     template <typename ValueType, typename LocalIndexType, \
               typename GlobalIndexType>                    \
@@ -150,16 +190,31 @@
     _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(_macro)
 
+#define GKO_STUB_TEMPLATE_TYPE_WITH_HALF(_macro)         \
+    template <typename IndexType>                        \
+    _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_CONVERSION(_macro)                             \
     template <typename SourceType, typename TargetType>               \
     _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
 
+#define GKO_STUB_VALUE_CONVERSION_WITH_HALF(_macro)                   \
+    template <typename SourceType, typename TargetType>               \
+    _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)
+
 #define GKO_STUB_VALUE_CONVERSION_OR_COPY(_macro)                     \
     template <typename SourceType, typename TargetType>               \
     _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro)
 
+#define GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro)           \
+    template <typename SourceType, typename TargetType>               \
+    _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro)
+
 #define GKO_STUB_CB_GMRES(_macro)                                              \
     template <typename ValueType, typename ValueTypeKrylovBases>               \
     _macro(ValueType, ValueTypeKrylovBases) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index eee2900d731..ab9326400e0 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -15,6 +15,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -327,10 +328,25 @@ using RealValueTypes =
     ::testing::Types<float, double>;
 #endif
 
+using RealValueTypesWithHalf = ::testing::Types<
+#if GINKGO_ENABLE_HALF
+    gko::half,
+#endif
+#if !GINKGO_DPCPP_SINGLE_MODE
+    double,
+#endif
+    float>;
+
 using ComplexValueTypes = add_inner_wrapper_t<std::complex, RealValueTypes>;
 
+using ComplexValueTypesWithHalf =
+    add_inner_wrapper_t<std::complex, RealValueTypesWithHalf>;
+
 using ValueTypes = merge_type_list_t<RealValueTypes, ComplexValueTypes>;
 
+using ValueTypesWithHalf =
+    merge_type_list_t<RealValueTypesWithHalf, ComplexValueTypesWithHalf>;
+
 using IndexTypes = ::testing::Types<int32, int64>;
 
 using IntegerTypes = merge_type_list_t<IndexTypes, ::testing::Types<size_type>>;
@@ -341,22 +357,44 @@ using LocalGlobalIndexTypes =
 
 using PODTypes = merge_type_list_t<RealValueTypes, IntegerTypes>;
 
+using PODTypesWithHalf =
+    merge_type_list_t<RealValueTypesWithHalf, IntegerTypes>;
+
 using ComplexAndPODTypes = merge_type_list_t<ComplexValueTypes, PODTypes>;
 
+using ComplexAndPODTypesWithHalf =
+    merge_type_list_t<ComplexValueTypesWithHalf, PODTypesWithHalf>;
+
 using ValueIndexTypes = cartesian_type_product_t<ValueTypes, IndexTypes>;
 
+using ValueIndexTypesWithHalf =
+    cartesian_type_product_t<ValueTypesWithHalf, IndexTypes>;
+
 using RealValueIndexTypes =
     cartesian_type_product_t<RealValueTypes, IndexTypes>;
 
+using RealValueIndexTypesWithHalf =
+    cartesian_type_product_t<RealValueTypesWithHalf, IndexTypes>;
+
 using ComplexValueIndexTypes =
     cartesian_type_product_t<ComplexValueTypes, IndexTypes>;
 
+using ComplexValueIndexTypesWithHalf =
+    cartesian_type_product_t<ComplexValueTypesWithHalf, IndexTypes>;
+
 using TwoValueIndexType = add_to_cartesian_type_product_t<
     merge_type_list_t<
         cartesian_type_product_t<RealValueTypes, RealValueTypes>,
         cartesian_type_product_t<ComplexValueTypes, ComplexValueTypes>>,
     IndexTypes>;
 
+using TwoValueIndexTypeWithHalf = add_to_cartesian_type_product_t<
+    merge_type_list_t<cartesian_type_product_t<RealValueTypesWithHalf,
+                                               RealValueTypesWithHalf>,
+                      cartesian_type_product_t<ComplexValueTypesWithHalf,
+                                               ComplexValueTypesWithHalf>>,
+    IndexTypes>;
+
 using ValueLocalGlobalIndexTypes =
     add_to_cartesian_type_product_left_t<ValueTypes, LocalGlobalIndexTypes>;
 
@@ -365,7 +403,6 @@ template <typename Precision, typename OutputType>
 struct reduction_factor {
     using nc_output = remove_complex<OutputType>;
     using nc_precision = remove_complex<Precision>;
-
     static const nc_output value;
 };
 
@@ -456,4 +493,13 @@ struct TupleTypenameNameGenerator {
 };
 
 
+#define SKIP_IF_HALF(type)                                                   \
+    if (std::is_same<gko::remove_complex<type>, gko::half>::value) {         \
+        GTEST_SKIP() << "Skip due to half mode";                             \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
 #endif  // GKO_CORE_TEST_UTILS_HPP_
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 5e15bb05d6a..73da407194e 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -383,6 +383,31 @@ struct next_precision_impl<std::complex<T>> {
 };
 
 
+template <typename T>
+struct next_precision_with_half_impl {};
+
+
+template <>
+struct next_precision_with_half_impl<gko::half> {
+    using type = float;
+};
+
+template <>
+struct next_precision_with_half_impl<float> {
+    using type = double;
+};
+
+template <>
+struct next_precision_with_half_impl<double> {
+    using type = gko::half;
+};
+
+template <typename T>
+struct next_precision_with_half_impl<std::complex<T>> {
+    using type = std::complex<typename next_precision_with_half_impl<T>::type>;
+};
+
+
 template <typename T>
 struct reduce_precision_impl {
     using type = T;
@@ -477,6 +502,26 @@ using next_precision = typename detail::next_precision_impl<T>::type;
 template <typename T>
 using previous_precision = next_precision<T>;
 
+/**
+ * Obtains the next type in the cycle half -> float -> double -> half.
+ */
+#if GINKGO_ENABLE_HALF
+template <typename T>
+using next_precision_with_half =
+    typename detail::next_precision_with_half_impl<T>::type;
+
+template <typename T>
+using previous_precision_with_half =
+    next_precision_with_half<next_precision_with_half<T>>;
+#else
+// fallback to float/double list
+template <typename T>
+using next_precision_with_half = next_precision<T>;
+
+template <typename T>
+using previous_precision_with_half = previous_precision<T>;
+#endif
+
 
 /**
  * Obtains the next type in the hierarchy with lower precision than T.
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 1d5963c0fe8..5e1fb2a14e3 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -17,6 +17,7 @@
 #include <string>
 #include <type_traits>
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/half.hpp>
 
 
@@ -399,6 +400,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     _enable_macro(CudaExecutor, cuda)
 
 
+// CUDA supports half operations only from compute capability 5.3 (sm_53) on
+#if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530)
+#define GKO_ADAPT_HF(_macro) _macro
+#else
+#define GKO_ADAPT_HF(_macro)                                                 \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
+
+
 /**
  * Instantiates a template for each non-complex value type compiled by Ginkgo.
  *
@@ -418,6 +430,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \
+    GKO_ADAPT_HF(template _macro(half));                                  \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro)
+
 
 /**
  * Instantiates a template for each value type compiled by Ginkgo.
@@ -440,6 +456,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro) \
+    GKO_ADAPT_HF(template _macro(half));                      \
+    GKO_ADAPT_HF(template _macro(std::complex<half>));        \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)
+
 
 // Helper macro to make Windows builds work
 // In MSVC, __VA_ARGS__ behave like one argument by default.
@@ -528,6 +549,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, double)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro);                \
+    GKO_ADAPT_HF(template _macro(half, half));                             \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, half))
+
 
 /**
  * Instantiates a template for each index type compiled by Ginkgo.
@@ -566,6 +593,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(float, int64);                                        \
     template _macro(double, int64)
 #endif
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( \
+    _macro)                                                                  \
+    GKO_ADAPT_HF(template _macro(half, int32));                              \
+    GKO_ADAPT_HF(template _macro(half, int64));                              \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro)
 
 #if GINKGO_DPCPP_SINGLE_MODE
 #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
@@ -583,6 +615,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, int32)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro) \
+    GKO_ADAPT_HF(template _macro(half, int32));                         \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32));           \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)
+
 
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
@@ -610,6 +647,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, int64)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
+    GKO_ADAPT_HF(template _macro(half, int32));                         \
+    GKO_ADAPT_HF(template _macro(half, int64));                         \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32));           \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int64));           \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
+
 
 /**
  * Instantiates a template for each non-complex value, local and global index
@@ -643,6 +687,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double, int64, int64)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \
+    _macro)                                                                               \
+    GKO_ADAPT_HF(template _macro(half, int32, int32));                                    \
+    GKO_ADAPT_HF(template _macro(half, int32, int64));                                    \
+    GKO_ADAPT_HF(template _macro(half, int64, int64));                                    \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(               \
+        _macro)
+
 
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
@@ -677,6 +729,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, int64, int64)
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \
+    _macro)                                                                   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro);       \
+    GKO_ADAPT_HF(template _macro(half, int32, int32));                        \
+    GKO_ADAPT_HF(template _macro(half, int32, int64));                        \
+    GKO_ADAPT_HF(template _macro(half, int64, int64));                        \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int32));          \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int64));          \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int64, int64))
+
 
 #if GINKGO_DPCPP_SINGLE_MODE
 #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)                  \
@@ -732,6 +794,40 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<double>, std::complex<double>)
 #endif
 
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)           \
+    GKO_ADAPT_HF(template <> _macro(half, double) GKO_NOT_IMPLEMENTED);       \
+    GKO_ADAPT_HF(template <> _macro(double, half) GKO_NOT_IMPLEMENTED);       \
+    GKO_ADAPT_HF(template _macro(float, half));                               \
+    GKO_ADAPT_HF(template _macro(half, float));                               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>));   \
+    GKO_ADAPT_HF(template <> _macro(std::complex<half>, std::complex<double>) \
+                     GKO_NOT_IMPLEMENTED);                                    \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>));   \
+    GKO_ADAPT_HF(template <> _macro(std::complex<double>, std::complex<half>) \
+                     GKO_NOT_IMPLEMENTED);                                    \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
+#else
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)          \
+    GKO_ADAPT_HF(template _macro(half, double));                             \
+    GKO_ADAPT_HF(template _macro(double, half));                             \
+    GKO_ADAPT_HF(template _macro(float, half));                              \
+    GKO_ADAPT_HF(template _macro(half, float));                              \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<float>));  \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>)); \
+    GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>));  \
+    GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>)); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
+#endif
+
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro);            \
+    GKO_ADAPT_HF(template _macro(half, half));                              \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>));  \
+    template _macro(float, float);                                          \
+    template _macro(double, double);                                        \
+    template _macro(std::complex<float>, std::complex<float>);              \
+    template _macro(std::complex<double>, std::complex<double>)
 
 /**
  * Instantiates a template for each value type pair compiled by Ginkgo.
@@ -749,6 +845,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<float>, std::complex<float>); \
     template _macro(std::complex<double>, std::complex<double>)
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR_WITH_HALF(_macro)         \
+    GKO_ADAPT_HF(template _macro(half, half));                             \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, half));               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro)
 
 /**
  * Instantiates a template for each combined value and index type compiled by
@@ -771,6 +872,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<float>, std::complex<float>);         \
     template _macro(std::complex<double>, std::complex<double>)
 
+#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE_WITH_HALF(  \
+    _macro)                                                                \
+    GKO_ADAPT_HF(template _macro(half, half));                             \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
+    GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro)
+
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
  *
@@ -789,6 +896,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(int32);                       \
     template _macro(int64)
 
+#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(_macro) \
+    GKO_ADAPT_HF(template _macro(half));                    \
+    GKO_ADAPT_HF(template _macro(std::complex<half>));      \
+    GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro)
 
 /**
  * Instantiates a template for each normal type
@@ -803,6 +914,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro);       \
     template _macro(gko::size_type)
 
+#define GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro);       \
+    GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro);                 \
+    template _macro(gko::size_type)
+
 
 /**
  * Instantiates a template for int32 type.

From 54241847b7e8ae05ff35ffc10f8344e01e091ee1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 18:25:12 +0200
Subject: [PATCH 355/448] half option

---
 CMakeLists.txt               | 6 ++++++
 cmake/get_info.cmake         | 2 +-
 include/ginkgo/config.hpp.in | 5 +++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f351038c92..fea0c3efd40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
+option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
+# We do not support MSVC. SYCL will come later
+if(MSVC OR GINKGO_BUILD_SYCL)
+    message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
+    set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
+endif()
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 63f43c645f0..57816ab8008 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -130,7 +130,7 @@ foreach(log_type ${log_types})
         "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
     ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
+        "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_ENABLE_HALF")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 1dfa6bc61bc..cf25dcd3c77 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -105,6 +105,11 @@
 #define GKO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@
 // clang-format on
 
+/* Is half precision support available? */
+// clang-format off
+#cmakedefine01 GINKGO_ENABLE_HALF
+// clang-format on
+
 
 /* Do we need to use blocking communication in our SpMV? */
 // clang-format off

From 3f78b6c0dde63174b9ddd52eadac1215a447d689 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 22:39:38 +0200
Subject: [PATCH 356/448] device type mapping

---
 common/cuda_hip/base/math.hpp  | 124 +++++++++++++++++++++++++++++----
 common/cuda_hip/base/types.hpp |  14 ++++
 cuda/base/types.hpp            |   1 -
 hip/base/types.hip.hpp         |   1 -
 4 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 8c655174524..7f0391d904c 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -11,6 +11,21 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#ifdef GKO_COMPILING_CUDA
+
+
+#include <cuda_fp16.h>
+
+
+#elif defined(GKO_COMPILING_HIP)
+
+
+#include <hip/hip_fp16.h>
+
+
+#endif
+
+
 namespace gko {
 
 
@@ -18,9 +33,35 @@ namespace gko {
 // __device__ function (even though it is constexpr)
 template <typename T>
 struct device_numeric_limits {
-    static constexpr auto inf = std::numeric_limits<T>::infinity();
-    static constexpr auto max = std::numeric_limits<T>::max();
-    static constexpr auto min = std::numeric_limits<T>::min();
+    static constexpr auto inf() { return std::numeric_limits<T>::infinity(); }
+    static constexpr auto max() { return std::numeric_limits<T>::max(); }
+    static constexpr auto min() { return std::numeric_limits<T>::min(); }
+};
+
+template <>
+struct device_numeric_limits<__half> {
+    // According to the __half documentation, __half_raw stores its bits in an
+    // unsigned short. __half and __half_raw have no constexpr constructors.
+    static GKO_ATTRIBUTES GKO_INLINE auto inf()
+    {
+        __half_raw bits;
+        bits.x = static_cast<unsigned short>(0b0111110000000000u);
+        return __half{bits};
+    }
+
+    static GKO_ATTRIBUTES GKO_INLINE auto max()
+    {
+        __half_raw bits;
+        bits.x = static_cast<unsigned short>(0b0111101111111111u);
+        return __half{bits};
+    }
+
+    static GKO_ATTRIBUTES GKO_INLINE auto min()
+    {
+        __half_raw bits;
+        bits.x = static_cast<unsigned short>(0b0000010000000000u);
+        return __half{bits};
+    }
 };
 
 
@@ -33,15 +74,6 @@ struct remove_complex_impl<thrust::complex<T>> {
 };
 
 
-template <typename T>
-struct is_complex_impl<thrust::complex<T>>
-    : public std::integral_constant<bool, true> {};
-
-
-template <typename T>
-struct is_complex_or_scalar_impl<thrust::complex<T>> : std::is_scalar<T> {};
-
-
 template <typename T>
 struct truncate_type_impl<thrust::complex<T>> {
     using type = thrust::complex<typename truncate_type_impl<T>::type>;
@@ -52,4 +84,72 @@ struct truncate_type_impl<thrust::complex<T>> {
 }  // namespace gko
 
 
+namespace thrust {
+
+
+template <>
+GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt<__half>(const complex<__half>& a)
+{
+    return sqrt(static_cast<complex<float>>(a));
+}
+
+
+template <>
+GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z)
+{
+    return abs(static_cast<complex<float>>(z));
+}
+
+
+}  // namespace thrust
+
+
+namespace gko {
+
+
+// It is required by NVHPC 23.3, `isnan` is undefined when NVHPC is used as a
+// host compiler.
+#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP)
+
+__device__ __forceinline__ bool is_nan(const __half& val)
+{
+    // from the cuda_fp16.hpp
+#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+    return __hisnan(val);
+#else
+    return isnan(static_cast<float>(val));
+#endif
+}
+
+__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val)
+{
+    return is_nan(val.real()) || is_nan(val.imag());
+}
+
+
+__device__ __forceinline__ __half abs(const __half& val)
+{
+#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+    return __habs(val);
+#else
+    return abs(static_cast<float>(val));
+#endif
+}
+
+__device__ __forceinline__ __half sqrt(const __half& val)
+{
+#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+    return hsqrt(val);
+#else
+    return sqrt(static_cast<float>(val));
+#endif
+}
+
+
+#endif
+
+
+}  // namespace gko
+
+
 #endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index 08f0516d691..42ca57eb0bf 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -14,5 +14,19 @@
 #error "Executor definition missing"
 #endif
 
+#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                               \
+    GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op(           \
+        const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \
+    {                                                                         \
+        return thrust::complex<float>{lhs} _op thrust::complex<float>(rhs);   \
+    }
+
+THRUST_HALF_FRIEND_OPERATOR(+, +=)
+THRUST_HALF_FRIEND_OPERATOR(-, -=)
+THRUST_HALF_FRIEND_OPERATOR(*, *=)
+THRUST_HALF_FRIEND_OPERATOR(/, /=)
+
+#undef THRUST_HALF_FRIEND_OPERATOR
+
 
 #endif  // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 05f07ceb8dd..05b604923da 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -20,7 +20,6 @@
 
 
 namespace gko {
-
 namespace kernels {
 namespace cuda {
 namespace detail {
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index c3982b7562e..6b78cceea99 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -26,7 +26,6 @@
 
 #include "common/cuda_hip/base/runtime.hpp"
 
-
 namespace gko {
 namespace kernels {
 namespace hip {

From 91af999b4342f5d5dbcd8c60c3262c376cadd52e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 28 Nov 2024 16:46:04 +0100
Subject: [PATCH 357/448] consider custom namespace for thrust::complex<__half>
 and benchmark

---
 benchmark/CMakeLists.txt              |  6 ++++++
 common/cuda_hip/base/math.hpp         |  5 +++++
 common/cuda_hip/base/thrust_macro.hpp | 22 ++++++++++++++++++++++
 common/cuda_hip/base/types.hpp        | 15 +++++++++------
 4 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 common/cuda_hip/base/thrust_macro.hpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 55ed76d1613..c780a497c32 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,6 +18,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
             PRIVATE
             $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
     endif()
+    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
+        target_compile_definitions(cusparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    endif()
     # make the dependency public to catch issues
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
@@ -28,6 +31,9 @@ endfunction()
 function(ginkgo_benchmark_hipsparse_linops type def)
     add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
     set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
+    if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
+        target_compile_definitions(hipsparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
+    endif()
     target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 7f0391d904c..3d2975c1eee 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -26,6 +26,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/thrust_macro.hpp"
+
+
 namespace gko {
 
 
@@ -84,6 +87,7 @@ struct truncate_type_impl<thrust::complex<T>> {
 }  // namespace gko
 
 
+GKO_THRUST_NAEMSPACE_PREFIX
 namespace thrust {
 
 
@@ -102,6 +106,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z)
 
 
 }  // namespace thrust
+GKO_THRUST_NAEMSPACE_POSTFIX
 
 
 namespace gko {
diff --git a/common/cuda_hip/base/thrust_macro.hpp b/common/cuda_hip/base/thrust_macro.hpp
new file mode 100644
index 00000000000..c5e3fc40010
--- /dev/null
+++ b/common/cuda_hip/base/thrust_macro.hpp
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
+
+// Although thrust provides similar macros, they are only defined in versions
+// that support them. Thus, we provide our own macros to also work with older
+// thrust versions.
+#ifdef THRUST_CUB_WRAPPED_NAMESPACE
+#define GKO_THRUST_NAEMSPACE_PREFIX namespace THRUST_CUB_WRAPPED_NAMESPACE {
+#define GKO_THRUST_NAEMSPACE_POSTFIX }
+#define GKO_THRUST_QUALIFIER ::THRUST_CUB_WRAPPED_NAMESPACE::thrust
+#else
+#define GKO_THRUST_NAEMSPACE_PREFIX
+#define GKO_THRUST_NAEMSPACE_POSTFIX
+#define GKO_THRUST_QUALIFIER ::thrust
+#endif  // THRUST_CUB_WRAPPED_NAMESPACE
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
index 42ca57eb0bf..e65b179ed68 100644
--- a/common/cuda_hip/base/types.hpp
+++ b/common/cuda_hip/base/types.hpp
@@ -5,7 +5,7 @@
 #ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
 #define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
 
-
+#include "common/cuda_hip/base/math.hpp"
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/types.hpp"
 #elif defined(GKO_COMPILING_HIP)
@@ -14,11 +14,14 @@
 #error "Executor definition missing"
 #endif
 
-#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                               \
-    GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op(           \
-        const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \
-    {                                                                         \
-        return thrust::complex<float>{lhs} _op thrust::complex<float>(rhs);   \
+
+#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                     \
+    GKO_ATTRIBUTES GKO_INLINE GKO_THRUST_QUALIFIER::complex<__half> \
+    operator _op(const GKO_THRUST_QUALIFIER::complex<__half> lhs,   \
+                 const GKO_THRUST_QUALIFIER::complex<__half> rhs)   \
+    {                                                               \
+        return GKO_THRUST_QUALIFIER::complex<float>{                \
+            lhs} _op GKO_THRUST_QUALIFIER::complex<float>(rhs);     \
     }
 
 THRUST_HALF_FRIEND_OPERATOR(+, +=)

From 9635e89464c8bb325390fe325cd51aeae4e9e72f Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 01:11:29 +0200
Subject: [PATCH 358/448] atomic and cooperative_groups

---
 common/cuda_hip/components/atomic.hpp     | 54 ++++++++++++++++++++++-
 hip/components/cooperative_groups.hip.hpp | 12 ++---
 omp/components/atomic.hpp                 | 54 +++++++++++++++++++++--
 3 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index aeb77d48c75..954bc7476ed 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -39,6 +39,7 @@ struct atomic_helper {
 };
 
 
+// TODO: consider implementing this with memcpy.
 template <typename ResultType, typename ValueType>
 __forceinline__ __device__ ResultType reinterpret(ValueType val)
 {
@@ -95,15 +96,64 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val)
         }                                                                    \
     };
 
+
+#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE)               \
+    template <typename ValueType>                                           \
+    struct atomic_helper<                                                   \
+        ValueType,                                                          \
+        std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> {  \
+        __forceinline__ __device__ static ValueType atomic_add(             \
+            ValueType* __restrict__ addr, ValueType val)                    \
+        {                                                                   \
+            assert(false);                                                  \
+            using c_type = CONVERTER_TYPE;                                  \
+            return atomic_wrapper(                                          \
+                addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \
+                    old = *c_addr;                                          \
+                    *c_addr = reinterpret<c_type>(                          \
+                        val + reinterpret<ValueType>(assumed));             \
+                });                                                         \
+        }                                                                   \
+        __forceinline__ __device__ static ValueType atomic_max(             \
+            ValueType* __restrict__ addr, ValueType val)                    \
+        {                                                                   \
+            assert(false);                                                  \
+            using c_type = CONVERTER_TYPE;                                  \
+            return atomic_wrapper(                                          \
+                addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \
+                    if (reinterpret<ValueType>(assumed) < val) {            \
+                        old = *c_addr;                                      \
+                        *c_addr = reinterpret<c_type>(val);                 \
+                    }                                                       \
+                });                                                         \
+        }                                                                   \
+                                                                            \
+    private:                                                                \
+        template <typename Callable>                                        \
+        __forceinline__ __device__ static ValueType atomic_wrapper(         \
+            ValueType* __restrict__ addr, Callable set_old)                 \
+        {                                                                   \
+            CONVERTER_TYPE* address_as_converter =                          \
+                reinterpret_cast<CONVERTER_TYPE*>(addr);                    \
+            CONVERTER_TYPE old = *address_as_converter;                     \
+            CONVERTER_TYPE assumed = old;                                   \
+            set_old(old, assumed, address_as_converter);                    \
+            return reinterpret<ValueType>(old);                             \
+        }                                                                   \
+    };
+
 // Support 64-bit ATOMIC_ADD and ATOMIC_MAX
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 // Support 32-bit ATOMIC_ADD and ATOMIC_MAX
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
 
 
-#if defined(CUDA_VERSION)
-// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA
+#if defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)
+// Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC
+// >= 7.0
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
+#else
+GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int)
 #endif
 
 
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 36618bb7f3e..46c2fb195bc 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -306,7 +306,7 @@ class enable_extended_shuffle : public Group {
                                                SelectorType selector) const \
     {                                                                       \
         return shuffle_impl(                                                \
-            [this](uint32 v, SelectorType s) {                              \
+            [this](uint16 v, SelectorType s) {                              \
                 return static_cast<const Group*>(this)->_name(v, s);        \
             },                                                              \
             var, selector);                                                 \
@@ -326,12 +326,12 @@ class enable_extended_shuffle : public Group {
     shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var,
                  SelectorType selector)
     {
-        static_assert(sizeof(ValueType) % sizeof(uint32) == 0,
-                      "Unable to shuffle sizes which are not 4-byte multiples");
-        constexpr auto value_size = sizeof(ValueType) / sizeof(uint32);
+        static_assert(sizeof(ValueType) % sizeof(uint16) == 0,
+                      "Unable to shuffle sizes which are not 2-byte multiples");
+        constexpr auto value_size = sizeof(ValueType) / sizeof(uint16);
         ValueType result;
-        auto var_array = reinterpret_cast<const uint32*>(&var);
-        auto result_array = reinterpret_cast<uint32*>(&result);
+        auto var_array = reinterpret_cast<const uint16*>(&var);
+        auto result_array = reinterpret_cast<uint16*>(&result);
 #pragma unroll
         for (std::size_t i = 0; i < value_size; ++i) {
             result_array[i] = intrinsic_shuffle(var_array[i], selector);
diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp
index c3580cd36bb..35b94a65fe5 100644
--- a/omp/components/atomic.hpp
+++ b/omp/components/atomic.hpp
@@ -8,6 +8,7 @@
 
 #include <type_traits>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
@@ -32,10 +33,55 @@ void atomic_add(ValueType& out, ValueType val)
     // The C++ standard explicitly allows casting complex<double>* to double*
     // [complex.numbers.general]
     auto values = reinterpret_cast<gko::remove_complex<ValueType>*>(&out);
-#pragma omp atomic
-    values[0] += real(val);
-#pragma omp atomic
-    values[1] += imag(val);
+    atomic_add(values[0], real(val));
+    atomic_add(values[1], imag(val));
+}
+
+
+template <typename ResultType, typename ValueType>
+inline ResultType copy_cast(const ValueType& val)
+{
+    static_assert(
+        sizeof(ValueType) == sizeof(ResultType) &&
+            std::alignment_of_v<ResultType> == std::alignment_of_v<ValueType>,
+        "only copy the same alignment and size type");
+    ResultType res;
+    std::memcpy(&res, &val, sizeof(ValueType));
+    return res;
+}
+
+
+template <>
+inline void atomic_add(half& out, half val)
+{
+#ifdef __NVCOMPILER
+// With NVC++, atomic capture on uint16 leads to the following error:
+// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !*
+#pragma omp critical
+    {
+        out += val;
+    }
+#else
+    static_assert(
+        sizeof(half) == sizeof(uint16_t) &&
+            std::alignment_of_v<uint16_t> == std::alignment_of_v<half>,
+        "half does not fulfill the requirement of reinterpret_cast to uint16_t "
+        "or vice versa.");
+    // It is undefined behavior with reinterpret_cast, but we do not have any
+    // workaround when the #omp atomic does not support custom precision
+    uint16_t* address_as_converter = reinterpret_cast<uint16_t*>(&out);
+    uint16_t old = *address_as_converter;
+    uint16_t assumed;
+    do {
+        assumed = old;
+        auto answer = copy_cast<uint16_t>(copy_cast<half>(assumed) + val);
+#pragma omp atomic capture
+        {
+            old = *address_as_converter;
+            *address_as_converter = (old == assumed) ? answer : old;
+        }
+    } while (assumed != old);
+#endif
+}
 
 

From 8f7c8071da8c0c0ec5ac09be6f3b9dc12370b7a2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 01:16:50 +0200
Subject: [PATCH 359/448] fix math and device_numeric_limit

---
 common/cuda_hip/base/math.hpp                 |  11 ++
 common/cuda_hip/components/merging.hpp        |   4 +-
 .../factorization/par_ict_kernels.cpp         |   4 +-
 .../factorization/par_ilut_select_kernels.hpp |   4 +-
 .../factorization/par_ilut_spgeam_kernels.cpp |   4 +-
 common/cuda_hip/reorder/rcm_kernels.cpp       |   2 +-
 cuda/test/base/math.cu                        |   4 +-
 hip/test/base/math.hip.cpp                    |   4 +-
 include/ginkgo/core/base/math.hpp             | 103 +++++-------------
 9 files changed, 54 insertions(+), 86 deletions(-)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 3d2975c1eee..f83533d8f0d 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -83,6 +83,17 @@ struct truncate_type_impl<thrust::complex<T>> {
 };
 
 
+template <typename T>
+struct is_complex_impl<thrust::complex<T>> : public std::true_type {};
+
+template <>
+struct is_complex_or_scalar_impl<__half> : public std::true_type {};
+
+template <typename T>
+struct is_complex_or_scalar_impl<thrust::complex<T>>
+    : public is_complex_or_scalar_impl<T> {};
+
+
 }  // namespace detail
 }  // namespace gko
 
diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp
index ab070741fbd..b832a97176e 100644
--- a/common/cuda_hip/components/merging.hpp
+++ b/common/cuda_hip/components/merging.hpp
@@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a,
     IndexType a_begin{};
     IndexType b_begin{};
     auto lane = static_cast<IndexType>(group.thread_rank());
-    auto sentinel = device_numeric_limits<IndexType>::max;
+    auto sentinel = device_numeric_limits<IndexType>::max();
     auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
     auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
     for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
@@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge(
     auto c_size = a_size + b_size;
     IndexType a_begin{};
     IndexType b_begin{};
-    auto sentinel = device_numeric_limits<IndexType>::max;
+    auto sentinel = device_numeric_limits<IndexType>::max();
     auto a_cur = checked_load(a, a_begin, a_size, sentinel);
     auto b_cur = checked_load(b, b_begin, b_size, sentinel);
     for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp
index 94aa5e5124e..3446f124123 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ict_kernels.cpp
@@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
 
     IndexType l_new_begin = l_new_row_ptrs[row];
 
-    constexpr auto sentinel = device_numeric_limits<IndexType>::max;
+    constexpr auto sentinel = device_numeric_limits<IndexType>::max();
     // load column indices and values for the first merge step
     auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
     auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
@@ -456,4 +456,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace par_ict_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
index 6f5940c2b14..79a562ff834 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
@@ -254,7 +254,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select(
     const ValueType* __restrict__ input, IndexType size, IndexType rank,
     ValueType* __restrict__ out)
 {
-    constexpr auto sentinel = device_numeric_limits<ValueType>::inf;
+    constexpr auto sentinel = device_numeric_limits<ValueType>::inf();
     ValueType local[basecase_local_size];
     __shared__ ValueType sh_local[basecase_size];
     for (int i = 0; i < basecase_local_size; ++i) {
@@ -301,4 +301,4 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket(
 }  // namespace kernels
 }  // namespace gko
 
-#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_
\ No newline at end of file
+#endif  // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
index 6cc77660394..a29cf6f2cb3 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
@@ -150,7 +150,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
     IndexType l_new_begin = l_new_row_ptrs[row];
     IndexType u_new_begin = u_new_row_ptrs[row];
 
-    constexpr auto sentinel = device_numeric_limits<IndexType>::max;
+    constexpr auto sentinel = device_numeric_limits<IndexType>::max();
     // load column indices and values for the first merge step
     auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
     auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
@@ -396,4 +396,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 }  // namespace par_ilut_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp
index 75050d3e977..2bb18cbdd22 100644
--- a/common/cuda_hip/reorder/rcm_kernels.cpp
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -525,7 +525,7 @@ __global__ __launch_bounds__(default_block_size) void ubfs_min_neighbor_kernel(
     const auto begin = row_ptrs[row];
     const auto end = row_ptrs[row + 1];
     const auto cur_level = node_levels[row];
-    auto min_neighbor = device_numeric_limits<IndexType>::max;
+    auto min_neighbor = device_numeric_limits<IndexType>::max();
     for (auto nz = begin; nz < end; nz++) {
         const auto col = col_idxs[nz];
         const auto neighbor_level = node_levels[col];
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index d1d9373b0ef..1025c3cc489 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -26,7 +26,7 @@ namespace kernel {
 template <typename T, typename FuncType>
 __device__ bool test_real_is_finite_function(FuncType isfin)
 {
-    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T inf = gko::device_numeric_limits<T>::inf();
     constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
@@ -46,7 +46,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin)
                   "Template type must be a complex type.");
     using T = gko::remove_complex<ComplexType>;
     using c_type = gko::kernels::cuda::cuda_type<ComplexType>;
-    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T inf = gko::device_numeric_limits<T>::inf();
     constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index f01b56739d9..f69ca804aa9 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -32,7 +32,7 @@ namespace kernel {
 template <typename T, typename FuncType>
 __device__ bool test_real_is_finite_function(FuncType isfin)
 {
-    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T inf = gko::device_numeric_limits<T>::inf();
     constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
@@ -52,7 +52,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin)
                   "Template type must be a complex type.");
     using T = gko::remove_complex<ComplexType>;
     using c_type = gko::kernels::hip::hip_type<ComplexType>;
-    constexpr T inf = gko::device_numeric_limits<T>::inf;
+    constexpr T inf = gko::device_numeric_limits<T>::inf();
     constexpr T quiet_nan = NAN;
     bool test_true{};
     bool test_false{};
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 73da407194e..e308b092ea6 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -14,6 +14,7 @@
 #include <utility>
 
 #include <ginkgo/config.hpp>
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
@@ -21,79 +22,6 @@
 namespace gko {
 
 
-class half;
-
-
-// HIP should not see std::abs or std::sqrt, we want the custom implementation.
-// Hence, provide the using declaration only for some cases
-namespace kernels {
-namespace reference {
-
-
-using std::abs;
-
-
-using std::sqrt;
-
-
-}  // namespace reference
-}  // namespace kernels
-
-
-namespace kernels {
-namespace omp {
-
-
-using std::abs;
-
-
-using std::sqrt;
-
-
-}  // namespace omp
-}  // namespace kernels
-
-
-namespace kernels {
-namespace cuda {
-
-
-using std::abs;
-
-
-using std::sqrt;
-
-
-}  // namespace cuda
-}  // namespace kernels
-
-
-namespace kernels {
-namespace dpcpp {
-
-
-using std::abs;
-
-
-using std::sqrt;
-
-
-}  // namespace dpcpp
-}  // namespace kernels
-
-
-namespace test {
-
-
-using std::abs;
-
-
-using std::sqrt;
-
-
-}  // namespace test
-
-
 // type manipulations
 
 
@@ -706,6 +634,13 @@ GKO_INLINE constexpr T one()
     return T(1);
 }
 
+template <>
+GKO_INLINE constexpr half one<half>()
+{
+    constexpr auto bits = static_cast<uint16>(0b0'01111'0000000000u);
+    return half::create_from_bits(bits);
+}
+
 
 /**
  * Returns the multiplicative identity for T.
@@ -983,6 +918,7 @@ GKO_INLINE constexpr auto squared_norm(const T& x)
     return real(conj(x) * x);
 }
 
+using std::abs;
 
 /**
  * Returns the absolute value of the object.
@@ -1008,6 +944,27 @@ abs(const T& x)
     return sqrt(squared_norm(x));
 }
 
+// non-template overload, preferred over the generic abs in overload resolution
+GKO_INLINE gko::half abs(const std::complex<gko::half>& x)
+{
+    // use abs in float rather than sqrt(squared_norm(x)) to avoid overflow
+    return static_cast<gko::half>(abs(std::complex<float>(x)));
+}
+
+
+using std::sqrt;
+
+GKO_INLINE gko::half sqrt(gko::half a)
+{
+    return gko::half(std::sqrt(float(a)));
+}
+
+GKO_INLINE std::complex<gko::half> sqrt(std::complex<gko::half> a)
+{
+    return std::complex<gko::half>(sqrt(std::complex<float>(
+        static_cast<float>(a.real()), static_cast<float>(a.imag()))));
+}
+
 
 /**
  * Returns the value of pi.

From d321ce7f5a803c2cd4483eab2d8d81930cfdd2f6 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 02:00:19 +0200
Subject: [PATCH 360/448] array operation in half

---
 .../components/absolute_array_kernels.cpp       |  6 ++++--
 .../unified/components/fill_array_kernels.cpp   | 17 +++++++++++++----
 .../components/precision_conversion_kernels.cpp |  3 ++-
 .../unified/components/reduce_array_kernels.cpp |  3 ++-
 core/base/array.cpp                             |  9 +++++----
 core/base/segmented_array.cpp                   |  2 +-
 core/device_hooks/common_kernels.inc.cpp        | 12 ++++++------
 include/ginkgo/core/base/segmented_array.hpp    |  7 ++++++-
 reference/components/absolute_array_kernels.cpp |  6 ++++--
 reference/components/fill_array_kernels.cpp     |  5 +++--
 .../components/precision_conversion_kernels.cpp |  3 ++-
 reference/components/reduce_array_kernels.cpp   |  3 ++-
 12 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/common/unified/components/absolute_array_kernels.cpp b/common/unified/components/absolute_array_kernels.cpp
index c9ab364353c..423fa234c39 100644
--- a/common/unified/components/absolute_array_kernels.cpp
+++ b/common/unified/components/absolute_array_kernels.cpp
@@ -23,7 +23,8 @@ void inplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
         data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 template <typename ValueType>
@@ -37,7 +38,8 @@ void outplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
         n, in, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp
index d78a6e9f346..3e87d782974 100644
--- a/common/unified/components/fill_array_kernels.cpp
+++ b/common/unified/components/fill_array_kernels.cpp
@@ -23,7 +23,7 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,
         array, val);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
 
@@ -32,11 +32,20 @@ void fill_seq_array(std::shared_ptr<const DefaultExecutor> exec,
                     ValueType* array, size_type n)
 {
     run_kernel(
-        exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = idx; }, n,
-        array);
+        exec,
+        [] GKO_KERNEL(auto idx, auto array) {
+            if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+                // __half cannot be constructed from int64_t; use long long
+                array[idx] = static_cast<long long>(idx);
+            } else {
+                array[idx] = idx;
+            }
+        },
+        n, array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
+    GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp
index 0402d9bef68..94a8d4e4d0f 100644
--- a/common/unified/components/precision_conversion_kernels.cpp
+++ b/common/unified/components/precision_conversion_kernels.cpp
@@ -23,7 +23,8 @@ void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
         size, in, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
+    GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp
index bc8da6fa311..1e7d19264cd 100644
--- a/common/unified/components/reduce_array_kernels.cpp
+++ b/common/unified/components/reduce_array_kernels.cpp
@@ -34,7 +34,8 @@ void reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
         arr, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
+    GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/core/base/array.cpp b/core/base/array.cpp
index a41f7c07e55..7a98223a7b2 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -51,7 +51,8 @@ void convert_data(std::shared_ptr<const Executor> exec, size_type size,
     void convert_data<From, To>(std::shared_ptr<const Executor>, size_type, \
                                 const From*, To*)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_ARRAY_CONVERSION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
+    GKO_DECLARE_ARRAY_CONVERSION);
 
 
 }  // namespace detail
@@ -88,19 +89,19 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_FILL);
 
 
 #define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \
     void reduce_add(const array<_type>& arr, array<_type>& value)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD);
 
 
 #define GKO_DECLARE_ARRAY_REDUCE_ADD2(_type) \
     _type reduce_add(const array<_type>& arr, const _type val)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD2);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD2);
 
 
 }  // namespace gko
diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp
index d113139f8e2..4c6356799f9 100644
--- a/core/base/segmented_array.cpp
+++ b/core/base/segmented_array.cpp
@@ -180,7 +180,7 @@ segmented_array<T>& segmented_array<T>::operator=(segmented_array&& other)
 
 #define GKO_DECLARE_SEGMENTED_ARRAY(_type) struct segmented_array<_type>
 
-GKO_INSTANTIATE_FOR_EACH_POD_TYPE(GKO_DECLARE_SEGMENTED_ARRAY);
+GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(GKO_DECLARE_SEGMENTED_ARRAY);
 
 
 }  // namespace gko
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 6ffeb1c5f71..224aacc7369 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -238,19 +238,19 @@ namespace GKO_HOOK_MODULE {
 namespace components {
 
 
-GKO_STUB_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_STUB_VALUE_CONVERSION_WITH_HALF(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL);
 // explicitly instantiate for size_type, as this is
 // used in the SellP format
 template GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL(size_type);
 
-GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
-GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
-GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(
diff --git a/include/ginkgo/core/base/segmented_array.hpp b/include/ginkgo/core/base/segmented_array.hpp
index 49a7e6f9d38..b34605cc902 100644
--- a/include/ginkgo/core/base/segmented_array.hpp
+++ b/include/ginkgo/core/base/segmented_array.hpp
@@ -2,7 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#pragma once
+#ifndef GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_
+#define GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_
+
+
 #include <numeric>
 
 #include <ginkgo/config.hpp>
@@ -183,3 +186,5 @@ class copy_back_deleter<segmented_array<T>>
 
 }  // namespace detail
 }  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_
diff --git a/reference/components/absolute_array_kernels.cpp b/reference/components/absolute_array_kernels.cpp
index 964e1f80d6a..759caae894c 100644
--- a/reference/components/absolute_array_kernels.cpp
+++ b/reference/components/absolute_array_kernels.cpp
@@ -20,7 +20,8 @@ void inplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 template <typename ValueType>
@@ -33,7 +34,8 @@ void outplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp
index 1649aa87982..663ad8f5b6b 100644
--- a/reference/components/fill_array_kernels.cpp
+++ b/reference/components/fill_array_kernels.cpp
@@ -20,7 +20,7 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,
     std::fill_n(array, n, val);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
 
@@ -31,7 +31,8 @@ void fill_seq_array(std::shared_ptr<const DefaultExecutor> exec,
     std::iota(array, array + n, 0);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
+    GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/precision_conversion_kernels.cpp b/reference/components/precision_conversion_kernels.cpp
index db12d9316ee..5ec37a1cd72 100644
--- a/reference/components/precision_conversion_kernels.cpp
+++ b/reference/components/precision_conversion_kernels.cpp
@@ -20,7 +20,8 @@ void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
     std::copy_n(in, size, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
+    GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/reduce_array_kernels.cpp b/reference/components/reduce_array_kernels.cpp
index a70ef95a878..3c3c6f620ec 100644
--- a/reference/components/reduce_array_kernels.cpp
+++ b/reference/components/reduce_array_kernels.cpp
@@ -22,7 +22,8 @@ void reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
                                         val.get_const_data()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
+    GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
 
 
 }  // namespace components

From 8073f4ba80d4450b9f2bfff286b1c721dddd575b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 23 Oct 2024 18:30:29 +0200
Subject: [PATCH 361/448] matrix with half

---
 common/cuda_hip/matrix/coo_kernels.cpp        |  10 +-
 .../matrix/csr_kernels.instantiate.cpp        | 124 ++++---
 .../cuda_hip/matrix/csr_kernels.template.cpp  |   6 +-
 common/cuda_hip/matrix/dense_kernels.cpp      |  33 +-
 common/cuda_hip/matrix/diagonal_kernels.cpp   |   2 +-
 common/cuda_hip/matrix/ell_kernels.cpp        |   4 +-
 .../matrix/fbcsr_kernels.instantiate.cpp      |  21 +-
 .../matrix/fbcsr_kernels.template.cpp         |   2 +-
 common/cuda_hip/matrix/sellp_kernels.cpp      |   5 +-
 .../cuda_hip/matrix/sparsity_csr_kernels.cpp  |  14 +-
 common/unified/matrix/coo_kernels.cpp         |   4 +-
 common/unified/matrix/csr_kernels.cpp         |  16 +-
 .../matrix/dense_kernels.instantiate.cpp      | 100 ++---
 common/unified/matrix/diagonal_kernels.cpp    |  14 +-
 common/unified/matrix/ell_kernels.cpp         |  13 +-
 common/unified/matrix/hybrid_kernels.cpp      |   4 +-
 .../matrix/scaled_permutation_kernels.cpp     |   4 +-
 common/unified/matrix/sellp_kernels.cpp       |  10 +-
 .../unified/matrix/sparsity_csr_kernels.cpp   |   6 +-
 core/device_hooks/common_kernels.inc.cpp      | 351 +++++++++++-------
 core/matrix/coo.cpp                           |  29 +-
 core/matrix/csr.cpp                           |  29 +-
 core/matrix/dense.cpp                         |  38 +-
 core/matrix/diagonal.cpp                      |  30 +-
 core/matrix/ell.cpp                           |  30 +-
 core/matrix/fbcsr.cpp                         |  32 +-
 core/matrix/hybrid.cpp                        |  32 +-
 core/matrix/identity.cpp                      |   4 +-
 core/matrix/permutation.cpp                   |   7 +-
 core/matrix/row_gatherer.cpp                  |  13 +-
 core/matrix/scaled_permutation.cpp            |   2 +-
 core/matrix/sellp.cpp                         |  33 +-
 core/matrix/sparsity_csr.cpp                  |   3 +-
 dpcpp/matrix/coo_kernels.dp.cpp               |  10 +-
 dpcpp/matrix/csr_kernels.dp.cpp               |  74 ++--
 dpcpp/matrix/dense_kernels.dp.cpp             |  86 +++--
 dpcpp/matrix/diagonal_kernels.dp.cpp          |   2 +-
 dpcpp/matrix/ell_kernels.dp.cpp               |   4 +-
 dpcpp/matrix/fbcsr_kernels.dp.cpp             |  21 +-
 dpcpp/matrix/sellp_kernels.dp.cpp             |   5 +-
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp      |  14 +-
 .../ginkgo/core/base/precision_dispatch.hpp   |  29 +-
 include/ginkgo/core/matrix/coo.hpp            |  57 ++-
 include/ginkgo/core/matrix/csr.hpp            |  73 ++--
 include/ginkgo/core/matrix/dense.hpp          |  34 +-
 include/ginkgo/core/matrix/diagonal.hpp       |  36 +-
 include/ginkgo/core/matrix/ell.hpp            |  57 ++-
 include/ginkgo/core/matrix/fbcsr.hpp          |  61 ++-
 include/ginkgo/core/matrix/hybrid.hpp         |  40 +-
 include/ginkgo/core/matrix/sellp.hpp          |  57 ++-
 omp/matrix/coo_kernels.cpp                    |  10 +-
 omp/matrix/csr_kernels.cpp                    |  60 +--
 omp/matrix/dense_kernels.cpp                  |  33 +-
 omp/matrix/diagonal_kernels.cpp               |   2 +-
 omp/matrix/ell_kernels.cpp                    |   4 +-
 omp/matrix/fbcsr_kernels.cpp                  |  21 +-
 omp/matrix/sellp_kernels.cpp                  |   5 +-
 omp/matrix/sparsity_csr_kernels.cpp           |  10 +-
 reference/matrix/coo_kernels.cpp              |  14 +-
 reference/matrix/csr_kernels.cpp              |  76 ++--
 reference/matrix/dense_kernels.cpp            | 132 ++++---
 reference/matrix/diagonal_kernels.cpp         |  16 +-
 reference/matrix/ell_kernels.cpp              |  19 +-
 reference/matrix/fbcsr_kernels.cpp            |  21 +-
 reference/matrix/hybrid_kernels.cpp           |   4 +-
 .../matrix/scaled_permutation_kernels.cpp     |   4 +-
 reference/matrix/sellp_kernels.cpp            |  15 +-
 reference/matrix/sparsity_csr_kernels.cpp     |  16 +-
 reference/test/base/combination.cpp           |  13 +-
 reference/test/matrix/coo_kernels.cpp         |  17 +-
 reference/test/matrix/csr_kernels.cpp         |   8 +-
 reference/test/matrix/dense_kernels.cpp       |  21 +-
 reference/test/matrix/diagonal_kernels.cpp    |  15 +-
 reference/test/matrix/ell_kernels.cpp         |  17 +-
 reference/test/matrix/fbcsr_kernels.cpp       |  17 +-
 reference/test/matrix/hybrid_kernels.cpp      |  17 +-
 reference/test/matrix/scaled_permutation.cpp  |   3 +-
 reference/test/matrix/sellp_kernels.cpp       |  26 +-
 test/matrix/fbcsr_kernels.cpp                 |  14 +-
 test/matrix/matrix.cpp                        |  10 +-
 80 files changed, 1480 insertions(+), 845 deletions(-)

diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp
index cffe18d981b..4609f9f7f95 100644
--- a/common/cuda_hip/matrix/coo_kernels.cpp
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -238,7 +238,8 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -253,7 +254,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -295,7 +296,8 @@ void spmv2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -338,7 +340,7 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
index f62ca1c1815..2e28de95f5d 100644
--- a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
@@ -17,108 +17,136 @@ namespace csr {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
-                                                 int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
+    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 // end
 
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index 909349ed7ab..f808e234670 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -278,7 +278,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
 {
     using arithmetic_type = typename output_accessor::arithmetic_type;
     using output_type = typename output_accessor::storage_type;
-    const arithmetic_type scale_factor = alpha[0];
+    const auto scale_factor = static_cast<arithmetic_type>(alpha[0]);
     spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c,
                 [&scale_factor](const arithmetic_type& x) {
                     return static_cast<output_type>(scale_factor * x);
@@ -486,7 +486,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce(
     const IndexType* __restrict__ last_row,
     const MatrixValueType* __restrict__ alpha, acc::range<output_accessor> c)
 {
-    const arithmetic_type alpha_val = alpha[0];
+    const auto alpha_val = static_cast<arithmetic_type>(alpha[0]);
     merge_path_reduce(
         nwarps, last_val, last_row, c,
         [&alpha_val](const arithmetic_type& x) { return alpha_val * x; });
@@ -1193,7 +1193,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
             const auto i = base_i + lane;
             const auto col = i < row_len
                                  ? local_cols[i]
-                                 : device_numeric_limits<IndexType>::max;
+                                 : device_numeric_limits<IndexType>::max();
             const auto rel_col = static_cast<int32>(col - min_col);
             const auto block = rel_col / bitmap_block_size;
             const auto col_in_block = rel_col % bitmap_block_size;
diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp
index d8391ace023..d0d4985dd82 100644
--- a/common/cuda_hip/matrix/dense_kernels.cpp
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -461,7 +461,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -491,7 +491,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -521,7 +521,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -544,7 +544,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -565,7 +565,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -598,7 +598,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -629,7 +629,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -657,7 +657,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -681,7 +681,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -706,7 +706,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -729,7 +729,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -760,7 +760,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -787,7 +788,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -812,7 +813,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -837,7 +839,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace dense
diff --git a/common/cuda_hip/matrix/diagonal_kernels.cpp b/common/cuda_hip/matrix/diagonal_kernels.cpp
index e12d3ed4f9f..baee454c36d 100644
--- a/common/cuda_hip/matrix/diagonal_kernels.cpp
+++ b/common/cuda_hip/matrix/diagonal_kernels.cpp
@@ -81,7 +81,7 @@ void apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp
index bfdd3f21e51..16371166662 100644
--- a/common/cuda_hip/matrix/ell_kernels.cpp
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -354,7 +354,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
         b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -388,7 +388,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
         b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
index a3beaac4a85..a7a0263cd35 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
@@ -17,26 +17,27 @@ namespace fbcsr {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 // end
 
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
index 23f865b6ace..e10cf10b540 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
@@ -564,7 +564,7 @@ void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
     if (grid_dim > 0) {
         kernel::transpose_blocks<mat_blk_sz, subwarp_size>
             <<<grid_dim, block_size, 0, exec->get_stream()>>>(
-                nbnz, mat->get_values());
+                nbnz, as_device_type(mat->get_values()));
     }
 }
 
diff --git a/common/cuda_hip/matrix/sellp_kernels.cpp b/common/cuda_hip/matrix/sellp_kernels.cpp
index 3e8fba395b3..4d37a0452a6 100644
--- a/common/cuda_hip/matrix/sellp_kernels.cpp
+++ b/common/cuda_hip/matrix/sellp_kernels.cpp
@@ -105,7 +105,8 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -131,7 +132,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
index 269708e19ae..ddda357fa31 100644
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
@@ -72,11 +72,11 @@ __device__ void device_classical_spmv(const size_type num_rows,
     const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
     const auto subid = subwarp_tile.thread_rank();
     const IndexType column_id = blockIdx.y;
-    const arithmetic_type value = val[0];
+    const auto value = static_cast<arithmetic_type>(val[0]);
     auto row = thread::get_subwarp_id_flat<subwarp_size>();
     for (; row < num_rows; row += subrow) {
         const auto ind_end = row_ptrs[row + 1];
-        arithmetic_type temp_val = zero<arithmetic_type>();
+        auto temp_val = zero<arithmetic_type>();
         for (auto ind = row_ptrs[row] + subid; ind < ind_end;
              ind += subwarp_size) {
             temp_val += value * b(col_idxs[ind], column_id);
@@ -138,7 +138,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                matrix::SparsityCsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -246,7 +246,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -264,7 +264,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -297,7 +297,7 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -320,7 +320,7 @@ void is_sorted_by_column_index(
     cpu_array = gpu_array;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp
index ce13d7500ab..233dffc6f37 100644
--- a/common/unified/matrix/coo_kernels.cpp
+++ b/common/unified/matrix/coo_kernels.cpp
@@ -38,7 +38,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
@@ -58,7 +58,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_row_idxs(), orig->get_const_col_idxs(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
 
 
diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp
index 5236c1c9da9..d5741bb3e1c 100644
--- a/common/unified/matrix/csr_kernels.cpp
+++ b/common/unified/matrix/csr_kernels.cpp
@@ -52,7 +52,7 @@ void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec,
         col_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
 
 
@@ -86,7 +86,7 @@ void inv_col_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
         col_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -102,7 +102,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         x->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -117,7 +118,8 @@ void inv_scale(std::shared_ptr<const DefaultExecutor> exec,
         x->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -152,7 +154,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
         output->get_col_idxs(), output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -183,7 +185,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
         output->get_col_idxs(), output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
@@ -227,7 +229,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
         result->get_coo_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp
index aca8ad5bec4..dcf48573fc6 100644
--- a/common/unified/matrix/dense_kernels.instantiate.cpp
+++ b/common/unified/matrix/dense_kernels.instantiate.cpp
@@ -12,87 +12,99 @@ namespace dense {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(
     GKO_DECLARE_DENSE_COPY_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
     GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
 // end
 
 
diff --git a/common/unified/matrix/diagonal_kernels.cpp b/common/unified/matrix/diagonal_kernels.cpp
index dae037a5134..75960e800d7 100644
--- a/common/unified/matrix/diagonal_kernels.cpp
+++ b/common/unified/matrix/diagonal_kernels.cpp
@@ -36,7 +36,8 @@ void apply_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size(), a->get_const_values(), b, c, inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -53,7 +54,7 @@ void right_apply_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size(), a->get_const_values(), b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
 
 
@@ -74,7 +75,7 @@ void right_apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         c->get_const_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
 
 
@@ -95,7 +96,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -120,7 +121,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -137,7 +138,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_size()[0], orig->get_const_values(), trans->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace diagonal
diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp
index 6d23e08b68b..24fc90a888e 100644
--- a/common/unified/matrix/ell_kernels.cpp
+++ b/common/unified/matrix/ell_kernels.cpp
@@ -67,7 +67,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -94,7 +94,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_values(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
 
 
@@ -121,7 +121,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_COPY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -150,7 +151,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -172,7 +173,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
         static_cast<int64>(source->get_stride()), source->get_const_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -198,7 +199,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_values(), diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp
index 8a21a2415f7..79a596febea 100644
--- a/common/unified/matrix/hybrid_kernels.cpp
+++ b/common/unified/matrix/hybrid_kernels.cpp
@@ -89,7 +89,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         result->get_coo_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -150,7 +150,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         coo_row_ptrs, result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp
index 3eaab65e8e6..4cdc7974e50 100644
--- a/common/unified/matrix/scaled_permutation_kernels.cpp
+++ b/common/unified/matrix/scaled_permutation_kernels.cpp
@@ -32,7 +32,7 @@ void invert(std::shared_ptr<const DefaultExecutor> exec,
         size, input_scale, input_permutation, output_scale, output_permutation);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
 
 
@@ -58,7 +58,7 @@ void compose(std::shared_ptr<const DefaultExecutor> exec,
         output_permutation, output_scale);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp
index 93b71ff43f2..23bfe160a69 100644
--- a/common/unified/matrix/sellp_kernels.cpp
+++ b/common/unified/matrix/sellp_kernels.cpp
@@ -87,7 +87,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -119,7 +119,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_values(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
 
 
@@ -149,7 +149,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_slice_sets(), source->get_const_col_idxs(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -183,7 +183,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
 
 
@@ -215,7 +215,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_values(), diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp b/common/unified/matrix/sparsity_csr_kernels.cpp
index c5a9c79a89b..b3f26358ad3 100644
--- a/common/unified/matrix/sparsity_csr_kernels.cpp
+++ b/common/unified/matrix/sparsity_csr_kernels.cpp
@@ -41,7 +41,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         input->get_const_col_idxs(), input->get_const_value(), output);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void diagonal_element_prefix_sum(
     components::prefix_sum_nonnegative(exec, prefix_sum, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
 
 
@@ -106,7 +106,7 @@ void remove_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
         matrix->get_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
 
 
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 224aacc7369..78b80ec2859 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -411,69 +411,93 @@ GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 namespace dense {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
-GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(GKO_DECLARE_DENSE_COPY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
 
 
 }  // namespace dense
@@ -482,13 +506,17 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
 namespace diagonal {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
 }  // namespace diagonal
@@ -675,17 +703,21 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 namespace sparsity_csr {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -695,38 +727,54 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(
 namespace csr {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
@@ -735,12 +783,14 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BENCHMARK_LOOKUP_KERNEL);
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SCALE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 }  // namespace csr
@@ -749,16 +799,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 namespace fbcsr {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
 }  // namespace fbcsr
@@ -767,12 +821,13 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 namespace coo {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace coo
@@ -781,15 +836,19 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 namespace ell {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace ell
@@ -822,8 +881,10 @@ namespace hybrid {
 
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL);
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
 }  // namespace hybrid
@@ -842,8 +903,10 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL);
 namespace scaled_permutation {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
 }  // namespace scaled_permutation
@@ -852,14 +915,18 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 namespace sellp {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace sellp
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 1368dc261c3..7b3b3876295 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -214,7 +214,7 @@ void Coo<ValueType, IndexType>::apply2_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(
-    Coo<next_precision<ValueType>, IndexType>* result) const
+    Coo<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->row_idxs_ = this->row_idxs_;
@@ -225,12 +225,35 @@ void Coo<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::move_to(
-    Coo<next_precision<ValueType>, IndexType>* result)
+    Coo<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Coo<ValueType, IndexType>::convert_to(
+    Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->row_idxs_ = this->row_idxs_;
+    result->col_idxs_ = this->col_idxs_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Coo<ValueType, IndexType>::move_to(
+    Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(
     Csr<ValueType, IndexType>* result) const
@@ -404,7 +427,7 @@ Coo<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_COO_MATRIX(ValueType, IndexType) \
     class Coo<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 897eb1a48db..1bb3e778478 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -304,7 +304,7 @@ void Csr<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision<ValueType>, IndexType>* result) const
+    Csr<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -316,11 +316,34 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision<ValueType>, IndexType>* result)
+    Csr<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::convert_to(
+    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->row_ptrs_ = this->row_ptrs_;
+    result->set_size(this->get_size());
+    convert_strategy_helper(result);
+}
+
+
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::move_to(
+    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
@@ -1047,7 +1070,7 @@ void Csr<ValueType, IndexType>::add_scaled_identity_impl(const LinOp* a,
 
 #define GKO_DECLARE_CSR_MATRIX(ValueType, IndexType) \
     class Csr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 367b0232969..071e689232e 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -582,7 +582,7 @@ Dense<ValueType>::Dense(Dense<ValueType>&& other) : Dense(other.get_executor())
 
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision<ValueType>>* result) const
+    Dense<next_precision_with_half<ValueType>>* result) const
 {
     if (result->get_size() != this->get_size()) {
         result->set_size(this->get_size());
@@ -597,12 +597,41 @@ void Dense<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Dense<ValueType>::move_to(Dense<next_precision<ValueType>>* result)
+void Dense<ValueType>::move_to(
+    Dense<next_precision_with_half<ValueType>>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType>
+void Dense<ValueType>::convert_to(
+    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result) const
+{
+    if (result->get_size() != this->get_size()) {
+        result->set_size(this->get_size());
+        result->stride_ = stride_;
+        result->values_.resize_and_reset(result->get_size()[0] *
+                                         result->stride_);
+    }
+    auto exec = this->get_executor();
+    exec->run(dense::make_copy(
+        this, make_temporary_output_clone(exec, result).get()));
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::move_to(
+    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType>
 template <typename IndexType>
 void Dense<ValueType>::convert_impl(Coo<ValueType, IndexType>* result) const
@@ -1519,7 +1548,8 @@ template <typename ValueType, typename Function>
 void gather_mixed_real_complex(Function fn, LinOp* out)
 {
 #ifdef GINKGO_MIXED_PRECISION
-    run<matrix::Dense, ValueType, next_precision<ValueType>>(out, fn);
+    run<matrix::Dense, ValueType, next_precision_with_half<ValueType>,
+        next_precision_with_half<next_precision_with_half<ValueType>>>(out, fn);
 #else
     precision_dispatch<ValueType>(fn, out);
 #endif
@@ -2029,7 +2059,7 @@ Dense<ValueType>::Dense(std::shared_ptr<const Executor> exec,
 
 
 #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp
index 1a442ffc789..85c5739b529 100644
--- a/core/matrix/diagonal.cpp
+++ b/core/matrix/diagonal.cpp
@@ -149,7 +149,7 @@ std::unique_ptr<LinOp> Diagonal<ValueType>::conj_transpose() const
 
 template <typename ValueType>
 void Diagonal<ValueType>::convert_to(
-    Diagonal<next_precision<ValueType>>* result) const
+    Diagonal<next_precision_with_half<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -157,12 +157,34 @@ void Diagonal<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Diagonal<ValueType>::move_to(Diagonal<next_precision<ValueType>>* result)
+void Diagonal<ValueType>::move_to(
+    Diagonal<next_precision_with_half<ValueType>>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType>
+void Diagonal<ValueType>::convert_to(
+    Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result) const
+{
+    result->values_ = this->values_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>
+void Diagonal<ValueType>::move_to(
+    Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType>
 void Diagonal<ValueType>::convert_to(Csr<ValueType, int32>* result) const
 {
@@ -373,7 +395,7 @@ std::unique_ptr<const Diagonal<ValueType>> Diagonal<ValueType>::create_const(
 
 
 #define GKO_DECLARE_DIAGONAL_MATRIX(value_type) class Diagonal<value_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_MATRIX);
 
 
 }  // namespace matrix
@@ -391,7 +413,7 @@ std::unique_ptr<LinOp> DiagonalExtractable<ValueType>::extract_diagonal_linop()
 #define GKO_DECLARE_DIAGONAL_EXTRACTABLE(value_type) \
     std::unique_ptr<LinOp>                           \
     DiagonalExtractable<value_type>::extract_diagonal_linop() const
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_EXTRACTABLE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_EXTRACTABLE);
 
 
 }  // namespace gko
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 600c2ceb9d2..eafd9fa9cad 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -154,7 +154,7 @@ void Ell<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision<ValueType>, IndexType>* result) const
+    Ell<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -166,12 +166,36 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision<ValueType>, IndexType>* result)
+    Ell<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::convert_to(
+    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_;
+    result->stride_ = this->stride_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::move_to(
+    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
@@ -401,7 +425,7 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_ELL_MATRIX(ValueType, IndexType) \
     class Ell<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 8ed9b117280..f1612be10e0 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -145,7 +145,7 @@ void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    Fbcsr<next_precision<ValueType>, IndexType>* result) const
+    Fbcsr<next_precision_with_half<ValueType>, IndexType>* const result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -158,12 +158,37 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::move_to(
-    Fbcsr<next_precision<ValueType>, IndexType>* result)
+    Fbcsr<next_precision_with_half<ValueType>, IndexType>* const result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Fbcsr<ValueType, IndexType>::convert_to(
+    Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+          IndexType>* const result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->row_ptrs_ = this->row_ptrs_;
+    result->set_size(this->get_size());
+    // block sizes are immutable except for assignment/conversion
+    result->bs_ = this->bs_;
+}
+
+
+template <typename ValueType, typename IndexType>
+void Fbcsr<ValueType, IndexType>::move_to(
+    Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+          IndexType>* const result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
@@ -474,7 +499,8 @@ Fbcsr<ValueType, IndexType>::Fbcsr(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_FBCSR_MATRIX(ValueType, IndexType) \
     class Fbcsr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index d450a0dfc35..72137558a10 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -203,7 +203,7 @@ void Hybrid<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::convert_to(
-    Hybrid<next_precision<ValueType>, IndexType>* result) const
+    Hybrid<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     this->ell_->convert_to(result->ell_);
     this->coo_->convert_to(result->coo_);
@@ -216,12 +216,37 @@ void Hybrid<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::move_to(
-    Hybrid<next_precision<ValueType>, IndexType>* result)
+    Hybrid<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Hybrid<ValueType, IndexType>::convert_to(
+    Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+           IndexType>* result) const
+{
+    this->ell_->convert_to(result->ell_.get());
+    this->coo_->convert_to(result->coo_.get());
+    // TODO set strategy correctly
+    // There is no way to correctly clone the strategy like in
+    // Csr::convert_to
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Hybrid<ValueType, IndexType>::move_to(
+    Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+           IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
@@ -418,7 +443,8 @@ Hybrid<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_HYBRID_MATRIX(ValueType, IndexType) \
     class Hybrid<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_HYBRID_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp
index 7e035be82a3..ecd93b6f959 100644
--- a/core/matrix/identity.cpp
+++ b/core/matrix/identity.cpp
@@ -83,9 +83,9 @@ std::unique_ptr<Identity<ValueType>> Identity<ValueType>::create(
 
 
 #define GKO_DECLARE_IDENTITY_MATRIX(_type) class Identity<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_MATRIX);
 #define GKO_DECLARE_IDENTITY_FACTORY(_type) class IdentityFactory<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_FACTORY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_FACTORY);
 
 
 }  // namespace matrix
diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp
index 0fe7ba2b2ce..b6b9ff2d7e4 100644
--- a/core/matrix/permutation.cpp
+++ b/core/matrix/permutation.cpp
@@ -267,8 +267,11 @@ void dispatch_dense(const LinOp* op, Functor fn)
 {
     using matrix::Dense;
     using std::complex;
-    run<Dense, double, float, std::complex<double>, std::complex<float>>(op,
-                                                                         fn);
+    run<Dense,
+#if GINKGO_ENABLE_HALF
+        gko::half, std::complex<gko::half>,
+#endif
+        double, float, std::complex<double>, std::complex<float>>(op, fn);
 }
 
 
diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp
index fecc60a0ca9..56fcbf93d88 100644
--- a/core/matrix/row_gatherer.cpp
+++ b/core/matrix/row_gatherer.cpp
@@ -4,6 +4,7 @@
 
 #include "ginkgo/core/matrix/row_gatherer.hpp"
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "core/base/dispatch_helper.hpp"
@@ -64,7 +65,11 @@ RowGatherer<IndexType>::create_const(
 template <typename IndexType>
 void RowGatherer<IndexType>::apply_impl(const LinOp* in, LinOp* out) const
 {
-    run<Dense, float, double, std::complex<float>, std::complex<double>>(
+    run<Dense,
+#if GINKGO_ENABLE_HALF
+        gko::half, std::complex<gko::half>,
+#endif
+        float, double, std::complex<float>, std::complex<double>>(
         in, [&](auto gather) { gather->row_gather(&row_idxs_, out); });
 }
 
@@ -72,7 +77,11 @@ template <typename IndexType>
 void RowGatherer<IndexType>::apply_impl(const LinOp* alpha, const LinOp* in,
                                         const LinOp* beta, LinOp* out) const
 {
-    run<Dense, float, double, std::complex<float>, std::complex<double>>(
+    run<Dense,
+#if GINKGO_ENABLE_HALF
+        gko::half, std::complex<gko::half>,
+#endif
+        float, double, std::complex<float>, std::complex<double>>(
         in,
         [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); });
 }
diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp
index 0f295d6b5be..bbe353e543e 100644
--- a/core/matrix/scaled_permutation.cpp
+++ b/core/matrix/scaled_permutation.cpp
@@ -174,7 +174,7 @@ void ScaledPermutation<ValueType, IndexType>::write(
 
 #define GKO_DECLARE_SCALED_PERMUTATION_MATRIX(ValueType, IndexType) \
     class ScaledPermutation<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SCALED_PERMUTATION_MATRIX);
 
 
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index a4787e758bf..bd81b08bada 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -176,7 +176,7 @@ void Sellp<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::convert_to(
-    Sellp<next_precision<ValueType>, IndexType>* result) const
+    Sellp<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -190,12 +190,38 @@ void Sellp<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::move_to(
-    Sellp<next_precision<ValueType>, IndexType>* result)
+    Sellp<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Sellp<ValueType, IndexType>::convert_to(
+    Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+          IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->slice_lengths_ = this->slice_lengths_;
+    result->slice_sets_ = this->slice_sets_;
+    result->slice_size_ = this->slice_size_;
+    result->stride_factor_ = this->stride_factor_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Sellp<ValueType, IndexType>::move_to(
+    Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+          IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
@@ -363,7 +389,8 @@ Sellp<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_SELLP_MATRIX(ValueType, IndexType) \
     class Sellp<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 9b8ea04da52..a4d8b2fa281 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -346,7 +346,8 @@ bool SparsityCsr<ValueType, IndexType>::is_sorted_by_column_index() const
 
 #define GKO_DECLARE_SPARSITY_MATRIX(ValueType, IndexType) \
     class SparsityCsr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SPARSITY_MATRIX);
 
 
 }  // namespace matrix
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index 595af92b33b..7e8a9acfac3 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -259,7 +259,8 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -274,7 +275,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -311,7 +312,8 @@ void spmv2(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -350,7 +352,7 @@ void advanced_spmv2(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 4dce0aa6ac2..efcb9b7f470 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -31,6 +31,7 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/onemkl_bindings.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -266,7 +267,7 @@ void abstract_spmv(
 {
     using arithmetic_type = typename output_accessor::arithmetic_type;
     using output_type = typename output_accessor::storage_type;
-    const arithmetic_type scale_factor = alpha[0];
+    const auto scale_factor = static_cast<arithmetic_type>(alpha[0]);
     spmv_kernel(
         nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c,
         [&scale_factor](const arithmetic_type& x) {
@@ -479,8 +480,8 @@ void abstract_merge_path_spmv(
     sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs)
 {
     using type = typename output_accessor::arithmetic_type;
-    const type alpha_val = alpha[0];
-    const type beta_val = beta[0];
+    const type alpha_val = static_cast<type>(alpha[0]);
+    const type beta_val = static_cast<type>(beta[0]);
     merge_path_spmv<items_per_thread>(
         num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out,
         [&alpha_val](const type& x) { return alpha_val * x; },
@@ -566,7 +567,7 @@ void abstract_reduce(
     uninitialized_array<IndexType, spmv_block_size>& tmp_ind,
     uninitialized_array<arithmetic_type, spmv_block_size>& tmp_val)
 {
-    const arithmetic_type alpha_val = alpha[0];
+    const auto alpha_val = static_cast<arithmetic_type>(alpha[0]);
     merge_path_reduce(
         nwarps, last_val, last_row, c,
         [&alpha_val](const arithmetic_type& x) { return alpha_val * x; },
@@ -694,8 +695,8 @@ void abstract_classical_spmv(
     acc::range<output_accessor> c, sycl::nd_item<3> item_ct1)
 {
     using type = typename output_accessor::arithmetic_type;
-    const type alpha_val = alpha[0];
-    const type beta_val = beta[0];
+    const type alpha_val = static_cast<type>(alpha[0]);
+    const type beta_val = static_cast<type>(beta[0]);
     device_classical_spmv<subgroup_size>(
         num_rows, val, col_idxs, row_ptrs, b, c,
         [&alpha_val, &beta_val](const type& x, const type& y) {
@@ -1393,8 +1394,9 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DpcppExecutor> exec,
                                 const ValueType host_beta,
                                 matrix::Dense<ValueType>* c)
 {
-    bool try_sparselib = !is_complex<ValueType>();
-    if (try_sparselib) {
+    constexpr bool try_sparselib =
+        !is_complex<ValueType>() && !std::is_same<ValueType, gko::half>::value;
+    if constexpr (try_sparselib) {
         oneapi::mkl::sparse::matrix_handle_t mat_handle;
         oneapi::mkl::sparse::init_matrix_handle(&mat_handle);
         oneapi::mkl::sparse::set_csr_data(
@@ -1532,7 +1534,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -1604,7 +1606,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -1684,7 +1686,7 @@ void calculate_nonzeros_per_row_in_span(
                              row_nnz->get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -1696,7 +1698,7 @@ void calculate_nonzeros_per_row_in_index_set(
     const gko::index_set<IndexType>& col_index_set,
     IndexType* row_nnz) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -1723,7 +1725,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
         result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -1735,7 +1737,7 @@ void compute_submatrix_from_index_set(
     const gko::index_set<IndexType>& col_index_set,
     matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -1997,7 +1999,8 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2130,7 +2133,7 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -2216,7 +2219,8 @@ void spgeam(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2237,7 +2241,7 @@ void fill_in_dense(std::shared_ptr<const DpcppExecutor> exec,
                           result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -2247,7 +2251,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       array<IndexType>& row_ptrs, array<IndexType>& col_idxs,
                       array<ValueType>& values) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -2310,7 +2314,8 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
     generic_transpose<false>(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2321,7 +2326,7 @@ void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
     generic_transpose<true>(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -2347,7 +2352,7 @@ void inv_symm_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_col_idxs(), permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -2374,7 +2379,7 @@ void inv_nonsymm_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -2400,7 +2405,7 @@ void row_permute(std::shared_ptr<const DpcppExecutor> exec,
         row_permuted->get_col_idxs(), row_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -2426,7 +2431,7 @@ void inv_row_permute(std::shared_ptr<const DpcppExecutor> exec,
         row_permuted->get_col_idxs(), row_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -2452,7 +2457,7 @@ void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_col_idxs(), permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -2482,7 +2487,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -2508,7 +2513,7 @@ void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         row_permuted->get_col_idxs(), row_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -2534,7 +2539,7 @@ void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         row_permuted->get_col_idxs(), row_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -2592,7 +2597,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -2624,7 +2629,7 @@ void is_sorted_by_column_index(
     *is_sorted = get_element(is_sorted_device_array, 0);
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -2648,7 +2653,8 @@ void extract_diagonal(std::shared_ptr<const DpcppExecutor> exec,
                              orig_row_ptrs, orig_col_idxs, diag_values);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2672,7 +2678,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -2695,7 +2701,7 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
         mtx->get_const_col_idxs(), mtx->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 04f3229eaed..c6eb163bc7d 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -177,7 +177,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -192,7 +192,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -206,7 +206,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -217,21 +217,26 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Dense<ValueType>* c)
 {
     using namespace oneapi::mkl;
-    if (b->get_stride() != 0 && c->get_stride() != 0) {
-        if (a->get_size()[1] > 0) {
-            oneapi::mkl::blas::row_major::gemm(
-                *exec->get_queue(), transpose::nontrans, transpose::nontrans,
-                c->get_size()[0], c->get_size()[1], a->get_size()[1],
-                one<ValueType>(), a->get_const_values(), a->get_stride(),
-                b->get_const_values(), b->get_stride(), zero<ValueType>(),
-                c->get_values(), c->get_stride());
-        } else {
-            dense::fill(exec, c, zero<ValueType>());
+    if constexpr (onemkl::is_supported<ValueType>::value) {
+        if (b->get_stride() != 0 && c->get_stride() != 0) {
+            if (a->get_size()[1] > 0) {
+                oneapi::mkl::blas::row_major::gemm(
+                    *exec->get_queue(), transpose::nontrans,
+                    transpose::nontrans, c->get_size()[0], c->get_size()[1],
+                    a->get_size()[1], one<ValueType>(), a->get_const_values(),
+                    a->get_stride(), b->get_const_values(), b->get_stride(),
+                    zero<ValueType>(), c->get_values(), c->get_stride());
+            } else {
+                dense::fill(exec, c, zero<ValueType>());
+            }
         }
+    } else {
+        GKO_NOT_IMPLEMENTED;
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -241,23 +246,28 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
     using namespace oneapi::mkl;
-    if (b->get_stride() != 0 && c->get_stride() != 0) {
-        if (a->get_size()[1] > 0) {
-            oneapi::mkl::blas::row_major::gemm(
-                *exec->get_queue(), transpose::nontrans, transpose::nontrans,
-                c->get_size()[0], c->get_size()[1], a->get_size()[1],
-                exec->copy_val_to_host(alpha->get_const_values()),
-                a->get_const_values(), a->get_stride(), b->get_const_values(),
-                b->get_stride(),
-                exec->copy_val_to_host(beta->get_const_values()),
-                c->get_values(), c->get_stride());
-        } else {
-            dense::scale(exec, beta, c);
+    if constexpr (onemkl::is_supported<ValueType>::value) {
+        if (b->get_stride() != 0 && c->get_stride() != 0) {
+            if (a->get_size()[1] > 0) {
+                oneapi::mkl::blas::row_major::gemm(
+                    *exec->get_queue(), transpose::nontrans,
+                    transpose::nontrans, c->get_size()[0], c->get_size()[1],
+                    a->get_size()[1],
+                    exec->copy_val_to_host(alpha->get_const_values()),
+                    a->get_const_values(), a->get_stride(),
+                    b->get_const_values(), b->get_stride(),
+                    exec->copy_val_to_host(beta->get_const_values()),
+                    c->get_values(), c->get_stride());
+            } else {
+                dense::scale(exec, beta, c);
+            }
         }
+    } else {
+        GKO_NOT_IMPLEMENTED;
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -292,7 +302,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -326,7 +336,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -365,7 +375,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -375,7 +385,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       matrix::Fbcsr<ValueType, IndexType>* result)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -385,7 +395,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
                                   int bs,
                                   IndexType* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -441,7 +451,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -484,7 +494,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -516,7 +526,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -538,7 +548,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
         queue, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -565,7 +576,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                                 trans->get_values(), trans->get_stride());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace dense
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index 2b63138abbe..272a6dbd581 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -82,7 +82,7 @@ void apply_to_csr(std::shared_ptr<const DpcppExecutor> exec,
                          inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index a97cb602d52..b33ed28b12d 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -415,7 +415,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
         exec, num_worker_per_row, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -451,7 +451,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         exec, num_worker_per_row, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp
index e9eb02f5fb2..7d53b862d67 100644
--- a/dpcpp/matrix/fbcsr_kernels.dp.cpp
+++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp
@@ -32,7 +32,8 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
           const matrix::Dense<ValueType>* b,
           matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -43,7 +44,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
                    const matrix::Dense<ValueType>* beta,
                    matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -54,7 +55,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
                          array<IndexType>& col_idxs,
                          array<ValueType>& values) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -63,7 +64,7 @@ void fill_in_dense(std::shared_ptr<const DpcppExecutor> exec,
                    const matrix::Fbcsr<ValueType, IndexType>* source,
                    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -73,7 +74,7 @@ void convert_to_csr(const std::shared_ptr<const DpcppExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* result)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -82,7 +83,7 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
                const matrix::Fbcsr<ValueType, IndexType>* orig,
                matrix::Fbcsr<ValueType, IndexType>* trans) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -92,7 +93,7 @@ void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
                     matrix::Fbcsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -102,7 +103,7 @@ void is_sorted_by_column_index(
     const matrix::Fbcsr<ValueType, IndexType>* to_check,
     bool* is_sorted) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -111,7 +112,7 @@ void sort_by_column_index(const std::shared_ptr<const DpcppExecutor> exec,
                           matrix::Fbcsr<ValueType, IndexType>* to_sort)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -120,7 +121,7 @@ void extract_diagonal(std::shared_ptr<const DpcppExecutor> exec,
                       const matrix::Fbcsr<ValueType, IndexType>* orig,
                       matrix::Diagonal<ValueType>* diag) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp
index 9c0fe717e8a..e83e8f2ce1a 100644
--- a/dpcpp/matrix/sellp_kernels.dp.cpp
+++ b/dpcpp/matrix/sellp_kernels.dp.cpp
@@ -119,7 +119,8 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
                 b->get_const_values(), c->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -142,7 +143,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         beta->get_const_values(), c->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 66c57ac5b35..0e076794ac8 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -57,11 +57,11 @@ void device_classical_spmv(const size_type num_rows,
     const auto subrow = thread::get_subwarp_num_flat<subgroup_size>(item_ct1);
     const auto subid = subgroup_tile.thread_rank();
     const IndexType column_id = item_ct1.get_group(1);
-    const arithmetic_type value = static_cast<arithmetic_type>(val[0]);
+    const auto value = static_cast<arithmetic_type>(val[0]);
     auto row = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
     for (; row < num_rows; row += subrow) {
         const auto ind_end = row_ptrs[row + 1];
-        arithmetic_type temp_val = zero<arithmetic_type>();
+        auto temp_val = zero<arithmetic_type>();
         for (auto ind = row_ptrs[row] + subid; ind < ind_end;
              ind += subgroup_size) {
             temp_val += value * b(col_idxs[ind], column_id);
@@ -237,7 +237,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -255,7 +255,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -265,7 +265,7 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
                matrix::SparsityCsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -290,7 +290,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -324,7 +324,7 @@ void is_sorted_by_column_index(
     cpu_array = gpu_array;
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 8875b7d46f3..ad31a6b19e8 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -48,13 +48,15 @@ make_temporary_conversion(Ptr&& matrix)
 {
     using Pointee = detail::pointee<Ptr>;
     using Dense = matrix::Dense<ValueType>;
-    using NextDense = matrix::Dense<next_precision<ValueType>>;
+    using NextDense = matrix::Dense<next_precision_with_half<ValueType>>;
+    using NextNextDense = matrix::Dense<
+        next_precision_with_half<next_precision_with_half<ValueType>>>;
     using MaybeConstDense =
         std::conditional_t<std::is_const<Pointee>::value, const Dense, Dense>;
     auto result = detail::temporary_conversion<
-        MaybeConstDense>::template create<NextDense>(matrix);
+        MaybeConstDense>::template create<NextDense, NextNextDense>(matrix);
     if (!result) {
-        GKO_NOT_SUPPORTED(*matrix);
+        GKO_NOT_SUPPORTED(matrix);
     }
     return result;
 }
@@ -226,23 +228,26 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out)
 {
 #ifdef GINKGO_MIXED_PRECISION
     using fst_type = matrix::Dense<ValueType>;
-    using snd_type = matrix::Dense<next_precision<ValueType>>;
-    if (auto dense_in = dynamic_cast<const fst_type*>(in)) {
+    using snd_type = matrix::Dense<next_precision_with_half<ValueType>>;
+    using trd_type = matrix::Dense<
+        next_precision_with_half<next_precision_with_half<ValueType>>>;
+    auto dispatch_out_vector = [&](auto dense_in) {
         if (auto dense_out = dynamic_cast<fst_type*>(out)) {
             fn(dense_in, dense_out);
         } else if (auto dense_out = dynamic_cast<snd_type*>(out)) {
             fn(dense_in, dense_out);
-        } else {
-            GKO_NOT_SUPPORTED(out);
-        }
-    } else if (auto dense_in = dynamic_cast<const snd_type*>(in)) {
-        if (auto dense_out = dynamic_cast<fst_type*>(out)) {
-            fn(dense_in, dense_out);
-        } else if (auto dense_out = dynamic_cast<snd_type*>(out)) {
+        } else if (auto dense_out = dynamic_cast<trd_type*>(out)) {
             fn(dense_in, dense_out);
         } else {
             GKO_NOT_SUPPORTED(out);
         }
+    };
+    if (auto dense_in = dynamic_cast<const fst_type*>(in)) {
+        dispatch_out_vector(dense_in);
+    } else if (auto dense_in = dynamic_cast<const snd_type*>(in)) {
+        dispatch_out_vector(dense_in);
+    } else if (auto dense_in = dynamic_cast<const trd_type*>(in)) {
+        dispatch_out_vector(dense_in);
     } else {
         GKO_NOT_SUPPORTED(in);
     }
diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp
index 9373107df69..a0edf5aa862 100644
--- a/include/ginkgo/core/matrix/coo.hpp
+++ b/include/ginkgo/core/matrix/coo.hpp
@@ -47,15 +47,21 @@ class Hybrid;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
-            public ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>,
-            public ConvertibleTo<Csr<ValueType, IndexType>>,
-            public ConvertibleTo<Dense<ValueType>>,
-            public DiagonalExtractable<ValueType>,
-            public ReadableFromMatrixData<ValueType, IndexType>,
-            public WritableToMatrixData<ValueType, IndexType>,
-            public EnableAbsoluteComputation<
-                remove_complex<Coo<ValueType, IndexType>>> {
+class Coo
+    : public EnableLinOp<Coo<ValueType, IndexType>>,
+      public ConvertibleTo<Coo<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>,
+#endif
+      public ConvertibleTo<Csr<ValueType, IndexType>>,
+      public ConvertibleTo<Dense<ValueType>>,
+      public DiagonalExtractable<ValueType>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType>,
+      public EnableAbsoluteComputation<
+          remove_complex<Coo<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Coo, LinOp>;
     friend class Csr<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -66,8 +72,10 @@ class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
 public:
     using EnableLinOp<Coo>::convert_to;
     using EnableLinOp<Coo>::move_to;
-    using ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<
+        Coo<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Coo<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
@@ -80,12 +88,33 @@ class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Coo>;
 
-    friend class Coo<next_precision<ValueType>, IndexType>;
+    friend class Coo<previous_precision_with_half<ValueType>, IndexType>;
+
+    void convert_to(Coo<next_precision_with_half<ValueType>, IndexType>* result)
+        const override;
+
+    void move_to(
+        Coo<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Coo<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::convert_to;
+    using ConvertibleTo<
+        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::move_to;
 
     void convert_to(
-        Coo<next_precision<ValueType>, IndexType>* result) const override;
+        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) const override;
 
-    void move_to(Coo<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) override;
+#endif
 
     void convert_to(Csr<ValueType, IndexType>* other) const override;
 
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index f27fe12a934..2f66683085f 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -98,23 +98,29 @@ void strategy_rebuild_helper(Csr<ValueType, IndexType>* result);
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
-            public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
-            public ConvertibleTo<Dense<ValueType>>,
-            public ConvertibleTo<Coo<ValueType, IndexType>>,
-            public ConvertibleTo<Ell<ValueType, IndexType>>,
-            public ConvertibleTo<Fbcsr<ValueType, IndexType>>,
-            public ConvertibleTo<Hybrid<ValueType, IndexType>>,
-            public ConvertibleTo<Sellp<ValueType, IndexType>>,
-            public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
-            public DiagonalExtractable<ValueType>,
-            public ReadableFromMatrixData<ValueType, IndexType>,
-            public WritableToMatrixData<ValueType, IndexType>,
-            public Transposable,
-            public Permutable<IndexType>,
-            public EnableAbsoluteComputation<
-                remove_complex<Csr<ValueType, IndexType>>>,
-            public ScaledIdentityAddable {
+class Csr
+    : public EnableLinOp<Csr<ValueType, IndexType>>,
+      public ConvertibleTo<Csr<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>,
+#endif
+      public ConvertibleTo<Dense<ValueType>>,
+      public ConvertibleTo<Coo<ValueType, IndexType>>,
+      public ConvertibleTo<Ell<ValueType, IndexType>>,
+      public ConvertibleTo<Fbcsr<ValueType, IndexType>>,
+      public ConvertibleTo<Hybrid<ValueType, IndexType>>,
+      public ConvertibleTo<Sellp<ValueType, IndexType>>,
+      public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
+      public DiagonalExtractable<ValueType>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType>,
+      public Transposable,
+      public Permutable<IndexType>,
+      public EnableAbsoluteComputation<
+          remove_complex<Csr<ValueType, IndexType>>>,
+      public ScaledIdentityAddable {
     friend class EnablePolymorphicObject<Csr, LinOp>;
     friend class Coo<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -130,8 +136,10 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 public:
     using EnableLinOp<Csr>::convert_to;
     using EnableLinOp<Csr>::move_to;
-    using ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Coo<ValueType, IndexType>>::convert_to;
@@ -688,12 +696,33 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         index_type max_length_per_row_;
     };
 
-    friend class Csr<next_precision<ValueType>, IndexType>;
+    friend class Csr<previous_precision_with_half<ValueType>, IndexType>;
+
+    void convert_to(Csr<next_precision_with_half<ValueType>, IndexType>* result)
+        const override;
+
+    void move_to(
+        Csr<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Csr<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::convert_to;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::move_to;
 
     void convert_to(
-        Csr<next_precision<ValueType>, IndexType>* result) const override;
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) const override;
 
-    void move_to(Csr<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) override;
+#endif
 
     void convert_to(Dense<ValueType>* other) const override;
 
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index bccd3adcd54..9ae96ca46d6 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -87,7 +87,11 @@ class SparsityCsr;
 template <typename ValueType = default_precision>
 class Dense
     : public EnableLinOp<Dense<ValueType>>,
-      public ConvertibleTo<Dense<next_precision<ValueType>>>,
+      public ConvertibleTo<Dense<next_precision_with_half<ValueType>>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Dense<next_precision_with_half<next_precision_with_half<ValueType>>>>,
+#endif
       public ConvertibleTo<Coo<ValueType, int32>>,
       public ConvertibleTo<Coo<ValueType, int64>>,
       public ConvertibleTo<Csr<ValueType, int32>>,
@@ -135,8 +139,8 @@ class Dense
 public:
     using EnableLinOp<Dense>::convert_to;
     using EnableLinOp<Dense>::move_to;
-    using ConvertibleTo<Dense<next_precision<ValueType>>>::convert_to;
-    using ConvertibleTo<Dense<next_precision<ValueType>>>::move_to;
+    using ConvertibleTo<Dense<next_precision_with_half<ValueType>>>::convert_to;
+    using ConvertibleTo<Dense<next_precision_with_half<ValueType>>>::move_to;
     using ConvertibleTo<Coo<ValueType, int32>>::convert_to;
     using ConvertibleTo<Coo<ValueType, int32>>::move_to;
     using ConvertibleTo<Coo<ValueType, int64>>::convert_to;
@@ -276,11 +280,29 @@ class Dense
         return other->create_const_view_of_impl();
     }
 
-    friend class Dense<next_precision<ValueType>>;
+    friend class Dense<previous_precision_with_half<ValueType>>;
 
-    void convert_to(Dense<next_precision<ValueType>>* result) const override;
+    void convert_to(
+        Dense<next_precision_with_half<ValueType>>* result) const override;
 
-    void move_to(Dense<next_precision<ValueType>>* result) override;
+    void move_to(Dense<next_precision_with_half<ValueType>>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Dense<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
+    using ConvertibleTo<Dense<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::convert_to;
+    using ConvertibleTo<Dense<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::move_to;
+
+    void convert_to(
+        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) const override;
+
+    void move_to(
+        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) override;
+#endif
 
     void convert_to(Coo<ValueType, int32>* result) const override;
 
diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp
index 56906a4d96f..3b11399138b 100644
--- a/include/ginkgo/core/matrix/diagonal.hpp
+++ b/include/ginkgo/core/matrix/diagonal.hpp
@@ -41,7 +41,11 @@ class Diagonal
     : public EnableLinOp<Diagonal<ValueType>>,
       public ConvertibleTo<Csr<ValueType, int32>>,
       public ConvertibleTo<Csr<ValueType, int64>>,
-      public ConvertibleTo<Diagonal<next_precision<ValueType>>>,
+      public ConvertibleTo<Diagonal<next_precision_with_half<ValueType>>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<Diagonal<
+          next_precision_with_half<next_precision_with_half<ValueType>>>>,
+#endif
       public Transposable,
       public WritableToMatrixData<ValueType, int32>,
       public WritableToMatrixData<ValueType, int64>,
@@ -60,8 +64,9 @@ class Diagonal
     using ConvertibleTo<Csr<ValueType, int32>>::move_to;
     using ConvertibleTo<Csr<ValueType, int64>>::convert_to;
     using ConvertibleTo<Csr<ValueType, int64>>::move_to;
-    using ConvertibleTo<Diagonal<next_precision<ValueType>>>::convert_to;
-    using ConvertibleTo<Diagonal<next_precision<ValueType>>>::move_to;
+    using ConvertibleTo<
+        Diagonal<next_precision_with_half<ValueType>>>::convert_to;
+    using ConvertibleTo<Diagonal<next_precision_with_half<ValueType>>>::move_to;
 
     using value_type = ValueType;
     using index_type = int64;
@@ -71,15 +76,34 @@ class Diagonal
     using device_mat_data32 = device_matrix_data<ValueType, int32>;
     using absolute_type = remove_complex<Diagonal>;
 
-    friend class Diagonal<next_precision<ValueType>>;
+    friend class Diagonal<previous_precision_with_half<ValueType>>;
 
     std::unique_ptr<LinOp> transpose() const override;
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
-    void convert_to(Diagonal<next_precision<ValueType>>* result) const override;
+    void convert_to(
+        Diagonal<next_precision_with_half<ValueType>>* result) const override;
 
-    void move_to(Diagonal<next_precision<ValueType>>* result) override;
+    void move_to(
+        Diagonal<next_precision_with_half<ValueType>>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Diagonal<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
+    using ConvertibleTo<Diagonal<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::convert_to;
+    using ConvertibleTo<Diagonal<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::move_to;
+
+    void convert_to(
+        Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) const override;
+
+    void move_to(
+        Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) override;
+#endif
 
     void convert_to(Csr<ValueType, int32>* result) const override;
 
diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp
index 37f4c0e7f55..adbd3505855 100644
--- a/include/ginkgo/core/matrix/ell.hpp
+++ b/include/ginkgo/core/matrix/ell.hpp
@@ -49,28 +49,36 @@ class Hybrid;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
-            public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>,
-            public ConvertibleTo<Dense<ValueType>>,
-            public ConvertibleTo<Csr<ValueType, IndexType>>,
-            public DiagonalExtractable<ValueType>,
-            public ReadableFromMatrixData<ValueType, IndexType>,
-            public WritableToMatrixData<ValueType, IndexType>,
-            public EnableAbsoluteComputation<
-                remove_complex<Ell<ValueType, IndexType>>> {
+class Ell
+    : public EnableLinOp<Ell<ValueType, IndexType>>,
+      public ConvertibleTo<Ell<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>,
+#endif
+      public ConvertibleTo<Dense<ValueType>>,
+      public ConvertibleTo<Csr<ValueType, IndexType>>,
+      public DiagonalExtractable<ValueType>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType>,
+      public EnableAbsoluteComputation<
+          remove_complex<Ell<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Ell, LinOp>;
     friend class Dense<ValueType>;
     friend class Coo<ValueType, IndexType>;
     friend class Csr<ValueType, IndexType>;
     friend class Ell<to_complex<ValueType>, IndexType>;
-    friend class Ell<next_precision<ValueType>, IndexType>;
+    friend class Ell<previous_precision_with_half<ValueType>, IndexType>;
     friend class Hybrid<ValueType, IndexType>;
 
 public:
     using EnableLinOp<Ell>::convert_to;
     using EnableLinOp<Ell>::move_to;
-    using ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -83,10 +91,31 @@ class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Ell>;
 
+    void convert_to(Ell<next_precision_with_half<ValueType>, IndexType>* result)
+        const override;
+
+    void move_to(
+        Ell<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Ell<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::convert_to;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::move_to;
+
     void convert_to(
-        Ell<next_precision<ValueType>, IndexType>* result) const override;
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) const override;
 
-    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) override;
+#endif
 
     void convert_to(Dense<ValueType>* other) const override;
 
diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp
index ce327e7e8a0..283807b242c 100644
--- a/include/ginkgo/core/matrix/fbcsr.hpp
+++ b/include/ginkgo/core/matrix/fbcsr.hpp
@@ -96,17 +96,24 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size)
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Fbcsr : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
-              public ConvertibleTo<Fbcsr<next_precision<ValueType>, IndexType>>,
-              public ConvertibleTo<Dense<ValueType>>,
-              public ConvertibleTo<Csr<ValueType, IndexType>>,
-              public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
-              public DiagonalExtractable<ValueType>,
-              public ReadableFromMatrixData<ValueType, IndexType>,
-              public WritableToMatrixData<ValueType, IndexType>,
-              public Transposable,
-              public EnableAbsoluteComputation<
-                  remove_complex<Fbcsr<ValueType, IndexType>>> {
+class Fbcsr
+    : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
+      public ConvertibleTo<
+          Fbcsr<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+                IndexType>>,
+#endif
+      public ConvertibleTo<Dense<ValueType>>,
+      public ConvertibleTo<Csr<ValueType, IndexType>>,
+      public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
+      public DiagonalExtractable<ValueType>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType>,
+      public Transposable,
+      public EnableAbsoluteComputation<
+          remove_complex<Fbcsr<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Fbcsr, LinOp>;
     friend class Csr<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -136,8 +143,9 @@ class Fbcsr : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
     using EnableLinOp<Fbcsr<ValueType, IndexType>>::convert_to;
 
     using ConvertibleTo<
-        Fbcsr<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Fbcsr<next_precision<ValueType>, IndexType>>::move_to;
+        Fbcsr<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Fbcsr<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -145,12 +153,33 @@ class Fbcsr : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
     using ConvertibleTo<SparsityCsr<ValueType, IndexType>>::convert_to;
     using ConvertibleTo<SparsityCsr<ValueType, IndexType>>::move_to;
 
-    friend class Fbcsr<next_precision<ValueType>, IndexType>;
+    friend class Fbcsr<previous_precision_with_half<ValueType>, IndexType>;
+
+    void convert_to(Fbcsr<next_precision_with_half<ValueType>, IndexType>*
+                        result) const override;
+
+    void move_to(
+        Fbcsr<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Fbcsr<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>::convert_to;
+    using ConvertibleTo<
+        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>::move_to;
 
     void convert_to(
-        Fbcsr<next_precision<ValueType>, IndexType>* result) const override;
+        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>* result) const override;
 
-    void move_to(Fbcsr<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>* result) override;
+#endif
 
     void convert_to(Dense<ValueType>* other) const override;
 
diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp
index 5e995cb0ba0..24cb3ed26c7 100644
--- a/include/ginkgo/core/matrix/hybrid.hpp
+++ b/include/ginkgo/core/matrix/hybrid.hpp
@@ -41,7 +41,13 @@ class Csr;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Hybrid
     : public EnableLinOp<Hybrid<ValueType, IndexType>>,
-      public ConvertibleTo<Hybrid<next_precision<ValueType>, IndexType>>,
+      public ConvertibleTo<
+          Hybrid<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+                 IndexType>>,
+#endif
       public ConvertibleTo<Dense<ValueType>>,
       public ConvertibleTo<Csr<ValueType, IndexType>>,
       public DiagonalExtractable<ValueType>,
@@ -59,8 +65,9 @@ class Hybrid
     using EnableLinOp<Hybrid>::convert_to;
     using EnableLinOp<Hybrid>::move_to;
     using ConvertibleTo<
-        Hybrid<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Hybrid<next_precision<ValueType>, IndexType>>::move_to;
+        Hybrid<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Hybrid<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -355,12 +362,33 @@ class Hybrid
         imbalance_bounded_limit strategy_;
     };
 
-    friend class Hybrid<next_precision<ValueType>, IndexType>;
+    friend class Hybrid<previous_precision_with_half<ValueType>, IndexType>;
+
+    void convert_to(Hybrid<next_precision_with_half<ValueType>, IndexType>*
+                        result) const override;
+
+    void move_to(Hybrid<next_precision_with_half<ValueType>, IndexType>* result)
+        override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Hybrid<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+               IndexType>>::convert_to;
+    using ConvertibleTo<
+        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+               IndexType>>::move_to;
 
     void convert_to(
-        Hybrid<next_precision<ValueType>, IndexType>* result) const override;
+        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+               IndexType>* result) const override;
 
-    void move_to(Hybrid<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
+               IndexType>* result) override;
+#endif
 
     void convert_to(Dense<ValueType>* other) const override;
 
diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp
index e6520324030..6140a832c85 100644
--- a/include/ginkgo/core/matrix/sellp.hpp
+++ b/include/ginkgo/core/matrix/sellp.hpp
@@ -40,15 +40,22 @@ class Csr;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
-              public ConvertibleTo<Sellp<next_precision<ValueType>, IndexType>>,
-              public ConvertibleTo<Dense<ValueType>>,
-              public ConvertibleTo<Csr<ValueType, IndexType>>,
-              public DiagonalExtractable<ValueType>,
-              public ReadableFromMatrixData<ValueType, IndexType>,
-              public WritableToMatrixData<ValueType, IndexType>,
-              public EnableAbsoluteComputation<
-                  remove_complex<Sellp<ValueType, IndexType>>> {
+class Sellp
+    : public EnableLinOp<Sellp<ValueType, IndexType>>,
+      public ConvertibleTo<
+          Sellp<next_precision_with_half<ValueType>, IndexType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+                IndexType>>,
+#endif
+      public ConvertibleTo<Dense<ValueType>>,
+      public ConvertibleTo<Csr<ValueType, IndexType>>,
+      public DiagonalExtractable<ValueType>,
+      public ReadableFromMatrixData<ValueType, IndexType>,
+      public WritableToMatrixData<ValueType, IndexType>,
+      public EnableAbsoluteComputation<
+          remove_complex<Sellp<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Sellp, LinOp>;
     friend class Dense<ValueType>;
     friend class Csr<ValueType, IndexType>;
@@ -58,8 +65,9 @@ class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
     using EnableLinOp<Sellp>::convert_to;
     using EnableLinOp<Sellp>::move_to;
     using ConvertibleTo<
-        Sellp<next_precision<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<Sellp<next_precision<ValueType>, IndexType>>::move_to;
+        Sellp<next_precision_with_half<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<
+        Sellp<next_precision_with_half<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -72,12 +80,33 @@ class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Sellp>;
 
-    friend class Sellp<next_precision<ValueType>, IndexType>;
+    friend class Sellp<previous_precision_with_half<ValueType>, IndexType>;
+
+    void convert_to(Sellp<next_precision_with_half<ValueType>, IndexType>*
+                        result) const override;
+
+    void move_to(
+        Sellp<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Sellp<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>::convert_to;
+    using ConvertibleTo<
+        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>::move_to;
 
     void convert_to(
-        Sellp<next_precision<ValueType>, IndexType>* result) const override;
+        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>* result) const override;
 
-    void move_to(Sellp<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>* result) override;
+#endif
 
     void convert_to(Dense<ValueType>* other) const override;
 
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 021795d8e9c..6d4a46b7ed3 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -42,7 +42,8 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -57,7 +58,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -306,7 +307,8 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
     generic_spmv2(exec, a, b, c, one<ValueType>());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -319,7 +321,7 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
     generic_spmv2(exec, a, b, c, alpha->at(0, 0));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 87b328b1093..d9c7b9840c1 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -77,7 +77,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -95,8 +95,8 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
 
     auto row_ptrs = a->get_const_row_ptrs();
     auto col_idxs = a->get_const_col_idxs();
-    arithmetic_type valpha = alpha->at(0, 0);
-    arithmetic_type vbeta = beta->at(0, 0);
+    auto valpha = static_cast<arithmetic_type>(alpha->at(0, 0));
+    auto vbeta = static_cast<arithmetic_type>(beta->at(0, 0));
 
     const auto a_vals =
         acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
@@ -118,7 +118,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -374,7 +374,8 @@ void spgemm(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -490,7 +491,7 @@ void advanced_spgemm(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -540,7 +541,8 @@ void spgeam(std::shared_ptr<const OmpExecutor> exec,
         [](IndexType, IndexType) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -563,7 +565,7 @@ void fill_in_dense(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -633,7 +635,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -692,7 +694,8 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -704,7 +707,7 @@ void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -728,7 +731,7 @@ void calculate_nonzeros_per_row_in_span(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -775,7 +778,7 @@ void calculate_nonzeros_per_row_in_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -808,7 +811,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -868,7 +871,7 @@ void compute_submatrix_from_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -881,7 +884,7 @@ void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
     inv_nonsymm_permute(exec, perm, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -921,7 +924,7 @@ void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -959,7 +962,7 @@ void row_permute(std::shared_ptr<const OmpExecutor> exec, const IndexType* perm,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -998,7 +1001,7 @@ void inv_row_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -1011,7 +1014,7 @@ void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
     inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1055,7 +1058,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1096,7 +1099,7 @@ void row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1137,7 +1140,7 @@ void inv_row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1160,7 +1163,7 @@ void sort_by_column_index(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -1188,7 +1191,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -1214,7 +1217,8 @@ void extract_diagonal(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1241,7 +1245,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const OmpExecutor> exec,
     has_all_diags = l_has_all_diags;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -1270,7 +1274,7 @@ void add_scaled_identity(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index d1c0f2f8949..4ca5aa0c075 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -46,7 +46,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -60,7 +60,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -73,7 +73,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -100,7 +100,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -136,7 +137,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -168,7 +169,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -199,7 +200,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -232,7 +233,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -280,7 +281,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -326,7 +327,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -368,7 +369,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -398,7 +399,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -415,7 +416,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -431,7 +433,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -461,7 +464,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
diff --git a/omp/matrix/diagonal_kernels.cpp b/omp/matrix/diagonal_kernels.cpp
index 71363c7bc6e..c16e740dc45 100644
--- a/omp/matrix/diagonal_kernels.cpp
+++ b/omp/matrix/diagonal_kernels.cpp
@@ -43,7 +43,7 @@ void apply_to_csr(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index c35a3654b86..dc200ae0f93 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -185,7 +185,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -228,7 +228,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index d17d47a7467..14dcb1db77a 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -74,7 +74,8 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -118,7 +119,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -176,7 +177,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -209,7 +210,7 @@ void fill_in_dense(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -255,7 +256,7 @@ void convert_to_csr(const std::shared_ptr<const OmpExecutor> exec,
     row_ptrs[result->get_size()[0]] = source->get_num_stored_elements();
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -330,7 +331,7 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -343,7 +344,7 @@ void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -371,7 +372,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -426,7 +427,7 @@ void sort_by_column_index(const std::shared_ptr<const OmpExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), to_sort);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -463,7 +464,7 @@ void extract_diagonal(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp
index 7f8b16264ce..6306093b36d 100644
--- a/omp/matrix/sellp_kernels.cpp
+++ b/omp/matrix/sellp_kernels.cpp
@@ -155,7 +155,8 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -194,7 +195,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 35bb42c70a6..560ee6d4890 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -58,7 +58,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -95,7 +95,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -149,7 +149,7 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
     transpose_and_transform(exec, trans, orig);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -168,7 +168,7 @@ void sort_by_column_index(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -197,7 +197,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp
index f9bf9f5f33d..ebb8c1dfce6 100644
--- a/reference/matrix/coo_kernels.cpp
+++ b/reference/matrix/coo_kernels.cpp
@@ -38,7 +38,8 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -53,7 +54,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -73,7 +74,8 @@ void spmv2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -96,7 +98,7 @@ void advanced_spmv2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
@@ -113,7 +115,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
 
 
@@ -136,7 +138,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index a0607110b79..679844084d2 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -76,7 +76,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -94,8 +94,8 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
 
     auto row_ptrs = a->get_const_row_ptrs();
     auto col_idxs = a->get_const_col_idxs();
-    arithmetic_type valpha = alpha->at(0, 0);
-    arithmetic_type vbeta = beta->at(0, 0);
+    auto valpha = static_cast<arithmetic_type>(alpha->at(0, 0));
+    auto vbeta = static_cast<arithmetic_type>(beta->at(0, 0));
 
     const auto a_vals =
         acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
@@ -116,7 +116,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -240,7 +240,8 @@ void spgemm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -295,7 +296,7 @@ void advanced_spgemm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -345,7 +346,8 @@ void spgeam(std::shared_ptr<const ReferenceExecutor> exec,
         [](IndexType, IndexType) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -367,7 +369,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -414,7 +416,7 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -445,7 +447,7 @@ void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
@@ -515,7 +517,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -574,7 +576,8 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -586,7 +589,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -610,7 +613,7 @@ void calculate_nonzeros_per_row_in_span(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -657,7 +660,7 @@ void calculate_nonzeros_per_row_in_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -691,7 +694,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -749,7 +752,7 @@ void compute_submatrix_from_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -800,7 +803,7 @@ void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -813,7 +816,7 @@ void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     inv_nonsymm_permute(exec, perm, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -851,7 +854,7 @@ void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -886,7 +889,7 @@ void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -921,7 +924,7 @@ void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -951,7 +954,7 @@ void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     cp_row_ptrs[num_rows] = in_row_ptrs[num_rows];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
 
 
@@ -964,7 +967,7 @@ void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1006,7 +1009,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1043,7 +1046,7 @@ void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1080,7 +1083,7 @@ void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1111,7 +1114,7 @@ void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     cp_row_ptrs[num_rows] = in_row_ptrs[num_rows];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1133,7 +1136,7 @@ void sort_by_column_index(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -1157,7 +1160,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -1182,7 +1185,8 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1198,7 +1202,8 @@ void scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1214,7 +1219,8 @@ void inv_scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1240,7 +1246,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -1263,7 +1269,7 @@ void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 921a49998b7..561073c8c2d 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -56,7 +56,8 @@ void simple_apply(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -89,7 +90,7 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename InValueType, typename OutValueType>
@@ -105,7 +106,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(
     GKO_DECLARE_DENSE_COPY_KERNEL);
 
 
@@ -120,7 +121,7 @@ void fill(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -142,7 +143,8 @@ void scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -165,7 +167,7 @@ void inv_scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
 
 
@@ -189,7 +191,7 @@ void add_scaled(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
 
 
@@ -213,7 +215,7 @@ void sub_scaled(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
 
 
@@ -229,7 +231,8 @@ void add_scaled_diag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 
 
 template <typename ValueType>
@@ -244,7 +247,8 @@ void sub_scaled_diag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 
 
 template <typename ValueType>
@@ -263,7 +267,8 @@ void compute_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 
 
 template <typename ValueType>
@@ -275,7 +280,7 @@ void compute_dot_dispatch(std::shared_ptr<const ReferenceExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -295,7 +300,8 @@ void compute_conj_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 
 
 template <typename ValueType>
@@ -308,7 +314,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -331,7 +337,8 @@ void compute_norm2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 
 template <typename ValueType>
@@ -343,7 +350,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -363,7 +370,8 @@ void compute_norm1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 
 
 template <typename ValueType>
@@ -386,7 +394,8 @@ void compute_mean(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -400,7 +409,7 @@ void fill_in_matrix_data(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -420,7 +429,7 @@ void compute_squared_norm2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
 
 
@@ -435,7 +444,7 @@ void compute_sqrt(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
 
 
@@ -466,7 +475,7 @@ void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -498,7 +507,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -530,7 +539,7 @@ void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -577,7 +586,7 @@ void convert_to_fbcsr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -626,7 +635,7 @@ void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -662,7 +671,7 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -692,7 +701,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -713,7 +722,7 @@ void compute_max_nnz_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
 
 
@@ -745,7 +754,7 @@ void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
 
 
@@ -765,9 +774,9 @@ void count_nonzeros_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
 
 
@@ -797,7 +806,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -813,7 +822,8 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -828,7 +838,8 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -844,7 +855,7 @@ void symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
 
 
@@ -862,7 +873,7 @@ void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -879,7 +890,7 @@ void nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -896,7 +907,7 @@ void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -912,7 +923,7 @@ void row_gather(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
     GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
 
 
@@ -937,7 +948,7 @@ void advanced_row_gather(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
 
 
@@ -953,7 +964,7 @@ void col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
 
 
@@ -970,7 +981,7 @@ void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -987,7 +998,7 @@ void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
 
 
@@ -1006,7 +1017,7 @@ void symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1025,7 +1036,7 @@ void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1048,7 +1059,7 @@ void nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1071,7 +1082,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1089,7 +1100,7 @@ void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1107,7 +1118,7 @@ void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1125,7 +1136,7 @@ void col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1143,7 +1154,7 @@ void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1158,7 +1169,8 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType>
@@ -1173,7 +1185,8 @@ void inplace_absolute_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -1189,7 +1202,8 @@ void outplace_absolute_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -1205,7 +1219,7 @@ void make_complex(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
 
 
 template <typename ValueType>
@@ -1221,7 +1235,7 @@ void get_real(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
 
 
 template <typename ValueType>
@@ -1237,7 +1251,7 @@ void get_imag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -1257,7 +1271,7 @@ void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
     GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp
index 028b7685c2b..47d59728ab0 100644
--- a/reference/matrix/diagonal_kernels.cpp
+++ b/reference/matrix/diagonal_kernels.cpp
@@ -35,7 +35,8 @@ void apply_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -52,7 +53,7 @@ void right_apply_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
 
 
@@ -77,7 +78,7 @@ void apply_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
@@ -101,7 +102,7 @@ void right_apply_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
 
 
@@ -118,7 +119,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -141,7 +142,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     row_ptrs[size] = size;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -159,7 +160,8 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace diagonal
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 1fa37c4e250..ece95b38a39 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -68,7 +68,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -107,7 +107,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
 
     for (size_type j = 0; j < c->get_size()[1]; j++) {
         for (size_type row = 0; row < a->get_size()[0]; row++) {
-            arithmetic_type result = c->at(row, j);
+            auto result = static_cast<arithmetic_type>(c->at(row, j));
             result *= beta_val;
             for (size_type i = 0; i < num_stored_elements_per_row; i++) {
                 arithmetic_type val = a_vals(row + i * stride);
@@ -121,7 +121,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
@@ -161,7 +161,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -185,7 +185,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
 
 
@@ -203,7 +203,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ELL_COPY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -234,7 +235,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -258,7 +259,7 @@ void count_nonzeros_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -283,7 +284,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 4c170a973a7..048158136be 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -74,7 +74,8 @@ void spmv(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -118,7 +119,7 @@ void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -176,7 +177,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -212,7 +213,7 @@ void fill_in_dense(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -271,7 +272,7 @@ void convert_to_csr(const std::shared_ptr<const ReferenceExecutor>,
         static_cast<IndexType>(source->get_num_stored_elements());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -353,7 +354,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -366,7 +367,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -391,7 +392,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -448,7 +449,7 @@ void sort_by_column_index(const std::shared_ptr<const ReferenceExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), to_sort);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -487,7 +488,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp
index f2a06c321f2..5fe013297f3 100644
--- a/reference/matrix/hybrid_kernels.cpp
+++ b/reference/matrix/hybrid_kernels.cpp
@@ -86,7 +86,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -130,7 +130,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp
index b00e06f72f2..a352c0f777d 100644
--- a/reference/matrix/scaled_permutation_kernels.cpp
+++ b/reference/matrix/scaled_permutation_kernels.cpp
@@ -26,7 +26,7 @@ void invert(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
 
 
@@ -51,7 +51,7 @@ void compose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp
index 120194d6952..70cfc3cac3a 100644
--- a/reference/matrix/sellp_kernels.cpp
+++ b/reference/matrix/sellp_kernels.cpp
@@ -55,7 +55,8 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -96,7 +97,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
@@ -163,7 +164,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -198,7 +199,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
 
 
@@ -234,7 +235,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -280,7 +281,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     result_row_ptrs[num_rows] = cur_ptr;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
 
 
@@ -317,7 +318,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp
index c511a16a292..b773d3b9a50 100644
--- a/reference/matrix/sparsity_csr_kernels.cpp
+++ b/reference/matrix/sparsity_csr_kernels.cpp
@@ -55,7 +55,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -92,7 +92,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -113,7 +113,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -138,7 +138,7 @@ void diagonal_element_prefix_sum(
     prefix_sum[num_rows] = num_diag;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
 
 
@@ -173,7 +173,7 @@ void remove_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -227,7 +227,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
     transpose_and_transform(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -245,7 +245,7 @@ void sort_by_column_index(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -269,7 +269,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp
index aea578f4e7e..149aaa33256 100644
--- a/reference/test/base/combination.cpp
+++ b/reference/test/base/combination.cpp
@@ -34,7 +34,8 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
-TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Combination, CopiesOnSameExecutor)
@@ -114,7 +115,7 @@ TYPED_TEST(Combination, AppliesToMixedVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type = gko::next_precision<TypeParam>;
+    using value_type = gko::next_precision_with_half<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -156,7 +157,8 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type = gko::to_complex<gko::next_precision<TypeParam>>;
+    using value_type =
+        gko::to_complex<gko::next_precision_with_half<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -200,7 +202,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type = gko::next_precision<TypeParam>;
+    using value_type = gko::next_precision_with_half<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -248,7 +250,8 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
+    using MixedDense =
+        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmb = gko::Combination<TypeParam>::create(
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index fcca61a33d4..53efc588e1c 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -79,16 +79,17 @@ TYPED_TEST(Coo, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
     auto res = Coo::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx->convert_to(tmp);
     tmp->convert_to(res);
@@ -101,7 +102,7 @@ TYPED_TEST(Coo, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
@@ -214,7 +215,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
@@ -231,7 +232,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 2dd68bd9239..b84ac958f02 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -788,7 +788,7 @@ TYPED_TEST(Csr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -814,7 +814,7 @@ TYPED_TEST(Csr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -992,7 +992,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
@@ -1011,7 +1011,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 51b0aa148fd..a8d37ce5a09 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -75,8 +75,7 @@ class Dense : public ::testing::Test {
         return gko::test::generate_random_matrix<MtxType>(
             num_rows, num_cols,
             std::uniform_int_distribution<gko::size_type>(num_cols, num_cols),
-            std::normal_distribution<gko::remove_complex<value_type>>(0.0, 1.0),
-            rand_engine, exec);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, exec);
     }
 };
 
@@ -751,9 +750,11 @@ TYPED_TEST(Dense, ConvertsToPrecision)
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
     // If OtherT is more precise: 0, otherwise r
-    auto residual = r<OtherT>::value < r<T>::value
-                        ? gko::remove_complex<T>{0}
-                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
+    auto residual =
+        r<OtherT>::value < r<T>::value
+            ? gko::remove_complex<T>{0}
+            : gko::remove_complex<T>{
+                  static_cast<gko::remove_complex<T>>(r<OtherT>::value)};
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -771,9 +772,11 @@ TYPED_TEST(Dense, MovesToPrecision)
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
     // If OtherT is more precise: 0, otherwise r
-    auto residual = r<OtherT>::value < r<T>::value
-                        ? gko::remove_complex<T>{0}
-                        : static_cast<gko::remove_complex<T>>(r<OtherT>::value);
+    auto residual =
+        r<OtherT>::value < r<T>::value
+            ? gko::remove_complex<T>{0}
+            : gko::remove_complex<T>{
+                  static_cast<gko::remove_complex<T>>(r<OtherT>::value)};
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
@@ -3549,7 +3552,7 @@ class DenseComplex : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypes,
+TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index b0932c7eb66..e2ac67190d0 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -85,16 +85,17 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator);
 TYPED_TEST(Diagonal, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
     auto res = Diagonal::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->diag1->convert_to(tmp);
     tmp->convert_to(res);
@@ -106,7 +107,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision)
 TYPED_TEST(Diagonal, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
@@ -672,7 +673,7 @@ class DiagonalComplex : public ::testing::Test {
     using Diag = gko::matrix::Diagonal<value_type>;
 };
 
-TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypes,
+TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index e1eef9f087c..6214db82d1c 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -443,16 +443,17 @@ TYPED_TEST(Ell, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
     auto res = Ell::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -465,7 +466,7 @@ TYPED_TEST(Ell, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
@@ -734,7 +735,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
@@ -751,7 +752,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index f7c6d2197ef..665df4ace31 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -271,16 +271,17 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
     auto res = Fbcsr::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx->convert_to(tmp);
     tmp->convert_to(res);
@@ -293,7 +294,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
@@ -391,7 +392,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
@@ -410,7 +411,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index 754e599b8fe..87fd4c02811 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -233,16 +233,17 @@ TYPED_TEST(Hybrid, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
     auto res = Hybrid::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -255,7 +256,7 @@ TYPED_TEST(Hybrid, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
@@ -366,7 +367,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
@@ -383,7 +384,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp
index ba65705bf29..6d8d49f5662 100644
--- a/reference/test/matrix/scaled_permutation.cpp
+++ b/reference/test/matrix/scaled_permutation.cpp
@@ -145,8 +145,7 @@ TYPED_TEST(ScaledPermutation, CombineWithInverse)
     using index_type = typename TestFixture::index_type;
     const gko::size_type size = 20;
     auto rng = std::default_random_engine{3754};
-    auto dist = std::uniform_real_distribution<gko::remove_complex<value_type>>{
-        1.0, 2.0};
+    auto dist = std::uniform_real_distribution<>{1.0, 2.0};
     auto perm = gko::matrix::ScaledPermutation<value_type, index_type>::create(
         this->exec, size);
     std::iota(perm->get_permutation(), perm->get_permutation() + size, 0);
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index a39d8e16832..3208b8c42be 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -189,16 +189,17 @@ TYPED_TEST(Sellp, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
     auto res = Sellp::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx1->convert_to(tmp);
     tmp->convert_to(res);
@@ -211,16 +212,17 @@ TYPED_TEST(Sellp, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
     auto res = Sellp::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual =
-        r<OtherType>::value < r<ValueType>::value
-            ? gko::remove_complex<ValueType>{0}
-            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
+    auto residual = r<OtherType>::value < r<ValueType>::value
+                        ? gko::remove_complex<ValueType>{0}
+                        : gko::remove_complex<ValueType>{
+                              static_cast<gko::remove_complex<ValueType>>(
+                                  r<OtherType>::value)};
 
     this->mtx1->move_to(tmp);
     tmp->move_to(res);
@@ -308,7 +310,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
@@ -327,7 +329,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = typename gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp
index 8cff04c28a0..4ff8e1fc36a 100644
--- a/test/matrix/fbcsr_kernels.cpp
+++ b/test/matrix/fbcsr_kernels.cpp
@@ -37,7 +37,7 @@ class Fbcsr : public CommonTestFixture {
 
     std::unique_ptr<const Mtx> rsorted;
 
-    std::normal_distribution<gko::remove_complex<T>> distb;
+    std::normal_distribution<> distb;
     std::default_random_engine engine;
 
     value_type get_random_value()
@@ -123,6 +123,9 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted)
     using Mtx = typename TestFixture::Mtx;
     using Dense = typename TestFixture::Dense;
     using value_type = typename Mtx::value_type;
+    if (this->exec->get_master() != this->exec) {
+        SKIP_IF_HALF(value_type);
+    }
     auto drand = gko::clone(this->exec, this->rsorted);
     auto x =
         Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1));
@@ -145,6 +148,9 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted)
     using Mtx = typename TestFixture::Mtx;
     using Dense = typename TestFixture::Dense;
     using value_type = typename Mtx::value_type;
+    if (this->exec->get_master() != this->exec) {
+        SKIP_IF_HALF(value_type);
+    }
     auto drand = gko::clone(this->exec, this->rsorted);
     auto x =
         Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3));
@@ -168,6 +174,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted)
     using Dense = typename TestFixture::Dense;
     using value_type = typename TestFixture::value_type;
     using real_type = typename TestFixture::real_type;
+    if (this->exec->get_master() != this->exec) {
+        SKIP_IF_HALF(value_type);
+    }
     auto drand = gko::clone(this->exec, this->rsorted);
     auto x =
         Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1));
@@ -198,6 +207,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted)
     using Dense = typename TestFixture::Dense;
     using value_type = typename TestFixture::value_type;
     using real_type = typename TestFixture::real_type;
+    if (this->exec->get_master() != this->exec) {
+        SKIP_IF_HALF(value_type);
+    }
     auto drand = gko::clone(this->exec, this->rsorted);
     auto x =
         Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3));
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index eea1a67ef5f..0b06f76df85 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -586,10 +586,7 @@ class Matrix : public CommonTestFixture {
     template <typename ValueType, typename IndexType>
     gko::matrix_data<ValueType, IndexType> gen_dense_data(gko::dim<2> size)
     {
-        return {
-            size,
-            std::normal_distribution<gko::remove_complex<ValueType>>(0.0, 1.0),
-            rand_engine};
+        return {size, std::normal_distribution<>(0.0, 1.0), rand_engine};
     }
 
     template <typename VecType = Vec>
@@ -609,10 +606,7 @@ class Matrix : public CommonTestFixture {
         return {gko::initialize<VecType>(
                     {gko::test::detail::get_rand_value<
                         typename VecType::value_type>(
-                        std::normal_distribution<
-                            gko::remove_complex<typename VecType::value_type>>(
-                            0.0, 1.0),
-                        rand_engine)},
+                        std::normal_distribution<>(0.0, 1.0), rand_engine)},
                     ref),
                 exec};
     }

From d289c9d5c3355c94953fed94ee9d036789c4b383 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 11:59:30 +0200
Subject: [PATCH 362/448] device_matrix_data and mtx_io

---
 .../base/device_matrix_data_kernels.cpp       | 16 +++++++--
 .../base/device_matrix_data_kernels.cpp       |  4 +--
 core/base/device_matrix_data.cpp              |  3 +-
 core/base/mtx_io.cpp                          | 35 ++++++++++++++-----
 core/device_hooks/common_kernels.inc.cpp      | 12 ++++---
 core/test/base/mtx_io.cpp                     | 20 +++++++++--
 dpcpp/base/device_matrix_data_kernels.dp.cpp  |  4 +--
 omp/base/device_matrix_data_kernels.cpp       |  6 ++--
 reference/base/device_matrix_data_kernels.cpp | 10 +++---
 test/base/device_matrix_data_kernels.cpp      |  7 ++--
 10 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp
index c5742653a93..ebfed84dba2 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.cpp
+++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp
@@ -12,6 +12,7 @@
 #include <thrust/sort.h>
 #include <thrust/tuple.h>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 
@@ -22,6 +23,15 @@ namespace GKO_DEVICE_NAMESPACE {
 namespace components {
 
 
+// The `!=` operator of __half is only available in __device__ code. Even
+// though gko::is_nonzero is constexpr, using it here is reported as calling a
+// __device__ function from __host__, so we provide a __device__-only version.
+template <typename T>
+GKO_INLINE __device__ constexpr bool is_nonzero(T value)
+{
+    return value != zero<T>();
+}
+
 template <typename ValueType, typename IndexType>
 void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
@@ -58,7 +68,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -102,7 +112,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -117,7 +127,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                         it + data.get_num_stored_elements(), vals);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/common/unified/base/device_matrix_data_kernels.cpp b/common/unified/base/device_matrix_data_kernels.cpp
index d801b47fcd5..b72c6bf3476 100644
--- a/common/unified/base/device_matrix_data_kernels.cpp
+++ b/common/unified/base/device_matrix_data_kernels.cpp
@@ -30,7 +30,7 @@ void soa_to_aos(std::shared_ptr<const DefaultExecutor> exec,
         in.get_const_col_idxs(), in.get_const_values(), out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 
@@ -50,7 +50,7 @@ void aos_to_soa(std::shared_ptr<const DefaultExecutor> exec,
         out.get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
 
 
diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index 4c71fffe275..cb9d332f5ab 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -157,7 +157,8 @@ device_matrix_data<ValueType, IndexType>::empty_out()
 
 #define GKO_DECLARE_DEVICE_MATRIX_DATA(ValueType, IndexType) \
     class device_matrix_data<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DEVICE_MATRIX_DATA);
 
 
 }  // namespace gko
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index 33c3b07d487..0897349d08c 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -14,6 +14,7 @@
 #include <type_traits>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 
@@ -757,19 +758,28 @@ static constexpr uint64 binary_format_magic()
 {
     constexpr auto is_int = std::is_same<IndexType, int32>::value;
     constexpr auto is_long = std::is_same<IndexType, int64>::value;
+    constexpr auto is_half = std::is_same<ValueType, half>::value;
     constexpr auto is_double = std::is_same<ValueType, double>::value;
     constexpr auto is_float = std::is_same<ValueType, float>::value;
     constexpr auto is_complex_double =
         std::is_same<ValueType, std::complex<double>>::value;
     constexpr auto is_complex_float =
         std::is_same<ValueType, std::complex<float>>::value;
+    constexpr auto is_complex_half =
+        std::is_same<ValueType, std::complex<half>>::value;
     static_assert(is_int || is_long, "invalid storage index type");
-    static_assert(
-        is_double || is_float || is_complex_double || is_complex_float,
-        "invalid storage value type");
+    static_assert(is_half || is_complex_half || is_double || is_float ||
+                      is_complex_double || is_complex_float,
+                  "invalid storage value type");
     constexpr auto index_bit = is_int ? 'I' : 'L';
     constexpr auto value_bit =
-        is_double ? 'D' : (is_float ? 'S' : (is_complex_double ? 'Z' : 'C'));
+        is_double
+            ? 'D'
+            : (is_float
+                   ? 'S'
+                   : (is_complex_double
+                          ? 'Z'
+                          : (is_complex_float ? 'C' : (is_half ? 'H' : 'X'))));
     constexpr uint64 shift = 256;
     constexpr uint64 type_bits = index_bit * shift + value_bit;
     return 'G' +
@@ -879,12 +889,16 @@ matrix_data<ValueType, IndexType> read_binary_raw(std::istream& is)
     }
     DECLARE_OVERLOAD(double, int32)
     DECLARE_OVERLOAD(float, int32)
+    DECLARE_OVERLOAD(half, int32)
     DECLARE_OVERLOAD(std::complex<double>, int32)
     DECLARE_OVERLOAD(std::complex<float>, int32)
+    DECLARE_OVERLOAD(std::complex<half>, int32)
     DECLARE_OVERLOAD(double, int64)
     DECLARE_OVERLOAD(float, int64)
+    DECLARE_OVERLOAD(half, int64)
     DECLARE_OVERLOAD(std::complex<double>, int64)
     DECLARE_OVERLOAD(std::complex<float>, int64)
+    DECLARE_OVERLOAD(std::complex<half>, int64)
 #undef DECLARE_OVERLOAD
     else
     {
@@ -970,11 +984,14 @@ void write_raw(std::ostream& os, const matrix_data<ValueType, IndexType>& data,
                           const matrix_data<ValueType, IndexType>& data)
 #define GKO_DECLARE_READ_GENERIC_RAW(ValueType, IndexType) \
     matrix_data<ValueType, IndexType> read_generic_raw(std::istream& is)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_BINARY_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_BINARY_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_GENERIC_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_READ_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_WRITE_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_READ_BINARY_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_WRITE_BINARY_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_READ_GENERIC_RAW);
 
 
 }  // namespace gko
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 78b80ec2859..439cda481a2 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -251,14 +251,16 @@ GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
 GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
 GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
 GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 template <typename IndexType, typename RowPtrType>
 GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType)
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index 8ac1ced0e50..14d44335b85 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -7,6 +7,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
@@ -570,6 +571,12 @@ TEST(MtxReader, ReadsBinary)
     test_read(gko::matrix_data<double, gko::int64>{});
     test_read(gko::matrix_data<std::complex<float>, gko::int64>{});
     test_read(gko::matrix_data<std::complex<double>, gko::int64>{});
+#if GINKGO_ENABLE_HALF
+    test_read(gko::matrix_data<gko::half, gko::int32>{});
+    test_read(gko::matrix_data<std::complex<gko::half>, gko::int32>{});
+    test_read(gko::matrix_data<gko::half, gko::int64>{});
+    test_read(gko::matrix_data<std::complex<gko::half>, gko::int64>{});
+#endif
 }
 
 
@@ -625,6 +632,12 @@ TEST(MtxReader, ReadsComplexBinary)
     test_read_fail(gko::matrix_data<double, gko::int64>{});
     test_read(gko::matrix_data<std::complex<float>, gko::int64>{});
     test_read(gko::matrix_data<std::complex<double>, gko::int64>{});
+#if GINKGO_ENABLE_HALF
+    test_read_fail(gko::matrix_data<gko::half, gko::int32>{});
+    test_read(gko::matrix_data<std::complex<gko::half>, gko::int32>{});
+    test_read_fail(gko::matrix_data<gko::half, gko::int64>{});
+    test_read(gko::matrix_data<std::complex<gko::half>, gko::int64>{});
+#endif
 }
 
 
@@ -960,7 +973,7 @@ class RealDummyLinOpTest : public ::testing::Test {
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
 };
 
-TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypes,
+TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -1165,7 +1178,7 @@ class DenseTest : public ::testing::Test {
     using index_type = typename std::tuple_element<1, ValueIndexType>::type;
 };
 
-TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypes,
+TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -1209,7 +1222,8 @@ class ComplexDummyLinOpTest : public ::testing::Test {
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
 };
 
-TYPED_TEST_SUITE(ComplexDummyLinOpTest, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(ComplexDummyLinOpTest,
+                 gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index f39615613fe..a5f58831a27 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -49,7 +49,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -112,7 +112,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/omp/base/device_matrix_data_kernels.cpp b/omp/base/device_matrix_data_kernels.cpp
index bce89e2f409..cb2dabd3010 100644
--- a/omp/base/device_matrix_data_kernels.cpp
+++ b/omp/base/device_matrix_data_kernels.cpp
@@ -69,7 +69,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
     aos_to_soa(exec, tmp, data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/reference/base/device_matrix_data_kernels.cpp b/reference/base/device_matrix_data_kernels.cpp
index f9a23b35e69..78a2e25a712 100644
--- a/reference/base/device_matrix_data_kernels.cpp
+++ b/reference/base/device_matrix_data_kernels.cpp
@@ -29,7 +29,7 @@ void soa_to_aos(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 
@@ -46,7 +46,7 @@ void aos_to_soa(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
 
 
@@ -78,7 +78,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
     aos_to_soa(exec, tmp, data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index 6ddc926b76c..d2543ae7cbb 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -35,8 +35,7 @@ class DeviceMatrixData : public CommonTestFixture {
             0, host_data.size[0] - 1);
         std::uniform_int_distribution<index_type> col_distr(
             0, host_data.size[1] - 1);
-        std::uniform_real_distribution<gko::remove_complex<value_type>>
-            val_distr(1.0, 2.0);
+        std::uniform_real_distribution<> val_distr(1.0, 2.0);
         // add random entries
         for (int i = 0; i < 1000; i++) {
             host_data.nonzeros.emplace_back(
@@ -85,7 +84,7 @@ class DeviceMatrixData : public CommonTestFixture {
     gko::matrix_data<value_type, index_type> deduplicated_data;
 };
 
-TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -339,7 +338,7 @@ TYPED_TEST(DeviceMatrixData, SumsDuplicates)
     arrays.values.set_executor(this->exec->get_master());
     for (int i = 0; i < arrays.values.get_size(); i++) {
         max_error = std::max<double>(
-            max_error, std::abs(arrays.values.get_const_data()[i] -
+            max_error, gko::abs(arrays.values.get_const_data()[i] -
                                 ref_arrays.values.get_const_data()[i]));
     }
     // when Hip with GNU < 7, it will give a little difference.

From b651cc436d38e66e8719e75ab0f3633e9045536e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 15:39:04 +0200
Subject: [PATCH 363/448] components such as array/iterator/segmented_array
 test with half

---
 core/test/base/array.cpp                             |  3 ++-
 core/test/base/iterator_factory.cpp                  |  4 ++--
 core/test/base/segmented_array.cpp                   |  3 ++-
 core/test/components/addressable_pq.cpp              |  4 ++--
 cuda/test/base/array.cpp                             |  3 ++-
 reference/test/base/array.cpp                        |  3 ++-
 reference/test/components/absolute_array_kernels.cpp |  3 ++-
 reference/test/components/fill_array_kernels.cpp     |  2 +-
 reference/test/components/reduce_array_kernels.cpp   |  2 +-
 test/components/fill_array_kernels.cpp               |  2 +-
 test/components/reduce_array_kernels.cpp             | 11 ++++++++---
 11 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index f7e03855d06..23515d70fc4 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -40,7 +40,8 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor)
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index bbc3bbfd04f..3685242f78a 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -78,7 +78,7 @@ class ZipIterator : public ::testing::Test {
     const std::vector<value_type> ordered_value;
 };
 
-TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -366,7 +366,7 @@ class PermuteIterator : public ::testing::Test {
     using value_type = ValueType;
 };
 
-TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypes,
+TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp
index 2741990036f..31444d71d18 100644
--- a/core/test/base/segmented_array.cpp
+++ b/core/test/base/segmented_array.cpp
@@ -27,7 +27,8 @@ class SegmentedArray : public ::testing::Test {
     std::shared_ptr<gko::Executor> exec = gko::ReferenceExecutor::create();
 };
 
-TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(SegmentedArray, CanConstructFromExecutor)
diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp
index 6301cd44fb4..87fcb289a77 100644
--- a/core/test/components/addressable_pq.cpp
+++ b/core/test/components/addressable_pq.cpp
@@ -91,8 +91,8 @@ class AddressablePriorityQueue : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(AddressablePriorityQueue, gko::test::RealValueIndexTypes,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(AddressablePriorityQueue,
+                 gko::test::RealValueIndexTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(AddressablePriorityQueue, InitializesCorrectly)
diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp
index db7d4c54536..7294cbff29f 100644
--- a/cuda/test/base/array.cpp
+++ b/cuda/test/base/array.cpp
@@ -32,7 +32,8 @@ class Array : public CudaTestFixture {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor)
diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp
index 666ab13063c..2c69f1afc8e 100644
--- a/reference/test/base/array.cpp
+++ b/reference/test/base/array.cpp
@@ -28,7 +28,8 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeFilledWithValue)
diff --git a/reference/test/components/absolute_array_kernels.cpp b/reference/test/components/absolute_array_kernels.cpp
index c192d540032..5ad75440c88 100644
--- a/reference/test/components/absolute_array_kernels.cpp
+++ b/reference/test/components/absolute_array_kernels.cpp
@@ -43,7 +43,8 @@ class AbsoluteArray : public ::testing::Test {
     gko::array<value_type> vals;
 };
 
-TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(AbsoluteArray, InplaceEqualsExpected)
diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp
index 3c7520c6847..0a9239ce1bd 100644
--- a/reference/test/components/fill_array_kernels.cpp
+++ b/reference/test/components/fill_array_kernels.cpp
@@ -40,7 +40,7 @@ class FillArray : public ::testing::Test {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp
index 8286817c853..c8839bc178d 100644
--- a/reference/test/components/reduce_array_kernels.cpp
+++ b/reference/test/components/reduce_array_kernels.cpp
@@ -31,7 +31,7 @@ class ReduceArray : public ::testing::Test {
     gko::array<value_type> vals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 3d494b3f5f0..4237a75304a 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -36,7 +36,7 @@ class FillArray : public CommonTestFixture {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index b7407801a32..7940feec661 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -20,14 +20,19 @@ template <typename T>
 class ReduceArray : public CommonTestFixture {
 protected:
     using value_type = T;
+    static constexpr bool using_half =
+        std::is_same_v<gko::remove_complex<value_type>, gko::half>;
+
+    // Due to the limited accuracy of half, the summation order can easily
+    // affect the result, so the half case uses a smaller size and value.
     ReduceArray()
-        : total_size(6355),
+        : total_size(using_half ? 1024 : 6355),
           out{ref, I<T>{2}},
           dout{exec, out},
           vals{ref, total_size},
           dvals{exec}
     {
-        std::fill_n(vals.get_data(), total_size, 3);
+        std::fill_n(vals.get_data(), total_size, using_half ? 1 : 3);
         dvals = vals;
     }
 
@@ -38,7 +43,7 @@ class ReduceArray : public CommonTestFixture {
     gko::array<value_type> dvals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf,
                  TypenameNameGenerator);
 
 

From c03fc10d68a0fb956a48ae2387798b226c77b2ef Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Oct 2024 15:39:41 +0200
Subject: [PATCH 364/448] matrix test with half

---
 core/test/matrix/coo.cpp                      |  3 +-
 core/test/matrix/coo_builder.cpp              |  2 +-
 core/test/matrix/csr.cpp                      |  3 +-
 core/test/matrix/csr_builder.cpp              |  2 +-
 core/test/matrix/dense.cpp                    |  2 +-
 core/test/matrix/diagonal.cpp                 |  3 +-
 core/test/matrix/ell.cpp                      |  3 +-
 core/test/matrix/fbcsr.cpp                    |  7 ++-
 core/test/matrix/fbcsr_builder.cpp            |  2 +-
 core/test/matrix/hybrid.cpp                   |  3 +-
 core/test/matrix/identity.cpp                 |  6 +-
 core/test/matrix/permutation.cpp              |  2 +-
 core/test/matrix/row_gatherer.cpp             |  2 +-
 core/test/matrix/sellp.cpp                    |  3 +-
 core/test/matrix/sparsity_csr.cpp             |  2 +-
 core/test/utils/fb_matrix_generator.hpp       | 13 ++---
 core/test/utils/value_generator.hpp           |  6 +-
 hip/test/matrix/fbcsr_kernels.cpp             | 56 +++++++++++++------
 reference/test/matrix/coo_kernels.cpp         | 33 +++++------
 reference/test/matrix/csr_kernels.cpp         | 48 ++++++++--------
 reference/test/matrix/dense_kernels.cpp       | 19 ++++---
 reference/test/matrix/diagonal_kernels.cpp    | 14 +++--
 reference/test/matrix/ell_kernels.cpp         | 46 +++++++--------
 reference/test/matrix/fbcsr_kernels.cpp       | 15 ++---
 reference/test/matrix/hybrid_kernels.cpp      | 20 ++++---
 reference/test/matrix/identity.cpp            |  6 +-
 reference/test/matrix/permutation.cpp         |  2 +-
 reference/test/matrix/scaled_permutation.cpp  |  2 +-
 reference/test/matrix/sellp_kernels.cpp       | 19 ++++---
 reference/test/matrix/sparsity_csr.cpp        |  2 +-
 .../test/matrix/sparsity_csr_kernels.cpp      | 12 ++--
 test/matrix/fbcsr_kernels.cpp                 | 23 ++++++--
 32 files changed, 219 insertions(+), 162 deletions(-)

diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp
index ffb8d5aee9f..56735e792d5 100644
--- a/core/test/matrix/coo.cpp
+++ b/core/test/matrix/coo.cpp
@@ -77,7 +77,8 @@ class Coo : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Coo, KnowsItsSize)
diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp
index 9bfae5cf3af..b1b22c5848a 100644
--- a/core/test/matrix/coo_builder.cpp
+++ b/core/test/matrix/coo_builder.cpp
@@ -32,7 +32,7 @@ class CooBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index 4bbdc63851a..f199de423e8 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -82,7 +82,8 @@ class Csr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Csr, KnowsItsSize)
diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp
index 24cbe4718c5..2accb57770c 100644
--- a/core/test/matrix/csr_builder.cpp
+++ b/core/test/matrix/csr_builder.cpp
@@ -33,7 +33,7 @@ class CsrBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp
index e7158a15aed..f1a673840ea 100644
--- a/core/test/matrix/dense.cpp
+++ b/core/test/matrix/dense.cpp
@@ -48,7 +48,7 @@ class Dense : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, CanBeEmpty)
diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp
index de03a9350bb..7e598d67a5e 100644
--- a/core/test/matrix/diagonal.cpp
+++ b/core/test/matrix/diagonal.cpp
@@ -47,7 +47,8 @@ class Diagonal : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Diagonal, KnowsItsSize)
diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp
index bcc2b591a50..93fc73dde18 100644
--- a/core/test/matrix/ell.cpp
+++ b/core/test/matrix/ell.cpp
@@ -79,7 +79,8 @@ class Ell : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ell, KnowsItsSize)
diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp
index 3d3d4ee738d..fd024532a14 100644
--- a/core/test/matrix/fbcsr.cpp
+++ b/core/test/matrix/fbcsr.cpp
@@ -131,7 +131,7 @@ class FbcsrSample : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -183,7 +183,7 @@ template <typename ValueIndexType>
 class FbcsrSampleComplex : public FbcsrSample<ValueIndexType> {};
 
 
-TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -282,7 +282,8 @@ class Fbcsr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Fbcsr, GetNumBlocksCorrectlyThrows)
diff --git a/core/test/matrix/fbcsr_builder.cpp b/core/test/matrix/fbcsr_builder.cpp
index d91a0c7b70a..241c7ccc6eb 100644
--- a/core/test/matrix/fbcsr_builder.cpp
+++ b/core/test/matrix/fbcsr_builder.cpp
@@ -33,7 +33,7 @@ class FbcsrBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp
index d1a69312755..6b1e2a4a747 100644
--- a/core/test/matrix/hybrid.cpp
+++ b/core/test/matrix/hybrid.cpp
@@ -96,7 +96,8 @@ class Hybrid : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Hybrid, KnowsItsSize)
diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp
index bcf9c036992..80defae4441 100644
--- a/core/test/matrix/identity.cpp
+++ b/core/test/matrix/identity.cpp
@@ -31,7 +31,8 @@ class Identity : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, CanBeEmpty)
@@ -81,7 +82,8 @@ class IdentityFactory : public ::testing::Test {
     using value_type = T;
 };
 
-TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(IdentityFactory, CanGenerateIdentityMatrix)
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
index edb1532696b..fcd5aad789c 100644
--- a/core/test/matrix/permutation.cpp
+++ b/core/test/matrix/permutation.cpp
@@ -52,7 +52,7 @@ class Permutation : public ::testing::Test {
     std::unique_ptr<gko::matrix::Permutation<index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp
index 801f639c206..b808828cc08 100644
--- a/core/test/matrix/row_gatherer.cpp
+++ b/core/test/matrix/row_gatherer.cpp
@@ -65,7 +65,7 @@ class RowGatherer : public ::testing::Test {
     std::unique_ptr<OutVec> out;
 };
 
-TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexType,
+TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexTypeWithHalf,
                  TupleTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp
index 123d7bae773..a79fcf2bbd3 100644
--- a/core/test/matrix/sellp.cpp
+++ b/core/test/matrix/sellp.cpp
@@ -107,7 +107,8 @@ class Sellp : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sellp, KnowsItsSize)
diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp
index e929f960f1e..67f8237adb6 100644
--- a/core/test/matrix/sparsity_csr.cpp
+++ b/core/test/matrix/sparsity_csr.cpp
@@ -74,7 +74,7 @@ class SparsityCsr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp
index 034dd95fce1..786f836e10a 100644
--- a/core/test/utils/fb_matrix_generator.hpp
+++ b/core/test/utils/fb_matrix_generator.hpp
@@ -131,16 +131,15 @@ std::unique_ptr<matrix::Fbcsr<ValueType, IndexType>> generate_fbcsr_from_csr(
     const IndexType* const row_ptrs = fmtx->get_const_row_ptrs();
     const IndexType* const col_idxs = fmtx->get_const_col_idxs();
     ValueType* const vals = fmtx->get_values();
-    std::uniform_real_distribution<gko::remove_complex<ValueType>>
-        off_diag_dist(-1.0, 1.0);
+    std::uniform_real_distribution<> off_diag_dist(-1.0, 1.0);
 
     for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) {
         if (row_diag_dominant) {
             const IndexType nrownz =
                 (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size;
 
-            std::uniform_real_distribution<gko::remove_complex<ValueType>>
-                diag_dist(1.01 * nrownz, 2 * nrownz);
+            std::uniform_real_distribution<> diag_dist(1.01 * nrownz,
+                                                       2 * nrownz);
 
             for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1];
                  ibz++) {
@@ -205,13 +204,11 @@ std::unique_ptr<matrix::Fbcsr<ValueType, IndexType>> generate_random_fbcsr(
                   matrix::Csr<ValueType, IndexType>>(
                   nbrows, nbcols,
                   std::uniform_int_distribution<IndexType>(0, nbcols - 1),
-                  std::normal_distribution<real_type>(0.0, 1.0),
-                  std::move(engine), ref)
+                  std::normal_distribution<>(0.0, 1.0), std::move(engine), ref)
             : generate_random_matrix<matrix::Csr<ValueType, IndexType>>(
                   nbrows, nbcols,
                   std::uniform_int_distribution<IndexType>(0, nbcols - 1),
-                  std::normal_distribution<real_type>(0.0, 1.0),
-                  std::move(engine), ref);
+                  std::normal_distribution<>(0.0, 1.0), std::move(engine), ref);
     if (unsort && rand_csr_ref->is_sorted_by_column_index()) {
         unsort_matrix(rand_csr_ref, engine);
     }
diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp
index f18f2170c96..19e01b33356 100644
--- a/core/test/utils/value_generator.hpp
+++ b/core/test/utils/value_generator.hpp
@@ -33,7 +33,7 @@ template <typename ValueType, typename ValueDistribution, typename Engine>
 typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(ValueDistribution&& value_dist, Engine&& gen)
 {
-    return value_dist(gen);
+    return static_cast<ValueType>(value_dist(gen));
 }
 
 /**
@@ -45,7 +45,9 @@ template <typename ValueType, typename ValueDistribution, typename Engine>
 typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(ValueDistribution&& value_dist, Engine&& gen)
 {
-    return ValueType(value_dist(gen), value_dist(gen));
+    using real_type = remove_complex<ValueType>;
+    return ValueType(static_cast<real_type>(value_dist(gen)),
+                     static_cast<real_type>(value_dist(gen)));
 }
 
 
diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp
index 0b4b16086ca..536ff3dc01c 100644
--- a/hip/test/matrix/fbcsr_kernels.cpp
+++ b/hip/test/matrix/fbcsr_kernels.cpp
@@ -8,6 +8,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/fbcsr.hpp>
 
@@ -41,7 +42,7 @@ class Fbcsr : public HipTestFixture {
 
     std::unique_ptr<const Mtx> rsorted_ref;
 
-    std::normal_distribution<gko::remove_complex<T>> distb;
+    std::normal_distribution<> distb;
     std::default_random_engine engine;
 
     value_type get_random_value()
@@ -60,7 +61,8 @@ class Fbcsr : public HipTestFixture {
     }
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice)
@@ -145,11 +147,15 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted)
         this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1));
     auto prod_hip = Dense::create(this->exec, prod_ref->get_size());
 
-    rand_hip->apply(x_hip, prod_hip);
-    this->rsorted_ref->apply(x_ref, prod_ref);
+    if (std::is_same<value_type, gko::half>::value) {
+        ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented);
+    } else {
+        rand_hip->apply(x_hip, prod_hip);
+        this->rsorted_ref->apply(x_ref, prod_ref);
 
-    const double tol = r<value_type>::value;
-    GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+        const double tol = r<value_type>::value;
+        GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+    }
 }
 
 
@@ -169,11 +175,15 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted)
         this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3));
     auto prod_hip = Dense::create(this->exec, prod_ref->get_size());
 
-    rand_hip->apply(x_hip, prod_hip);
-    this->rsorted_ref->apply(x_ref, prod_ref);
+    if (std::is_same<value_type, gko::half>::value) {
+        ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented);
+    } else {
+        rand_hip->apply(x_hip, prod_hip);
+        this->rsorted_ref->apply(x_ref, prod_ref);
 
-    const double tol = r<value_type>::value;
-    GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+        const double tol = r<value_type>::value;
+        GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+    }
 }
 
 
@@ -205,11 +215,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted)
     auto beta = Dense::create(this->exec);
     beta->copy_from(beta_ref);
 
-    rand_hip->apply(alpha, x_hip, beta, prod_hip);
-    this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref);
+    if (std::is_same<value_type, gko::half>::value) {
+        ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip),
+                     gko::NotImplemented);
+    } else {
+        rand_hip->apply(alpha, x_hip, beta, prod_hip);
+        this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref);
 
-    const double tol = r<value_type>::value;
-    GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+        const double tol = r<value_type>::value;
+        GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+    }
 }
 
 
@@ -241,11 +256,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted)
     auto beta = Dense::create(this->exec);
     beta->copy_from(beta_ref);
 
-    rand_hip->apply(alpha, x_hip, beta, prod_hip);
-    this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref);
+    if (std::is_same<value_type, gko::half>::value) {
+        ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip),
+                     gko::NotImplemented);
+    } else {
+        rand_hip->apply(alpha, x_hip, beta, prod_hip);
+        this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref);
 
-    const double tol = r<value_type>::value;
-    GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+        const double tol = r<value_type>::value;
+        GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol);
+    }
 }
 
 
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index 53efc588e1c..6ffea5d0e7d 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -32,7 +32,8 @@ class Coo : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Mtx = gko::matrix::Coo<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedVec =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
 
     Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec))
     {
@@ -72,24 +73,24 @@ class Coo : public ::testing::Test {
     std::unique_ptr<Mtx> uns_mtx;
 };
 
-TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Coo, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
     auto res = Coo::create(this->exec);
     // If OtherType is more precise: 0, otherwise r
-    auto residual = r<OtherType>::value < r<ValueType>::value
-                        ? gko::remove_complex<ValueType>{0}
-                        : gko::remove_complex<ValueType>{
-                              static_cast<gko::remove_complex<ValueType>>(
-                                  r<OtherType>::value)};
+    auto residual =
+        r<OtherType>::value < r<ValueType>::value
+            ? gko::remove_complex<ValueType>{0}
+            : static_cast<gko::remove_complex<ValueType>>(r<OtherType>::value);
 
     this->mtx->convert_to(tmp);
     tmp->convert_to(res);
@@ -102,7 +103,7 @@ TYPED_TEST(Coo, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
@@ -215,7 +216,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
@@ -232,7 +233,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
@@ -703,7 +704,7 @@ TYPED_TEST(Coo, AppliesToComplex)
 TYPED_TEST(Coo, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -759,7 +760,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex)
 TYPED_TEST(Coo, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -817,7 +818,7 @@ TYPED_TEST(Coo, ApplyAddsToComplex)
 TYPED_TEST(Coo, ApplyAddsToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedVec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -874,7 +875,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex)
 TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -911,7 +912,7 @@ class CooComplex : public ::testing::Test {
     using Mtx = gko::matrix::Coo<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index b84ac958f02..b417eb93f52 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -46,7 +46,8 @@ class Csr : public ::testing::Test {
     using Ell = gko::matrix::Ell<value_type, index_type>;
     using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedVec =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
     using Perm = gko::matrix::Permutation<index_type>;
     using ScaledPerm = gko::matrix::ScaledPermutation<value_type, index_type>;
 
@@ -347,7 +348,8 @@ class Csr : public ::testing::Test {
     index_type invalid_index = gko::invalid_index<index_type>();
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Csr, AppliesToDenseVector)
@@ -368,7 +370,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = typename gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -383,7 +385,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
@@ -399,9 +401,9 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
-    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
+    using Vec2 = gko::matrix::Dense<gko::next_precision_with_half<T>>;
     auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
@@ -432,7 +434,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     // clang-format off
     auto x = gko::initialize<Vec>(
@@ -456,7 +458,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -481,7 +483,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -522,7 +524,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -539,7 +541,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -557,7 +559,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -595,7 +597,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -619,7 +621,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -639,7 +641,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -788,7 +790,7 @@ TYPED_TEST(Csr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -814,7 +816,7 @@ TYPED_TEST(Csr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -992,7 +994,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
@@ -1011,7 +1013,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
@@ -2048,7 +2050,7 @@ TYPED_TEST(Csr, AppliesToComplex)
 TYPED_TEST(Csr, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -2104,7 +2106,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex)
 TYPED_TEST(Csr, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -2245,7 +2247,7 @@ class CsrComplex : public ::testing::Test {
     using Mtx = gko::matrix::Csr<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -2590,7 +2592,7 @@ class CsrLookup : public ::testing::Test {
     index_type invalid_index = gko::invalid_index<index_type>();
 };
 
-TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets)
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index a8d37ce5a09..3854cd56dff 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -37,7 +37,8 @@ class Dense : public ::testing::Test {
 protected:
     using value_type = T;
     using Mtx = gko::matrix::Dense<value_type>;
-    using MixedMtx = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedMtx =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
     using ComplexMtx = gko::to_complex<Mtx>;
     using RealMtx = gko::remove_complex<Mtx>;
     Dense()
@@ -80,7 +81,7 @@ class Dense : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, CopyRespectsStride)
@@ -745,7 +746,7 @@ TYPED_TEST(Dense, ConvertsToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -767,7 +768,7 @@ TYPED_TEST(Dense, MovesToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -1066,7 +1067,7 @@ TYPED_TEST(Dense, AppliesToComplex)
 TYPED_TEST(Dense, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -1120,7 +1121,7 @@ TYPED_TEST(Dense, AdvancedAppliesToComplex)
 TYPED_TEST(Dense, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -1359,7 +1360,7 @@ class DenseWithIndexType
     std::unique_ptr<ScaledPermutation> scale_perm0;
 };
 
-TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -2013,7 +2014,7 @@ TYPED_TEST(Dense, ConvertsEmptyToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto empty = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -2028,7 +2029,7 @@ TYPED_TEST(Dense, MovesEmptyToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto empty = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index e2ac67190d0..d1208e96178 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -30,7 +30,8 @@ class Diagonal : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type>;
     using Diag = gko::matrix::Diagonal<value_type>;
     using Dense = gko::matrix::Dense<value_type>;
-    using MixedDense = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedDense =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
 
     Diagonal()
         : exec(gko::ReferenceExecutor::create()),
@@ -79,13 +80,14 @@ class Diagonal : public ::testing::Test {
     std::unique_ptr<Dense> dense3;
 };
 
-TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Diagonal, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
@@ -107,7 +109,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision)
 TYPED_TEST(Diagonal, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
@@ -574,7 +576,7 @@ TYPED_TEST(Diagonal, AppliesToComplex)
 TYPED_TEST(Diagonal, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -634,7 +636,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex)
 TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     using Scalar = gko::matrix::Dense<mixed_value_type>;
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 6214db82d1c..7f3c770c603 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -30,7 +30,8 @@ class Ell : public ::testing::Test {
     using Mtx = gko::matrix::Ell<value_type, index_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedVec =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
 
     Ell()
         : exec(gko::ReferenceExecutor::create()),
@@ -72,7 +73,8 @@ class Ell : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ell, AppliesToDenseVector)
@@ -91,7 +93,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = typename gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -106,7 +108,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
@@ -122,9 +124,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
-    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
+    using Vec2 = gko::matrix::Dense<gko::next_precision_with_half<T>>;
     auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
@@ -160,7 +162,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     // clang-format off
     auto x = gko::initialize<Vec>(
@@ -184,7 +186,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -209,7 +211,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -248,7 +250,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -265,7 +267,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -283,7 +285,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -327,7 +329,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -355,7 +357,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -384,7 +386,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision<T>;
+    using next_T = gko::next_precision_with_half<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -443,7 +445,7 @@ TYPED_TEST(Ell, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
@@ -466,7 +468,7 @@ TYPED_TEST(Ell, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
@@ -735,7 +737,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
@@ -752,7 +754,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
@@ -897,7 +899,7 @@ TYPED_TEST(Ell, AppliesToComplex)
 TYPED_TEST(Ell, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -954,7 +956,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex)
 TYPED_TEST(Ell, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -992,7 +994,7 @@ class EllComplex : public ::testing::Test {
     using Mtx = gko::matrix::Ell<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(EllComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(EllComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index 665df4ace31..9d9e2144cc3 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -104,7 +104,8 @@ class Fbcsr : public ::testing::Test {
     const std::unique_ptr<const Mtx> mtxsq;
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 template <typename T>
@@ -114,7 +115,7 @@ std::unique_ptr<gko::matrix::Dense<T>> get_some_vectors(
 {
     using RT = gko::remove_complex<T>;
     std::default_random_engine engine(39);
-    std::normal_distribution<RT> dist(0.0, 5.0);
+    std::normal_distribution<> dist(0.0, 5.0);
     std::uniform_int_distribution<> nnzdist(1, nrhs);
     return gko::test::generate_random_matrix<gko::matrix::Dense<T>>(
         nrows, nrhs, nnzdist, dist, engine, exec);
@@ -271,7 +272,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
@@ -294,7 +295,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
@@ -392,7 +393,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
@@ -411,7 +412,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
@@ -619,7 +620,7 @@ class FbcsrComplex : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index 87fd4c02811..c5e6496dce1 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -32,7 +32,8 @@ class Hybrid : public ::testing::Test {
     using Mtx = gko::matrix::Hybrid<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedVec =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
 
     Hybrid()
         : exec(gko::ReferenceExecutor::create()),
@@ -96,7 +97,8 @@ class Hybrid : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3;
 };
 
-TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Hybrid, AppliesToDenseVector)
@@ -233,7 +235,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
@@ -256,7 +258,7 @@ TYPED_TEST(Hybrid, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
@@ -367,7 +369,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
@@ -384,7 +386,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
@@ -699,7 +701,7 @@ TYPED_TEST(Hybrid, AppliesToComplex)
 TYPED_TEST(Hybrid, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -756,7 +758,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex)
 TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -795,7 +797,7 @@ class HybridComplex : public ::testing::Test {
     using Mtx = gko::matrix::Hybrid<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp
index 11953de338a..82704145978 100644
--- a/reference/test/matrix/identity.cpp
+++ b/reference/test/matrix/identity.cpp
@@ -19,7 +19,8 @@ class Identity : public ::testing::Test {
     using value_type = T;
     using Id = gko::matrix::Identity<value_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using MixedVec =
+        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
     using ComplexVec = gko::to_complex<Vec>;
     using MixedComplexVec = gko::to_complex<MixedVec>;
 
@@ -29,7 +30,8 @@ class Identity : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, AppliesToVector)
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
index 5418f97353b..b646a6fc67f 100644
--- a/reference/test/matrix/permutation.cpp
+++ b/reference/test/matrix/permutation.cpp
@@ -51,7 +51,7 @@ class Permutation : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp
index 6d8d49f5662..f2b3e66b4cd 100644
--- a/reference/test/matrix/scaled_permutation.cpp
+++ b/reference/test/matrix/scaled_permutation.cpp
@@ -58,7 +58,7 @@ class ScaledPermutation : public ::testing::Test {
     std::unique_ptr<Mtx> perm2;
 };
 
-TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 3208b8c42be..23251c63b8f 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -50,7 +50,8 @@ class Sellp : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
-TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sellp, AppliesToDenseVector)
@@ -67,7 +68,8 @@ TYPED_TEST(Sellp, AppliesToDenseVector)
 
 TYPED_TEST(Sellp, AppliesToMixedDenseVector)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -116,7 +118,8 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector)
 
 TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -189,7 +192,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
@@ -212,7 +215,7 @@ TYPED_TEST(Sellp, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
@@ -310,7 +313,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
@@ -329,7 +332,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision<ValueType>;
+    using OtherType = gko::next_precision_with_half<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
@@ -751,7 +754,7 @@ class SellpComplex : public ::testing::Test {
     using Mtx = gko::matrix::Sellp<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypes,
+TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp
index d8ed6147e30..8db0dee144f 100644
--- a/reference/test/matrix/sparsity_csr.cpp
+++ b/reference/test/matrix/sparsity_csr.cpp
@@ -47,7 +47,7 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index f08d6c352ca..30805d033ab 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -125,7 +125,7 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3_unsorted;
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -145,7 +145,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector)
 
 TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector)
 {
-    using T = gko::next_precision<typename TestFixture::value_type>;
+    using T = gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -192,7 +192,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector)
 
 TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector)
 {
-    using T = gko::next_precision<typename TestFixture::value_type>;
+    using T = gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -243,8 +243,8 @@ TYPED_TEST(SparsityCsr, AppliesToComplex)
 
 TYPED_TEST(SparsityCsr, AppliesToMixedComplex)
 {
-    using T =
-        gko::next_precision<gko::to_complex<typename TestFixture::value_type>>;
+    using T = gko::next_precision_with_half<
+        gko::to_complex<typename TestFixture::value_type>>;
     using Vec = gko::matrix::Dense<T>;
     auto x = gko::initialize<Vec>({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}},
                                   this->exec);
@@ -279,7 +279,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex)
 TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex)
 {
     using Vec = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using ComplexVec = gko::to_complex<Vec>;
     using T = typename ComplexVec::value_type;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp
index 4ff8e1fc36a..5e3d4b1a112 100644
--- a/test/matrix/fbcsr_kernels.cpp
+++ b/test/matrix/fbcsr_kernels.cpp
@@ -48,18 +48,23 @@ class Fbcsr : public CommonTestFixture {
     void generate_sin(gko::ptr_param<Dense> x)
     {
         value_type* const xarr = x->get_values();
+        // we do not have sin for half, so we compute sin in double or
+        // complex<double>
+        using working_type = std::conditional_t<gko::is_complex<value_type>(),
+                                                std::complex<double>, double>;
         for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) {
-            xarr[i] =
-                static_cast<real_type>(2.0) *
-                std::sin(static_cast<real_type>(i / 2.0) + get_random_value());
+            xarr[i] = static_cast<value_type>(
+                2.0 * std::sin(i / 2.0 +
+                               static_cast<working_type>(get_random_value())));
         }
     }
 };
 
 #ifdef GKO_COMPILING_HIP
-TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf,
+                 TypenameNameGenerator);
 #else
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 #endif
 
 TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice)
@@ -124,6 +129,8 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted)
     using Dense = typename TestFixture::Dense;
     using value_type = typename Mtx::value_type;
     if (this->exec->get_master() != this->exec) {
+        // FBCSR on accelerator does not have half precision apply through
+        // vendor libraries.
         SKIP_IF_HALF(value_type);
     }
     auto drand = gko::clone(this->exec, this->rsorted);
@@ -149,6 +156,8 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted)
     using Dense = typename TestFixture::Dense;
     using value_type = typename Mtx::value_type;
     if (this->exec->get_master() != this->exec) {
+        // FBCSR on accelerator does not have half precision apply through
+        // vendor libraries.
         SKIP_IF_HALF(value_type);
     }
     auto drand = gko::clone(this->exec, this->rsorted);
@@ -175,6 +184,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted)
     using value_type = typename TestFixture::value_type;
     using real_type = typename TestFixture::real_type;
     if (this->exec->get_master() != this->exec) {
+        // FBCSR on accelerator does not have half precision apply through
+        // vendor libraries.
         SKIP_IF_HALF(value_type);
     }
     auto drand = gko::clone(this->exec, this->rsorted);
@@ -208,6 +219,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted)
     using value_type = typename TestFixture::value_type;
     using real_type = typename TestFixture::real_type;
     if (this->exec->get_master() != this->exec) {
+        // FBCSR on accelerator does not have half precision apply through
+        // vendor libraries.
         SKIP_IF_HALF(value_type);
     }
     auto drand = gko::clone(this->exec, this->rsorted);

From 89a3d779817bb3d971a55eb25c1ad448b448176c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:29:56 +0200
Subject: [PATCH 365/448] base such as composition/combination with half and
 corr. test

---
 core/base/block_operator.cpp         |  8 ++++++--
 core/base/combination.cpp            |  2 +-
 core/base/composition.cpp            |  2 +-
 core/base/dense_cache.cpp            |  2 +-
 core/base/perturbation.cpp           |  2 +-
 core/test/base/combination.cpp       |  3 ++-
 core/test/base/composition.cpp       |  3 ++-
 core/test/base/dense_cache.cpp       |  3 ++-
 reference/test/base/composition.cpp  | 13 ++++++++-----
 reference/test/base/perturbation.cpp | 13 ++++++++-----
 10 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp
index f53375301a8..68c00aeee70 100644
--- a/core/base/block_operator.cpp
+++ b/core/base/block_operator.cpp
@@ -19,8 +19,12 @@ namespace {
 template <typename Fn>
 auto dispatch_dense(Fn&& fn, LinOp* v)
 {
-    return run<matrix::Dense, float, double, std::complex<float>,
-               std::complex<double>>(v, std::forward<Fn>(fn));
+    return run<matrix::Dense, float, double,
+#if GINKGO_ENABLE_HALF
+               half, std::complex<half>,
+#endif
+               std::complex<float>, std::complex<double>>(v,
+                                                          std::forward<Fn>(fn));
 }
 
 
diff --git a/core/base/combination.cpp b/core/base/combination.cpp
index 3b30b77d38c..53af6742f6e 100644
--- a/core/base/combination.cpp
+++ b/core/base/combination.cpp
@@ -168,7 +168,7 @@ void Combination<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_COMBINATION(_type) class Combination<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMBINATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMBINATION);
 
 
 }  // namespace gko
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index 82c8152300b..f6a7df21e45 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -222,7 +222,7 @@ void Composition<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_COMPOSITION(_type) class Composition<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMPOSITION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMPOSITION);
 
 
 }  // namespace gko
diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp
index 38a0decfa46..096ad1f761a 100644
--- a/core/base/dense_cache.cpp
+++ b/core/base/dense_cache.cpp
@@ -33,7 +33,7 @@ void DenseCache<ValueType>::init_from(
 
 
 #define GKO_DECLARE_DENSE_CACHE(_type) struct DenseCache<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CACHE);
 
 
 }  // namespace detail
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index 87501361c05..b17cba209e1 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -182,7 +182,7 @@ void Perturbation<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_PERTURBATION(_type) class Perturbation<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PERTURBATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PERTURBATION);
 
 
 }  // namespace gko
diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp
index 73c30ffe11c..63c73cfa168 100644
--- a/core/test/base/combination.cpp
+++ b/core/test/base/combination.cpp
@@ -43,7 +43,8 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> coefficients;
 };
 
-TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Combination, CanBeEmpty)
diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp
index 122755b8f92..58c86894fc8 100644
--- a/core/test/base/composition.cpp
+++ b/core/test/base/composition.cpp
@@ -41,7 +41,8 @@ class Composition : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
-TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Composition, CanBeEmpty)
diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp
index 526187610a4..54d904617db 100644
--- a/core/test/base/dense_cache.cpp
+++ b/core/test/base/dense_cache.cpp
@@ -31,7 +31,8 @@ class DenseCache : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(DenseCache, CanDefaultConstruct)
diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp
index f736edb53f9..d17b8602ce8 100644
--- a/reference/test/base/composition.cpp
+++ b/reference/test/base/composition.cpp
@@ -75,7 +75,8 @@ class Composition : public ::testing::Test {
     std::shared_ptr<Mtx> product;
 };
 
-TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Composition, CopiesOnSameExecutor)
@@ -142,7 +143,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using Mtx = gko::matrix::Dense<gko::next_precision<TypeParam>>;
+    using Mtx = gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
     using value_type = typename Mtx::value_type;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
@@ -182,7 +183,8 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using value_type = gko::next_precision<gko::to_complex<TypeParam>>;
+    using value_type =
+        gko::next_precision_with_half<gko::to_complex<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto x = gko::initialize<Mtx>(
@@ -222,7 +224,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using value_type = gko::next_precision<TypeParam>;
+    using value_type = gko::next_precision_with_half<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
@@ -267,7 +269,8 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
+    using MixedDense =
+        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp
index b6be9ab1563..50a5fe7db20 100644
--- a/reference/test/base/perturbation.cpp
+++ b/reference/test/base/perturbation.cpp
@@ -33,7 +33,8 @@ class Perturbation : public ::testing::Test {
     std::shared_ptr<gko::LinOp> scalar;
 };
 
-TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Perturbation, CopiesOnSameExecutor)
@@ -101,7 +102,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using Mtx = gko::matrix::Dense<gko::next_precision<TypeParam>>;
+    using Mtx = gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
     using value_type = typename Mtx::value_type;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -143,7 +144,8 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using value_type = gko::to_complex<gko::next_precision<TypeParam>>;
+    using value_type =
+        gko::to_complex<gko::next_precision_with_half<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -185,7 +187,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using value_type = gko::next_precision<TypeParam>;
+    using value_type = gko::next_precision_with_half<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -232,7 +234,8 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
+    using MixedDense =
+        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,

From 28936f514cbe05535b14505f2ec41870aa7177cb Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 4 Nov 2024 15:15:17 +0100
Subject: [PATCH 366/448] test_utils test

---
 core/test/utils/array_generator_test.cpp  | 18 +++++----
 core/test/utils/matrix_generator.hpp      | 18 +++++++--
 core/test/utils/matrix_generator_test.cpp | 49 ++++++++++++++---------
 core/test/utils/matrix_utils_test.cpp     | 11 ++---
 core/test/utils/unsort_matrix_test.cpp    |  2 +-
 core/test/utils/value_generator_test.cpp  | 16 +++++---
 reference/test/utils/assertions_test.cpp  |  3 +-
 7 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp
index ae66e4686da..ca96761ea4e 100644
--- a/core/test/utils/array_generator_test.cpp
+++ b/core/test/utils/array_generator_test.cpp
@@ -18,11 +18,12 @@ template <typename T>
 class ArrayGenerator : public ::testing::Test {
 protected:
     using value_type = T;
+    using check_type = double;
 
     ArrayGenerator() : exec(gko::ReferenceExecutor::create())
     {
         array = gko::test::generate_random_array<T>(
-            500, std::normal_distribution<gko::remove_complex<T>>(20.0, 5.0),
+            500, std::normal_distribution<>(20.0, 5.0),
             std::default_random_engine(42), exec);
     }
 
@@ -30,15 +31,17 @@ class ArrayGenerator : public ::testing::Test {
     gko::array<T> array;
 
     template <typename InputIterator, typename ValueType, typename Closure>
-    ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
-                             InputIterator sample_end, Closure closure_op)
+    check_type get_nth_moment(int n, ValueType c, InputIterator sample_start,
+                              InputIterator sample_end, Closure closure_op)
     {
         using std::pow;
-        ValueType res = 0;
-        ValueType num_elems = 0;
+        check_type res = 0;
+        check_type num_elems = 0;
         while (sample_start != sample_end) {
             auto tmp = *(sample_start++);
-            res += pow(closure_op(tmp) - c, n);
+            res += pow(static_cast<check_type>(closure_op(tmp)) -
+                           static_cast<check_type>(c),
+                       n);
             num_elems += 1;
         }
         return res / num_elems;
@@ -62,7 +65,8 @@ class ArrayGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(ArrayGenerator, OutputHasCorrectSize)
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 56ff38c520d..01ee40cdadc 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -659,10 +659,20 @@ gko::matrix_data<ValueType, IndexType> generate_tridiag_inverse_matrix_data(
                 auto off_diag = i < j ? upper : lower;
                 auto min_idx = std::min(i, j);
                 auto max_idx = std::max(i, j);
-                auto val = sign *
-                           static_cast<ValueType>(
-                               std::pow(off_diag, max_idx - min_idx)) *
-                           alpha[min_idx] * beta[max_idx + 1] / alpha.back();
+                // NVHPC 23.3 with O3 gives wrong result with std::pow on
+                // complex<half>. We use the float variant to help it, also for
+                // half.
+                using pow_type = std::conditional_t<
+                    std::is_same<gko::remove_complex<ValueType>,
+                                 gko::half>::value,
+                    std::conditional_t<gko::is_complex<ValueType>(),
+                                       std::complex<float>, float>,
+                    ValueType>;
+                auto val =
+                    sign *
+                    static_cast<ValueType>(std::pow(
+                        static_cast<pow_type>(off_diag), max_idx - min_idx)) *
+                    alpha[min_idx] * beta[max_idx + 1] / alpha.back();
                 md.nonzeros.emplace_back(i, j, val);
             }
         }
diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp
index 43756bc1709..61710540e24 100644
--- a/core/test/utils/matrix_generator_test.cpp
+++ b/core/test/utils/matrix_generator_test.cpp
@@ -20,31 +20,32 @@ template <typename T>
 class MatrixGenerator : public ::testing::Test {
 protected:
     using value_type = T;
+    using check_type = double;
     using real_type = gko::remove_complex<T>;
     using mtx_type = gko::matrix::Dense<T>;
 
     MatrixGenerator()
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::test::generate_random_matrix<mtx_type>(
-              500, 100, std::normal_distribution<real_type>(50, 5),
-              std::normal_distribution<real_type>(20.0, 5.0),
+              500, 100, std::normal_distribution<>(50, 5),
+              std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(42), exec)),
           dense_mtx(gko::test::generate_random_dense_matrix<value_type>(
-              500, 100, std::normal_distribution<real_type>(20.0, 5.0),
+              500, 100, std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(41), exec)),
           l_mtx(gko::test::generate_random_lower_triangular_matrix<mtx_type>(
-              4, true, std::normal_distribution<real_type>(50, 5),
-              std::normal_distribution<real_type>(20.0, 5.0),
+              4, true, std::normal_distribution<>(50, 5),
+              std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(42), exec)),
           u_mtx(gko::test::generate_random_upper_triangular_matrix<mtx_type>(
-              4, true, std::normal_distribution<real_type>(50, 5),
-              std::normal_distribution<real_type>(20.0, 5.0),
+              4, true, std::normal_distribution<>(50, 5),
+              std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(42), exec)),
           lower_bandwidth(2),
           upper_bandwidth(3),
           band_mtx(gko::test::generate_random_band_matrix<mtx_type>(
               100, lower_bandwidth, upper_bandwidth,
-              std::normal_distribution<real_type>(20.0, 5.0),
+              std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(42), exec)),
           nnz_per_row_sample(500, 0),
           values_sample(0),
@@ -96,15 +97,17 @@ class MatrixGenerator : public ::testing::Test {
 
 
     template <typename InputIterator, typename ValueType, typename Closure>
-    ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
-                             InputIterator sample_end, Closure closure_op)
+    check_type get_nth_moment(int n, ValueType c, InputIterator sample_start,
+                              InputIterator sample_end, Closure closure_op)
     {
         using std::pow;
-        ValueType res = 0;
-        ValueType num_elems = 0;
+        check_type res = 0;
+        check_type num_elems = 0;
         while (sample_start != sample_end) {
             auto tmp = *(sample_start++);
-            res += pow(closure_op(tmp) - c, n);
+            res += pow(static_cast<check_type>(closure_op(tmp)) -
+                           static_cast<check_type>(c),
+                       n);
             num_elems += 1;
         }
         return res / num_elems;
@@ -128,7 +131,8 @@ class MatrixGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(MatrixGenerator, OutputHasCorrectSize)
@@ -247,7 +251,7 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagMatrix)
 {
     using T = typename TestFixture::value_type;
     using Dense = typename TestFixture::mtx_type;
-    auto dist = std::normal_distribution<gko::remove_complex<T>>(0, 1);
+    auto dist = std::normal_distribution<>(0, 1);
     auto engine = std::default_random_engine(42);
     auto lower = gko::test::detail::get_rand_value<T>(dist, engine);
     auto diag = gko::test::detail::get_rand_value<T>(dist, engine);
@@ -271,18 +275,23 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix)
 {
     using T = typename TestFixture::value_type;
     using Dense = typename TestFixture::mtx_type;
-    auto dist = std::normal_distribution<gko::remove_complex<T>>(0, 1);
+    auto dist = std::normal_distribution<>(0, 1);
     auto engine = std::default_random_engine(42);
     auto lower = gko::test::detail::get_rand_value<T>(dist, engine);
     auto upper = gko::test::detail::get_rand_value<T>(dist, engine);
     // make diagonally dominant
-    auto diag = std::abs(gko::test::detail::get_rand_value<T>(dist, engine)) +
-                std::abs(lower) + std::abs(upper);
+    auto diag = gko::abs(gko::test::detail::get_rand_value<T>(dist, engine)) +
+                gko::abs(lower) + gko::abs(upper);
+    gko::size_type size = 50;
+    if (std::is_same_v<gko::remove_complex<T>, gko::half>) {
+        // half precision can only handle the inverse of a small matrix.
+        size = 5;
+    }
 
     auto mtx = gko::test::generate_tridiag_matrix<Dense>(
-        50, {lower, diag, upper}, this->exec);
+        size, {lower, diag, upper}, this->exec);
     auto inv_mtx = gko::test::generate_tridiag_inverse_matrix<Dense>(
-        50, {lower, diag, upper}, this->exec);
+        size, {lower, diag, upper}, this->exec);
 
     auto result = Dense::create(this->exec, mtx->get_size());
     inv_mtx->apply(mtx, result);
diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp
index 3c67571e1b2..f742d4561a2 100644
--- a/core/test/utils/matrix_utils_test.cpp
+++ b/core/test/utils/matrix_utils_test.cpp
@@ -30,8 +30,8 @@ class MatrixUtils : public ::testing::Test {
     MatrixUtils()
         : exec(gko::ReferenceExecutor::create()),
           data(gko::test::generate_random_matrix_data<value_type, int>(
-              500, 500, std::normal_distribution<real_type>(50, 5),
-              std::normal_distribution<real_type>(20.0, 5.0),
+              500, 500, std::normal_distribution<>(50, 5),
+              std::normal_distribution<>(20.0, 5.0),
               std::default_random_engine(42))),
           rectangular_data(gko::dim<2>(500, 100))
     {}
@@ -41,7 +41,8 @@ class MatrixUtils : public ::testing::Test {
     mtx_data rectangular_data;
 };
 
-TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(MatrixUtils, MakeSymmetricThrowsError)
@@ -241,7 +242,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly)
 TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly)
 {
     using T = typename TestFixture::value_type;
-    gko::remove_complex<T> ratio = 1.00001;
+    gko::remove_complex<T> ratio = 1.01;
     auto cpy_data = this->data;
 
     gko::utils::make_hpd(this->data, ratio);
@@ -276,7 +277,7 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly)
 TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly)
 {
     using T = typename TestFixture::value_type;
-    gko::remove_complex<T> ratio = 1.00001;
+    gko::remove_complex<T> ratio = 1.01;
     auto cpy_data = this->data;
 
     gko::utils::make_spd(this->data, ratio);
diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp
index 5d2f88f982a..40ec65b08db 100644
--- a/core/test/utils/unsort_matrix_test.cpp
+++ b/core/test/utils/unsort_matrix_test.cpp
@@ -119,7 +119,7 @@ class UnsortMatrix : public ::testing::Test {
     std::unique_ptr<Coo> coo_empty;
 };
 
-TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp
index 633565a66ef..57473c41b6e 100644
--- a/core/test/utils/value_generator_test.cpp
+++ b/core/test/utils/value_generator_test.cpp
@@ -20,19 +20,22 @@ template <typename T>
 class ValueGenerator : public ::testing::Test {
 protected:
     using value_type = T;
+    using check_type = double;
 
     ValueGenerator() {}
 
     template <typename InputIterator, typename ValueType, typename Closure>
-    ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
-                             InputIterator sample_end, Closure closure_op)
+    check_type get_nth_moment(int n, ValueType c, InputIterator sample_start,
+                              InputIterator sample_end, Closure closure_op)
     {
         using std::pow;
-        ValueType res = 0;
-        ValueType num_elems = 0;
+        check_type res = 0;
+        check_type num_elems = 0;
         while (sample_start != sample_end) {
             auto tmp = *(sample_start++);
-            res += pow(closure_op(tmp) - c, n);
+            res += pow(static_cast<check_type>(closure_op(tmp)) -
+                           static_cast<check_type>(c),
+                       n);
             num_elems += 1;
         }
         return res / num_elems;
@@ -56,7 +59,8 @@ class ValueGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(ValueGenerator, OutputHasCorrectAverageAndDeviation)
diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp
index 98f1ec68e0d..9c6b544172e 100644
--- a/reference/test/utils/assertions_test.cpp
+++ b/reference/test/utils/assertions_test.cpp
@@ -17,7 +17,8 @@ namespace {
 template <typename T>
 class MatricesNear : public ::testing::Test {};
 
-TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(MatricesNear, CanPassAnyMatrixType)

From 65136e973dbdf1a91ce4ba34c9a9b8f6c5e9da00 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 11:14:50 +0100
Subject: [PATCH 367/448] constexpr restriction for nvc++

---
 accessor/reference_helper.hpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp
index a3a77352f8f..61e15bf8b22 100644
--- a/accessor/reference_helper.hpp
+++ b/accessor/reference_helper.hpp
@@ -12,10 +12,8 @@
 #include "utils.hpp"
 
 
-// CUDA TOOLKIT < 11 does not support constexpr in combination with
-// thrust::complex, which is why constexpr is only present in later versions
-#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
-    (__CUDACC_VER_MAJOR__ < 11)
+// NVC++ rejects constexpr functions with a non-literal return type like half
+#if defined(__NVCOMPILER) && GINKGO_ENABLE_HALF
 
 #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR
 
@@ -23,7 +21,7 @@
 
 #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR constexpr
 
-#endif  // __CUDA_ARCH__ && __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__ < 11
+#endif
 
 
 namespace gko {

From dd3efb9e6ed3a1ca12df02ceb4375c1bfae2a89b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 15:03:07 +0100
Subject: [PATCH 368/448] cuda with CC<70 and hip do not support 16 bit atomic.
 throw error or fallback to a working version if it is the case for matrix

---
 common/cuda_hip/components/atomic.hpp         |  48 --------
 common/cuda_hip/matrix/coo_kernels.cpp        | 114 ++++++++++--------
 .../cuda_hip/matrix/csr_kernels.template.cpp  |  97 ++++++++-------
 common/cuda_hip/matrix/ell_kernels.cpp        |  93 ++++++++------
 hip/components/cooperative_groups.hip.hpp     |  12 +-
 5 files changed, 182 insertions(+), 182 deletions(-)

diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp
index 954bc7476ed..cd59485dac9 100644
--- a/common/cuda_hip/components/atomic.hpp
+++ b/common/cuda_hip/components/atomic.hpp
@@ -96,52 +96,6 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val)
         }                                                                    \
     };
 
-
-#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE)               \
-    template <typename ValueType>                                           \
-    struct atomic_helper<                                                   \
-        ValueType,                                                          \
-        std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> {  \
-        __forceinline__ __device__ static ValueType atomic_add(             \
-            ValueType* __restrict__ addr, ValueType val)                    \
-        {                                                                   \
-            assert(false);                                                  \
-            using c_type = CONVERTER_TYPE;                                  \
-            return atomic_wrapper(                                          \
-                addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \
-                    old = *c_addr;                                          \
-                    *c_addr = reinterpret<c_type>(                          \
-                        val + reinterpret<ValueType>(assumed));             \
-                });                                                         \
-        }                                                                   \
-        __forceinline__ __device__ static ValueType atomic_max(             \
-            ValueType* __restrict__ addr, ValueType val)                    \
-        {                                                                   \
-            assert(false);                                                  \
-            using c_type = CONVERTER_TYPE;                                  \
-            return atomic_wrapper(                                          \
-                addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \
-                    if (reinterpret<ValueType>(assumed) < val) {            \
-                        old = *c_addr;                                      \
-                        *c_addr = reinterpret<c_type>(assumed);             \
-                    }                                                       \
-                });                                                         \
-        }                                                                   \
-                                                                            \
-    private:                                                                \
-        template <typename Callable>                                        \
-        __forceinline__ __device__ static ValueType atomic_wrapper(         \
-            ValueType* __restrict__ addr, Callable set_old)                 \
-        {                                                                   \
-            CONVERTER_TYPE* address_as_converter =                          \
-                reinterpret_cast<CONVERTER_TYPE*>(addr);                    \
-            CONVERTER_TYPE old = *address_as_converter;                     \
-            CONVERTER_TYPE assumed = old;                                   \
-            set_old(old, assumed, address_as_converter);                    \
-            return reinterpret<ValueType>(old);                             \
-        }                                                                   \
-    };
-
 // Support 64-bit ATOMIC_ADD and ATOMIC_MAX
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 // Support 32-bit ATOMIC_ADD and ATOMIC_MAX
@@ -152,8 +106,6 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
 // Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC
 // >= 7.0
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
-#else
-GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int)
 #endif
 
 
diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp
index 4609f9f7f95..88d6dced504 100644
--- a/common/cuda_hip/matrix/coo_kernels.cpp
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -268,30 +268,38 @@ void spmv2(std::shared_ptr<const DefaultExecutor> exec,
     const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
 
-    if (nwarps > 0 && b_ncols > 0) {
-        // TODO: b_ncols needs to be tuned for ROCm.
-        if (b_ncols < 4) {
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
-
-            abstract_spmv<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
-                nnz, num_lines, as_device_type(a->get_const_values()),
-                a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_idxs()),
-                as_device_type(b->get_const_values()), b->get_stride(),
-                as_device_type(c->get_values()), c->get_stride());
-        } else {
-            int num_elems =
-                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, config::warp_size));
-
-            abstract_spmm<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
-                nnz, num_elems, as_device_type(a->get_const_values()),
-                a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_idxs()), b_ncols,
-                as_device_type(b->get_const_values()), b->get_stride(),
-                as_device_type(c->get_values()), c->get_stride());
+// 16-bit atomics are only available on CUDA with CC >= 7.0
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+    if constexpr (std::is_same_v<remove_complex<ValueType>, gko::half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else
+#endif
+    {
+        if (nwarps > 0 && b_ncols > 0) {
+            // TODO: b_ncols needs to be tuned for ROCm.
+            if (b_ncols < 4) {
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
+                int num_lines = ceildiv(nnz, nwarps * config::warp_size);
+
+                abstract_spmv<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
+                    nnz, num_lines, as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_idxs()),
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            } else {
+                int num_elems = ceildiv(nnz, nwarps * config::warp_size) *
+                                config::warp_size;
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
+                                    ceildiv(b_ncols, config::warp_size));
+
+                abstract_spmm<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
+                    nnz, num_elems, as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_idxs()), b_ncols,
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            }
         }
     }
 }
@@ -312,30 +320,40 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
     const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto b_ncols = b->get_size()[1];
 
-    if (nwarps > 0 && b_ncols > 0) {
-        // TODO: b_ncols needs to be tuned for ROCm.
-        if (b_ncols < 4) {
-            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-
-            abstract_spmv<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
-                nnz, num_lines, as_device_type(alpha->get_const_values()),
-                as_device_type(a->get_const_values()), a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_idxs()),
-                as_device_type(b->get_const_values()), b->get_stride(),
-                as_device_type(c->get_values()), c->get_stride());
-        } else {
-            int num_elems =
-                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, config::warp_size));
-
-            abstract_spmm<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
-                nnz, num_elems, as_device_type(alpha->get_const_values()),
-                as_device_type(a->get_const_values()), a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_idxs()), b_ncols,
-                as_device_type(b->get_const_values()), b->get_stride(),
-                as_device_type(c->get_values()), c->get_stride());
+    // 16-bit atomics are only available on CUDA with CC >= 7.0
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+    if constexpr (std::is_same_v<remove_complex<ValueType>, gko::half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else
+#endif
+    {
+        if (nwarps > 0 && b_ncols > 0) {
+            // TODO: b_ncols needs to be tuned for ROCm.
+            if (b_ncols < 4) {
+                int num_lines = ceildiv(nnz, nwarps * config::warp_size);
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
+
+                abstract_spmv<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
+                    nnz, num_lines, as_device_type(alpha->get_const_values()),
+                    as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_idxs()),
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            } else {
+                int num_elems = ceildiv(nnz, nwarps * config::warp_size) *
+                                config::warp_size;
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
+                                    ceildiv(b_ncols, config::warp_size));
+
+                abstract_spmm<<<coo_grid, coo_block, 0, exec->get_stream()>>>(
+                    nnz, num_elems, as_device_type(alpha->get_const_values()),
+                    as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(),
+                    as_device_type(a->get_const_row_idxs()), b_ncols,
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            }
         }
     }
 }
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index f808e234670..bd2423d4306 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2064,7 +2064,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
+bool load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<MatrixValueType, IndexType>* a,
                        const matrix::Dense<InputValueType>* b,
                        matrix::Dense<OutputValueType>* c,
@@ -2074,42 +2074,54 @@ void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
 
-    if (beta) {
-        dense::scale(exec, beta, c);
-    } else {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    const IndexType nwarps = a->get_num_srow_elements();
-    if (nwarps > 0) {
-        const dim3 csr_block(config::warp_size, warps_in_block, 1);
-        const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]);
-        const auto a_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-        auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-        if (alpha) {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
+    // 16-bit atomics are only available on CUDA with CC >= 7.0
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+    if constexpr (std::is_same_v<remove_complex<OutputValueType>, half>) {
+        return false;
+    } else
+#endif
+    {
+        if (beta) {
+            dense::scale(exec, beta, c);
         } else {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                kernel::abstract_spmv<<<csr_grid, csr_block, 0,
-                                        exec->get_stream()>>>(
-                    nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    as_device_type(a->get_const_row_ptrs()),
-                    as_device_type(a->get_const_srow()),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
+            dense::fill(exec, c, zero<OutputValueType>());
+        }
+        const IndexType nwarps = a->get_num_srow_elements();
+        if (nwarps > 0) {
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            const auto a_vals =
+                acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
+            const auto b_vals =
+                acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
+            auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
+            if (alpha) {
+                if (csr_grid.x > 0 && csr_grid.y > 0) {
+                    kernel::abstract_spmv<<<csr_grid, csr_block, 0,
+                                            exec->get_stream()>>>(
+                        nwarps, static_cast<IndexType>(a->get_size()[0]),
+                        as_device_type(alpha->get_const_values()),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        as_device_type(a->get_const_row_ptrs()),
+                        as_device_type(a->get_const_srow()),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals));
+                }
+            } else {
+                if (csr_grid.x > 0 && csr_grid.y > 0) {
+                    kernel::abstract_spmv<<<csr_grid, csr_block, 0,
+                                            exec->get_stream()>>>(
+                        nwarps, static_cast<IndexType>(a->get_size()[0]),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        as_device_type(a->get_const_row_ptrs()),
+                        as_device_type(a->get_const_srow()),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals));
+                }
             }
         }
+        return true;
     }
 }
 
@@ -2257,8 +2269,6 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c);
     } else if (a->get_strategy()->get_name() == "merge_path") {
         using arithmetic_type =
             highest_precision<InputValueType, OutputValueType, MatrixValueType>;
@@ -2273,8 +2283,10 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
             syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
     } else {
         bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
+        if (a->get_strategy()->get_name() == "load_balance") {
+            use_classical = !host_kernel::load_balance_spmv(exec, a, b, c);
+        } else if (a->get_strategy()->get_name() == "sparselib" ||
+                   a->get_strategy()->get_name() == "cusparse") {
             use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c);
         }
         if (use_classical) {
@@ -2316,8 +2328,6 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
     } else if (a->get_strategy()->get_name() == "merge_path") {
         using arithmetic_type =
             highest_precision<InputValueType, OutputValueType, MatrixValueType>;
@@ -2333,8 +2343,11 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
             beta);
     } else {
         bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
+        if (a->get_strategy()->get_name() == "load_balance") {
+            use_classical =
+                !host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
+        } else if (a->get_strategy()->get_name() == "sparselib" ||
+                   a->get_strategy()->get_name() == "cusparse") {
             use_classical =
                 !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta);
         }
diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp
index 16371166662..23079092162 100644
--- a/common/cuda_hip/matrix/ell_kernels.cpp
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -91,7 +91,7 @@ __device__ void spmv_kernel(
     using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto tidx = thread::get_thread_id_flat();
     const decltype(tidx) column_id = blockIdx.y;
-    if (num_thread_per_worker == 1) {
+    if constexpr (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
@@ -137,7 +137,7 @@ __device__ void spmv_kernel(
             __syncthreads();
             if (idx_in_worker == 0) {
                 const auto c_ind = x * c_stride + column_id;
-                if (atomic) {
+                if constexpr (atomic) {
                     atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
                 } else {
                     c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
@@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto alpha_val = alpha(0);
     const OutputValueType beta_val = beta[0];
-    if (atomic) {
+    if constexpr (atomic) {
         // Because the atomic operation changes the values of c during
         // computation, it can not directly do alpha * a * b + beta * c
         // operation. The beta * c needs to be done before calling this kernel.
@@ -240,42 +240,59 @@ void abstract_spmv(syn::value_list<int, info>,
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
 
-    const auto a_vals = acc::range<a_accessor>(
-        std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
-            num_stored_elements_per_row * stride)}},
-        a->get_const_values());
-    const auto b_vals = acc::range<b_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-
-    if (alpha == nullptr && beta == nullptr) {
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
-                    a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_device_range(b_vals),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = acc::range<a_accessor>(
-            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    stride, num_stored_elements_per_row,
-                    acc::as_device_range(b_vals),
-                    as_device_type(beta->get_const_values()),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else {
+// 16-bit atomics are only available on CUDA with CC >= 7.0
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+    // When num_thread_per_worker is not 1, atomics are used on shared memory.
+    // If atomic is also true, atomics are used on the output vector as well.
+    constexpr bool shared_half =
+        std::is_same_v<remove_complex<arithmetic_type>, half>;
+    constexpr bool atomic_half_out =
+        atomic && std::is_same_v<remove_complex<OutputValueType>, half>;
+    if constexpr (num_thread_per_worker != 1 &&
+                  (shared_half || atomic_half_out)) {
         GKO_KERNEL_NOT_FOUND;
+    } else
+#endif
+    {
+        const auto a_vals = acc::range<a_accessor>(
+            std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
+                num_stored_elements_per_row * stride)}},
+            a->get_const_values());
+        const auto b_vals = acc::range<b_accessor>(
+            std::array<acc::size_type, 2>{
+                {static_cast<acc::size_type>(b->get_size()[0]),
+                 static_cast<acc::size_type>(b->get_size()[1])}},
+            b->get_const_values(),
+            std::array<acc::size_type, 1>{
+                {static_cast<acc::size_type>(b->get_stride())}});
+
+        if (alpha == nullptr && beta == nullptr) {
+            if (grid_size.x > 0 && grid_size.y > 0) {
+                kernel::spmv<num_thread_per_worker, atomic>
+                    <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                        nrows, num_worker_per_row, acc::as_device_range(a_vals),
+                        a->get_const_col_idxs(), stride,
+                        num_stored_elements_per_row,
+                        acc::as_device_range(b_vals),
+                        as_device_type(c->get_values()), c->get_stride());
+            }
+        } else if (alpha != nullptr && beta != nullptr) {
+            const auto alpha_val = acc::range<a_accessor>(
+                std::array<acc::size_type, 1>{1}, alpha->get_const_values());
+            if (grid_size.x > 0 && grid_size.y > 0) {
+                kernel::spmv<num_thread_per_worker, atomic>
+                    <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                        nrows, num_worker_per_row,
+                        acc::as_device_range(alpha_val),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        stride, num_stored_elements_per_row,
+                        acc::as_device_range(b_vals),
+                        as_device_type(beta->get_const_values()),
+                        as_device_type(c->get_values()), c->get_stride());
+            }
+        } else {
+            GKO_KERNEL_NOT_FOUND;
+        }
     }
 }
 
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 46c2fb195bc..36618bb7f3e 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -306,7 +306,7 @@ class enable_extended_shuffle : public Group {
                                                SelectorType selector) const \
     {                                                                       \
         return shuffle_impl(                                                \
-            [this](uint16 v, SelectorType s) {                              \
+            [this](uint32 v, SelectorType s) {                              \
                 return static_cast<const Group*>(this)->_name(v, s);        \
             },                                                              \
             var, selector);                                                 \
@@ -326,12 +326,12 @@ class enable_extended_shuffle : public Group {
     shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var,
                  SelectorType selector)
     {
-        static_assert(sizeof(ValueType) % sizeof(uint16) == 0,
-                      "Unable to shuffle sizes which are not 2-byte multiples");
-        constexpr auto value_size = sizeof(ValueType) / sizeof(uint16);
+        static_assert(sizeof(ValueType) % sizeof(uint32) == 0,
+                      "Unable to shuffle sizes which are not 4-byte multiples");
+        constexpr auto value_size = sizeof(ValueType) / sizeof(uint32);
         ValueType result;
-        auto var_array = reinterpret_cast<const uint16*>(&var);
-        auto result_array = reinterpret_cast<uint16*>(&result);
+        auto var_array = reinterpret_cast<const uint32*>(&var);
+        auto result_array = reinterpret_cast<uint32*>(&result);
 #pragma unroll
         for (std::size_t i = 0; i < value_size; ++i) {
             result_array[i] = intrinsic_shuffle(var_array[i], selector);

From 917fee3d2830e36065e3fdbff5f3994ce4bfee90 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 26 Nov 2024 11:13:59 +0100
Subject: [PATCH 369/448] implement half shuffle via 32 bit impl

---
 hip/components/cooperative_groups.hip.hpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 36618bb7f3e..dce69421a31 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -319,6 +319,27 @@ class enable_extended_shuffle : public Group {
 
 #undef GKO_ENABLE_SHUFFLE_OPERATION
 
+// HIP does not support 16-bit shuffles directly
+#define GKO_ENABLE_SHUFFLE_OPERATION_HALF(_name, SelectorType)           \
+    __device__ __forceinline__ __half _name(const __half& var,           \
+                                            SelectorType selector) const \
+    {                                                                    \
+        uint32 u;                                                        \
+        memcpy(&u, &var, sizeof(__half));                                \
+        u = static_cast<const Group*>(this)->_name(u, selector);         \
+        __half result;                                                   \
+        memcpy(&result, &u, sizeof(__half));                             \
+        return result;                                                   \
+    }
+
+    GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl, int32)
+    GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_up, uint32)
+    GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_down, uint32)
+    GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_xor, int32)
+
+#undef GKO_ENABLE_SHUFFLE_OPERATION_HALF
+
+
 private:
     template <typename ShuffleOperator, typename ValueType,
               typename SelectorType>

From 0e46bbefcd4c4bcdbaf857c1d7def5852d9fcbbf Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:27:44 +0200
Subject: [PATCH 370/448] config

---
 core/config/config_helper.hpp          |  4 ++-
 core/config/dispatch.hpp               |  8 +++++
 core/config/parse_macro.hpp            | 50 ++++++++++++++++----------
 core/config/type_descriptor.cpp        |  2 +-
 core/config/type_descriptor_helper.hpp |  3 ++
 5 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
index 483366765aa..b1fa7bd69b5 100644
--- a/core/config/config_helper.hpp
+++ b/core/config/config_helper.hpp
@@ -202,7 +202,9 @@ get_value(const pnode& config)
  * This is specialization for floating point type
  */
 template <typename ValueType>
-inline std::enable_if_t<std::is_floating_point<ValueType>::value, ValueType>
+inline std::enable_if_t<std::is_floating_point<ValueType>::value ||
+                            std::is_same<ValueType, half>::value,
+                        ValueType>
 get_value(const pnode& config)
 {
     auto val = config.get_real();
diff --git a/core/config/dispatch.hpp b/core/config/dispatch.hpp
index 0138665aac2..1c6d0eb12cd 100644
--- a/core/config/dispatch.hpp
+++ b/core/config/dispatch.hpp
@@ -105,6 +105,14 @@ deferred_factory_parameter<ReturnType> dispatch(
 using value_type_list =
     syn::type_list<double, float, std::complex<double>, std::complex<float>>;
 
+#if GINKGO_ENABLE_HALF
+using value_type_list_with_half =
+    syn::type_list<double, float, half, std::complex<double>,
+                   std::complex<float>, std::complex<half>>;
+#else
+using value_type_list_with_half = value_type_list;
+#endif  // GINKGO_ENABLE_HALF
+
 using index_type_list = syn::type_list<int32, int64>;
 
 }  // namespace config
diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp
index 800b42f9493..e3734e5db7a 100644
--- a/core/config/parse_macro.hpp
+++ b/core/config/parse_macro.hpp
@@ -16,27 +16,33 @@
 
 
 // for value_type only
-#define GKO_PARSE_VALUE_TYPE(_type, _configurator)                            \
-    template <>                                                               \
-    deferred_factory_parameter<gko::LinOpFactory>                             \
-    parse<gko::config::LinOpFactoryType::_type>(                              \
-        const gko::config::pnode& config,                                     \
-        const gko::config::registry& context,                                 \
-        const gko::config::type_descriptor& td)                               \
-    {                                                                         \
-        auto updated = gko::config::update_type(config, td);                  \
-        return gko::config::dispatch<gko::LinOpFactory, _configurator>(       \
-            config, context, updated,                                         \
-            gko::config::make_type_selector(updated.get_value_typestr(),      \
-                                            gko::config::value_type_list())); \
-    }                                                                         \
-    static_assert(true,                                                       \
-                  "This assert is used to counter the false positive extra "  \
+#define GKO_PARSE_VALUE_TYPE_(_type, _configurator, _value_type_list)        \
+    template <>                                                              \
+    deferred_factory_parameter<gko::LinOpFactory>                            \
+    parse<gko::config::LinOpFactoryType::_type>(                             \
+        const gko::config::pnode& config,                                    \
+        const gko::config::registry& context,                                \
+        const gko::config::type_descriptor& td)                              \
+    {                                                                        \
+        auto updated = gko::config::update_type(config, td);                 \
+        return gko::config::dispatch<gko::LinOpFactory, _configurator>(      \
+            config, context, updated,                                        \
+            gko::config::make_type_selector(updated.get_value_typestr(),     \
+                                            _value_type_list));              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
+#define GKO_PARSE_VALUE_TYPE(_type, _configurator) \
+    GKO_PARSE_VALUE_TYPE_(_type, _configurator, gko::config::value_type_list())
 
+#define GKO_PARSE_VALUE_TYPE_WITH_HALF(_type, _configurator) \
+    GKO_PARSE_VALUE_TYPE_(_type, _configurator,              \
+                          gko::config::value_type_list_with_half())
 
 // for value_type and index_type
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator)                  \
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,                 \
+                                        _value_type_list)                     \
     template <>                                                               \
     deferred_factory_parameter<gko::LinOpFactory>                             \
     parse<gko::config::LinOpFactoryType::_type>(                              \
@@ -48,7 +54,7 @@
         return gko::config::dispatch<gko::LinOpFactory, _configurator>(       \
             config, context, updated,                                         \
             gko::config::make_type_selector(updated.get_value_typestr(),      \
-                                            gko::config::value_type_list()),  \
+                                            _value_type_list),                \
             gko::config::make_type_selector(updated.get_index_typestr(),      \
                                             gko::config::index_type_list())); \
     }                                                                         \
@@ -56,5 +62,13 @@
                   "This assert is used to counter the false positive extra "  \
                   "semi-colon warnings")
 
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,    \
+                                    gko::config::value_type_list())
+
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(_type, _configurator) \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,              \
+                                    gko::config::value_type_list_with_half())
+
 
 #endif  // GKO_CORE_CONFIG_PARSE_MACRO_HPP_
diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index fe11b785d6f..ef4cdc692f9 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -50,7 +50,7 @@ type_descriptor make_type_descriptor()
                                          GlobalIndexType)           \
     type_descriptor                                                 \
     make_type_descriptor<ValueType, LocalIndexType, GlobalIndexType>()
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_MAKE_TYPE_DESCRIPTOR);
 
 #define GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR(LocalIndexType, GlobalIndexType) \
diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp
index 0edc4376f1a..63a953e3a1e 100644
--- a/core/config/type_descriptor_helper.hpp
+++ b/core/config/type_descriptor_helper.hpp
@@ -8,6 +8,7 @@
 
 #include <string>
 
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/config/property_tree.hpp>
 #include <ginkgo/core/config/type_descriptor.hpp>
@@ -38,8 +39,10 @@ struct type_string {};
 TYPE_STRING_OVERLOAD(void, "void");
 TYPE_STRING_OVERLOAD(double, "float64");
 TYPE_STRING_OVERLOAD(float, "float32");
+TYPE_STRING_OVERLOAD(half, "float16");
 TYPE_STRING_OVERLOAD(std::complex<double>, "complex<float64>");
 TYPE_STRING_OVERLOAD(std::complex<float>, "complex<float32>");
+TYPE_STRING_OVERLOAD(std::complex<half>, "complex<float16>");
 TYPE_STRING_OVERLOAD(int32, "int32");
 TYPE_STRING_OVERLOAD(int64, "int64");
 

From 7676dea761088ed031230187c4a48a2f0240ee08 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 14:47:14 +0200
Subject: [PATCH 371/448] residual with half

---
 .../cuda_hip/stop/residual_norm_kernels.cpp   |  5 ++--
 core/device_hooks/common_kernels.inc.cpp      |  4 +--
 core/stop/residual_norm.cpp                   |  5 ++--
 dpcpp/stop/residual_norm_kernels.dp.cpp       |  7 +++--
 omp/stop/residual_norm_kernels.cpp            |  5 ++--
 reference/stop/residual_norm_kernels.cpp      |  5 ++--
 reference/test/stop/residual_norm_kernels.cpp | 21 ++++++++-----
 test/stop/residual_norm_kernels.cpp           | 30 ++++++++++++-------
 8 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/common/cuda_hip/stop/residual_norm_kernels.cpp b/common/cuda_hip/stop/residual_norm_kernels.cpp
index 9d6db5211e8..23ca8e5d5f1 100644
--- a/common/cuda_hip/stop/residual_norm_kernels.cpp
+++ b/common/cuda_hip/stop/residual_norm_kernels.cpp
@@ -91,7 +91,7 @@ void residual_norm(std::shared_ptr<const DefaultExecutor> exec,
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -171,7 +171,8 @@ void implicit_residual_norm(
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 439cda481a2..c41f9e921cb 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1136,7 +1136,7 @@ GKO_STUB(GKO_DECLARE_SET_ALL_STATUSES_KERNEL);
 namespace residual_norm {
 
 
-GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
+GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace residual_norm
@@ -1145,7 +1145,7 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 namespace implicit_residual_norm {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index c962784033a..5f75efcec82 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -227,12 +227,13 @@ bool ImplicitResidualNorm<ValueType>::check_impl(
 
 
 #define GKO_DECLARE_RESIDUAL_NORM(_type) class ResidualNormBase<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM);
 
 
 #define GKO_DECLARE_IMPLICIT_RESIDUAL_NORM(_type) \
     class ImplicitResidualNorm<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM);
 
 
 }  // namespace stop
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index ddb617a1a84..23d62e83729 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -69,7 +69,7 @@ void residual_norm(std::shared_ptr<const DpcppExecutor> exec,
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -108,7 +108,7 @@ void implicit_residual_norm(
         cgh.parallel_for(
             sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) {
                 const auto tidx = idx_id[0];
-                if (std::sqrt(std::abs(tau_val[tidx])) <=
+                if (gko::sqrt(gko::abs(tau_val[tidx])) <=
                     rel_residual_goal * orig_tau_val[tidx]) {
                     stop_status_val[tidx].converge(stoppingId, setFinalized);
                     device_storage_val[1] = true;
@@ -126,7 +126,8 @@ void implicit_residual_norm(
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp
index 0ec4395a16b..ff259477d03 100644
--- a/omp/stop/residual_norm_kernels.cpp
+++ b/omp/stop/residual_norm_kernels.cpp
@@ -53,7 +53,7 @@ void residual_norm(std::shared_ptr<const OmpExecutor> exec,
     *all_converged = local_all_converged;
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -98,7 +98,8 @@ void implicit_residual_norm(
     *all_converged = local_all_converged;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp
index ba2672edc28..ed91ff390b6 100644
--- a/reference/stop/residual_norm_kernels.cpp
+++ b/reference/stop/residual_norm_kernels.cpp
@@ -50,7 +50,7 @@ void residual_norm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -90,7 +90,8 @@ void implicit_residual_norm(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp
index 43b865796b7..aed801afacf 100644
--- a/reference/test/stop/residual_norm_kernels.cpp
+++ b/reference/test/stop/residual_norm_kernels.cpp
@@ -45,7 +45,8 @@ class ResidualNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(ResidualNorm, CanCreateFactory)
@@ -85,7 +86,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
     for (auto baseline :
          {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
         gko::remove_complex<T> factor =
-            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+            (baseline == mode::absolute) ? gko::zero<gko::remove_complex<T>>()
+                                         : r<T>::value;
         auto criterion = gko::stop::ResidualNorm<T>::build()
                              .with_reduction_factor(factor)
                              .with_baseline(baseline)
@@ -399,7 +401,9 @@ TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal)
         ASSERT_FALSE(abs_criterion->update().solution(solution).check(
             RelativeStoppingId, true, &stop_status, &one_changed));
 
-        solution->at(0) = rhs_val - r<T>::value * T{1.2};
+        // TODO FIXME: NVHPC evaluates rhs - r * 1.2 directly to a different
+        // result than tmp = r * 1.2; rhs - tmp. https://godbolt.org/z/GrGE9PE67
+        solution->at(0) = rhs_val - r<T>::value * T{1.4};
         ASSERT_FALSE(abs_criterion->update().solution(solution).check(
             RelativeStoppingId, true, &stop_status, &one_changed));
         ASSERT_EQ(stop_status.get_data()[0].has_converged(), false);
@@ -526,7 +530,7 @@ class ResidualNormWithInitialResnorm : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -667,7 +671,7 @@ class ResidualNormWithRhsNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -804,7 +808,7 @@ class ImplicitResidualNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -836,7 +840,8 @@ TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges)
     for (auto baseline :
          {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
         gko::remove_complex<T> factor =
-            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+            (baseline == mode::absolute) ? gko::zero<gko::remove_complex<T>>()
+                                         : r<T>::value;
         auto criterion = gko::stop::ImplicitResidualNorm<T>::build()
                              .with_reduction_factor(factor)
                              .with_baseline(baseline)
@@ -979,7 +984,7 @@ class ResidualNormWithAbsolute : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp
index a0a144bcf3b..62f656bed59 100644
--- a/test/stop/residual_norm_kernels.cpp
+++ b/test/stop/residual_norm_kernels.cpp
@@ -5,6 +5,7 @@
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 #include "core/test/utils.hpp"
@@ -57,7 +58,8 @@ class ResidualNorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> abs_factory;
 };
 
-TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(ResidualNorm, CanIgorneResidualNorm)
@@ -81,13 +83,16 @@ TYPED_TEST(ResidualNorm, CanIgorneResidualNorm)
                  gko::NotSupported);
 }
 
+
 TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
 {
     using Mtx = typename TestFixture::Mtx;
     using NormVector = typename TestFixture::NormVector;
     using T = typename TestFixture::ValueType;
+    // use Csr as the matrix so that apply can be used with half precision
+    using Csr = gko::matrix::Csr<T>;
     using mode = gko::stop::mode;
-    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec);
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Csr>({1.0}, this->exec);
     std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec);
     std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec);
     std::shared_ptr<gko::LinOp> res_norm =
@@ -96,7 +101,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
     for (auto baseline :
          {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
         gko::remove_complex<T> factor =
-            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+            (baseline == mode::absolute) ? gko::zero<gko::remove_complex<T>>()
+                                         : r<T>::value;
         auto criterion = gko::stop::ResidualNorm<T>::build()
                              .with_reduction_factor(factor)
                              .with_baseline(baseline)
@@ -116,6 +122,7 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
     }
 }
 
+
 TYPED_TEST(ResidualNorm, WaitsTillResidualGoal)
 {
     using Mtx = typename TestFixture::Mtx;
@@ -338,7 +345,7 @@ class ResidualNormWithInitialResnorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -435,7 +442,7 @@ class ResidualNormWithRhsNorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -540,16 +547,18 @@ class ImplicitResidualNorm : public CommonTestFixture {
         factory;
 };
 
-TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
 TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges)
 {
-    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::ValueType;
+    using Mtx = typename TestFixture::Mtx;
+    // use Csr as the matrix so that apply can be used with half precision
+    using Csr = gko::matrix::Csr<T>;
     using gko::stop::mode;
-    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec);
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Csr>({1.0}, this->exec);
     std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec);
     std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec);
     std::shared_ptr<gko::LinOp> implicit_sq_res_norm =
@@ -558,7 +567,8 @@ TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges)
     for (auto baseline :
          {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
         gko::remove_complex<T> factor =
-            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+            (baseline == mode::absolute) ? gko::zero<gko::remove_complex<T>>()
+                                         : r<T>::value;
         auto criterion = gko::stop::ImplicitResidualNorm<T>::build()
                              .with_reduction_factor(factor)
                              .with_baseline(baseline)
@@ -683,7 +693,7 @@ class ResidualNormWithAbsolute : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes,
+TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf,
                  TypenameNameGenerator);
 
 

From 488382763110a813c705ddf3799b4e5194555a9c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 16:56:55 +0100
Subject: [PATCH 372/448] residual norm default reduction_factor depends on the
 value type precision

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 include/ginkgo/core/stop/residual_norm.hpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp
index 7ee020207d4..c7f240950fa 100644
--- a/include/ginkgo/core/stop/residual_norm.hpp
+++ b/include/ginkgo/core/stop/residual_norm.hpp
@@ -6,10 +6,12 @@
 #define GKO_PUBLIC_CORE_STOP_RESIDUAL_NORM_HPP_
 
 
+#include <limits>
 #include <type_traits>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
@@ -120,7 +122,8 @@ class ResidualNorm : public ResidualNormBase<ValueType> {
          * Residual norm reduction factor
          */
         remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
-            reduction_factor, static_cast<remove_complex<ValueType>>(1e-15));
+            reduction_factor,
+            5 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
 
         /**
          * The quantity the reduction is relative to. Choices include
@@ -176,7 +179,8 @@ class ImplicitResidualNorm : public ResidualNormBase<ValueType> {
          * Implicit Residual norm goal
          */
         remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
-            reduction_factor, static_cast<remove_complex<ValueType>>(1e-15));
+            reduction_factor,
+            5 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
 
         /**
          * The quantity the reduction is relative to. Choices include
@@ -251,7 +255,8 @@ class GKO_DEPRECATED(
          * Factor by which the residual norm will be reduced
          */
         remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
-            reduction_factor, static_cast<remove_complex<ValueType>>(1e-15));
+            reduction_factor,
+            5 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
     };
     GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction<ValueType>, parameters,
                                  Factory);
@@ -307,7 +312,8 @@ class GKO_DEPRECATED(
          * Relative residual norm goal
          */
         remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
-            tolerance, static_cast<remove_complex<ValueType>>(1e-15));
+            tolerance,
+            5 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
     };
     GKO_ENABLE_CRITERION_FACTORY(RelativeResidualNorm<ValueType>, parameters,
                                  Factory);
@@ -360,7 +366,8 @@ class GKO_DEPRECATED(
          * Absolute residual norm goal
          */
         remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
-            tolerance, static_cast<remove_complex<ValueType>>(1e-15));
+            tolerance,
+            5 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
     };
     GKO_ENABLE_CRITERION_FACTORY(AbsoluteResidualNorm<ValueType>, parameters,
                                  Factory);

From 310fd59a9c39fbf51202835b35a4b212fdec605e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:57:19 +0200
Subject: [PATCH 373/448] residual config dispatch

---
 core/config/stop_config.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/core/config/stop_config.cpp b/core/config/stop_config.cpp
index 4623eb768fc..2696b471a21 100644
--- a/core/config/stop_config.cpp
+++ b/core/config/stop_config.cpp
@@ -87,7 +87,8 @@ deferred_factory_parameter<stop::CriterionFactory> configure_residual(
     auto updated = update_type(config, td);
     return dispatch<stop::CriterionFactory, ResidualNormConfigurer>(
         config, context, updated,
-        make_type_selector(updated.get_value_typestr(), value_type_list()));
+        make_type_selector(updated.get_value_typestr(),
+                           value_type_list_with_half()));
 }
 
 
@@ -119,7 +120,8 @@ deferred_factory_parameter<stop::CriterionFactory> configure_implicit_residual(
     auto updated = update_type(config, td);
     return dispatch<stop::CriterionFactory, ImplicitResidualNormConfigurer>(
         config, context, updated,
-        make_type_selector(updated.get_value_typestr(), value_type_list()));
+        make_type_selector(updated.get_value_typestr(),
+                           value_type_list_with_half()));
 }
 
 

From 1de2a94650efef5611f080bd588c7ed02640e82b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:44:50 +0200
Subject: [PATCH 374/448] krylov solver

---
 common/cuda_hip/solver/idr_kernels.cpp        | 20 ++++--
 common/unified/solver/bicg_kernels.cpp        |  7 +-
 common/unified/solver/bicgstab_kernels.cpp    | 15 +++--
 common/unified/solver/cg_kernels.cpp          |  6 +-
 common/unified/solver/cgs_kernels.cpp         |  9 +--
 .../unified/solver/common_gmres_kernels.cpp   |  7 +-
 common/unified/solver/fcg_kernels.cpp         |  7 +-
 common/unified/solver/gcr_kernels.cpp         |  7 +-
 common/unified/solver/gmres_kernels.cpp       |  8 ++-
 core/device_hooks/common_kernels.inc.cpp      | 64 +++++++++----------
 core/solver/bicg.cpp                          |  4 +-
 core/solver/bicgstab.cpp                      |  4 +-
 core/solver/cg.cpp                            |  4 +-
 core/solver/cgs.cpp                           |  4 +-
 core/solver/fcg.cpp                           |  4 +-
 core/solver/gcr.cpp                           |  4 +-
 core/solver/gmres.cpp                         |  4 +-
 core/solver/idr.cpp                           | 16 ++++-
 core/solver/ir.cpp                            |  4 +-
 core/test/solver/bicg.cpp                     |  2 +-
 core/test/solver/bicgstab.cpp                 |  3 +-
 core/test/solver/cg.cpp                       |  2 +-
 core/test/solver/cgs.cpp                      |  2 +-
 core/test/solver/fcg.cpp                      |  2 +-
 core/test/solver/gcr.cpp                      | 23 +++----
 core/test/solver/gmres.cpp                    | 23 +++----
 core/test/solver/idr.cpp                      |  2 +-
 core/test/solver/ir.cpp                       |  2 +-
 cuda/base/curand_bindings.hpp                 | 13 ++++
 dpcpp/solver/cb_gmres_kernels.dp.cpp          | 10 +--
 dpcpp/solver/common_gmres_kernels.dp.inc      | 10 +--
 dpcpp/solver/idr_kernels.dp.cpp               | 35 ++++++----
 hip/base/hiprand_bindings.hip.hpp             | 13 ++++
 omp/solver/idr_kernels.cpp                    | 24 ++++---
 reference/solver/bicg_kernels.cpp             |  7 +-
 reference/solver/bicgstab_kernels.cpp         | 15 +++--
 reference/solver/cg_kernels.cpp               |  6 +-
 reference/solver/cgs_kernels.cpp              |  9 +--
 reference/solver/common_gmres_kernels.cpp     |  7 +-
 reference/solver/fcg_kernels.cpp              |  7 +-
 reference/solver/gcr_kernels.cpp              |  7 +-
 reference/solver/gmres_kernels.cpp            |  8 ++-
 reference/solver/idr_kernels.cpp              | 24 ++++---
 reference/test/solver/bicg_kernels.cpp        | 22 +++++--
 reference/test/solver/bicgstab_kernels.cpp    | 40 ++++++++----
 reference/test/solver/cg_kernels.cpp          | 26 ++++++--
 reference/test/solver/cgs_kernels.cpp         | 35 +++++++---
 reference/test/solver/fcg_kernels.cpp         | 26 ++++++--
 reference/test/solver/gcr_kernels.cpp         | 38 ++++++++---
 reference/test/solver/gmres_kernels.cpp       | 30 +++++++--
 reference/test/solver/idr_kernels.cpp         | 28 ++++++--
 reference/test/solver/ir_kernels.cpp          | 11 ++--
 test/solver/cb_gmres_kernels.cpp              |  2 +-
 53 files changed, 455 insertions(+), 257 deletions(-)

diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp
index a0f605134eb..0dc310ebd2e 100644
--- a/common/cuda_hip/solver/idr_kernels.cpp
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -344,9 +344,13 @@ __global__ __launch_bounds__(config::warp_size) void compute_omega_kernel(
 
     if (!stop_status[global_id].has_stopped()) {
         auto thr = omega[global_id];
+        const auto normt = sqrt(real(tht[global_id]));
+        if (normt == zero<remove_complex<ValueType>>()) {
+            omega[global_id] = zero<ValueType>();
+            return;
+        }
         omega[global_id] /= tht[global_id];
-        auto absrho =
-            abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id]));
+        auto absrho = abs(thr / (normt * residual_norm[global_id]));
 
         if (absrho < kappa) {
             omega[global_id] *= kappa / absrho;
@@ -555,7 +559,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     orthonormalize_subspace_vectors(exec, subspace_vectors);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -582,7 +587,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -609,7 +614,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -626,7 +631,7 @@ void step_3(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
     update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -644,7 +649,8 @@ void compute_omega(
         as_device_type(omega->get_values()), stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp
index 7d15718c05d..4c6fe8cdc98 100644
--- a/common/unified/solver/bicg_kernels.cpp
+++ b/common/unified/solver/bicg_kernels.cpp
@@ -64,7 +64,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -90,7 +91,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -119,7 +120,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         default_stride(q2), row_vector(beta), row_vector(rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp
index c403da3bf96..ad5b1ed3302 100644
--- a/common/unified/solver/bicgstab_kernels.cpp
+++ b/common/unified/solver/bicgstab_kernels.cpp
@@ -69,7 +69,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -98,7 +99,8 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(alpha), row_vector(omega), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -127,7 +129,8 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -159,7 +162,8 @@ void step_3(
         row_vector(omega), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -188,7 +192,8 @@ void finalize(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size()[1], *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp
index 822dddf1c3b..e77f01de748 100644
--- a/common/unified/solver/cg_kernels.cpp
+++ b/common/unified/solver/cg_kernels.cpp
@@ -57,7 +57,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -80,7 +80,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(rho), row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -106,7 +106,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         default_stride(q), row_vector(beta), row_vector(rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp
index 0618b8f8208..6ceaa883c9f 100644
--- a/common/unified/solver/cgs_kernels.cpp
+++ b/common/unified/solver/cgs_kernels.cpp
@@ -72,7 +72,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -103,7 +104,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -134,7 +135,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(alpha), row_vector(rho), row_vector(gamma), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
 
 template <typename ValueType>
 void step_3(std::shared_ptr<const DefaultExecutor> exec,
@@ -157,7 +158,7 @@ void step_3(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp
index 679aebcfaa2..32fe526d7f6 100644
--- a/common/unified/solver/common_gmres_kernels.cpp
+++ b/common/unified/solver/common_gmres_kernels.cpp
@@ -52,7 +52,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -125,7 +126,7 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
         stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
 
 
@@ -158,7 +159,7 @@ void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
         residual_norm_collection->get_size()[1]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp
index 7853d97c358..01dd3cb3d9a 100644
--- a/common/unified/solver/fcg_kernels.cpp
+++ b/common/unified/solver/fcg_kernels.cpp
@@ -61,7 +61,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -84,7 +85,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(rho_t), row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -113,7 +114,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp
index 7adef77dfb1..d5c2e27097d 100644
--- a/common/unified/solver/gcr_kernels.cpp
+++ b/common/unified/solver/gcr_kernels.cpp
@@ -44,7 +44,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GCR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -78,7 +79,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -104,7 +105,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 }  // namespace gcr
 }  // namespace GKO_DEVICE_NAMESPACE
diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp
index f24ae445edb..38bb935df9f 100644
--- a/common/unified/solver/gmres_kernels.cpp
+++ b/common/unified/solver/gmres_kernels.cpp
@@ -56,7 +56,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -92,7 +92,8 @@ void multi_axpy(std::shared_ptr<const DefaultExecutor> exec,
         before_preconditioner->get_size()[1], stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
 
 template <typename ValueType>
@@ -119,7 +120,8 @@ void multi_dot(std::shared_ptr<const DefaultExecutor> exec,
         next_krylov->get_size()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 }  // namespace gmres
 }  // namespace GKO_DEVICE_NAMESPACE
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index c41f9e921cb..1c57ca45177 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -549,9 +549,9 @@ GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
 namespace cg {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
@@ -560,9 +560,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
 namespace bicg {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
@@ -593,9 +593,9 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 namespace fcg {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
@@ -604,11 +604,11 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
 namespace bicgstab {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
@@ -617,11 +617,11 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 namespace idr {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
@@ -630,10 +630,10 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 namespace cgs {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
@@ -641,9 +641,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
 namespace gcr {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 
 }  // namespace gcr
@@ -651,9 +651,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
 namespace common_gmres {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
 }  // namespace common_gmres
@@ -662,9 +662,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 namespace gmres {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 
 }  // namespace gmres
diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
index 0b39b3664cc..55d18f7f01d 100644
--- a/core/solver/bicg.cpp
+++ b/core/solver/bicg.cpp
@@ -293,8 +293,8 @@ std::vector<int> workspace_traits<Bicg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_BICG(_type) class Bicg<_type>
 #define GKO_DECLARE_BICG_TRAITS(_type) struct workspace_traits<Bicg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp
index c254b417765..1e27c046186 100644
--- a/core/solver/bicgstab.cpp
+++ b/core/solver/bicgstab.cpp
@@ -298,8 +298,8 @@ std::vector<int> workspace_traits<Bicgstab<ValueType>>::vectors(const Solver&)
 #define GKO_DECLARE_BICGSTAB(_type) class Bicgstab<_type>
 #define GKO_DECLARE_BICGSTAB_TRAITS(_type) \
     struct workspace_traits<Bicgstab<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index c512dc4313b..a7898577b8a 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -243,8 +243,8 @@ std::vector<int> workspace_traits<Cg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_CG(_type) class Cg<_type>
 #define GKO_DECLARE_CG_TRAITS(_type) struct workspace_traits<Cg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp
index 19f625228a3..4ec702a8db5 100644
--- a/core/solver/cgs.cpp
+++ b/core/solver/cgs.cpp
@@ -265,8 +265,8 @@ std::vector<int> workspace_traits<Cgs<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_CGS(_type) class Cgs<_type>
 #define GKO_DECLARE_CGS_TRAITS(_type) struct workspace_traits<Cgs<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index 6c65f63ccae..569061626ff 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -247,8 +247,8 @@ std::vector<int> workspace_traits<Fcg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_FCG(_type) class Fcg<_type>
 #define GKO_DECLARE_FCG_TRAITS(_type) struct workspace_traits<Fcg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp
index d5131632dc3..8219de79ef4 100644
--- a/core/solver/gcr.cpp
+++ b/core/solver/gcr.cpp
@@ -371,8 +371,8 @@ std::vector<int> workspace_traits<Gcr<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_GCR(_type) class Gcr<_type>
 #define GKO_DECLARE_GCR_TRAITS(_type) struct workspace_traits<Gcr<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index e066fc696a1..8a4fdf563c3 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -707,8 +707,8 @@ std::vector<int> workspace_traits<Gmres<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_GMRES(_type) class Gmres<_type>
 #define GKO_DECLARE_GMRES_TRAITS(_type) struct workspace_traits<Gmres<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp
index c6d89b84ea6..d090324fea1 100644
--- a/core/solver/idr.cpp
+++ b/core/solver/idr.cpp
@@ -65,6 +65,10 @@ std::unique_ptr<LinOp> Idr<ValueType>::transpose() const
         .with_generated_preconditioner(
             share(as<Transposable>(this->get_preconditioner())->transpose()))
         .with_criteria(this->get_stop_criterion_factory())
+        .with_subspace_dim(this->get_subspace_dim())
+        .with_kappa(this->get_kappa())
+        .with_deterministic(this->get_deterministic())
+        .with_complex_subspace(this->get_complex_subspace())
         .on(this->get_executor())
         ->generate(
             share(as<Transposable>(this->get_system_matrix())->transpose()));
@@ -78,6 +82,10 @@ std::unique_ptr<LinOp> Idr<ValueType>::conj_transpose() const
         .with_generated_preconditioner(share(
             as<Transposable>(this->get_preconditioner())->conj_transpose()))
         .with_criteria(this->get_stop_criterion_factory())
+        .with_subspace_dim(this->get_subspace_dim())
+        .with_kappa(this->get_kappa())
+        .with_deterministic(this->get_deterministic())
+        .with_complex_subspace(this->get_complex_subspace())
         .on(this->get_executor())
         ->generate(share(
             as<Transposable>(this->get_system_matrix())->conj_transpose()));
@@ -272,7 +280,9 @@ void Idr<ValueType>::iterate(const VectorType* dense_b,
 
         // omega = (t^H * residual) / (t^H * t)
         // rho = (t^H * residual) / (norm(t) * norm(residual))
-        // if abs(rho) < kappa then
+        // if norm(t) is zero then
+        //     omega = 0
+        // else if abs(rho) < kappa then
         //     omega *= kappa / abs(rho)
         // end if
         // residual -= omega * t
@@ -396,8 +406,8 @@ std::vector<int> workspace_traits<Idr<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_IDR(_type) class Idr<_type>
 #define GKO_DECLARE_IDR_TRAITS(_type) struct workspace_traits<Idr<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp
index 75efac351f9..3c2854dcf98 100644
--- a/core/solver/ir.cpp
+++ b/core/solver/ir.cpp
@@ -370,8 +370,8 @@ std::vector<int> workspace_traits<Ir<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_IR(_type) class Ir<_type>
 #define GKO_DECLARE_IR_TRAITS(_type) struct workspace_traits<Ir<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp
index e5a40e0c4f8..a229bd85ed9 100644
--- a/core/test/solver/bicg.cpp
+++ b/core/test/solver/bicg.cpp
@@ -46,7 +46,7 @@ class Bicg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicg, BicgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp
index f8b8d3c7b05..23695fe1355 100644
--- a/core/test/solver/bicgstab.cpp
+++ b/core/test/solver/bicgstab.cpp
@@ -45,7 +45,8 @@ class Bicgstab : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Bicgstab, BicgstabFactoryKnowsItsExecutor)
diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp
index cbf637de302..95552d841ac 100644
--- a/core/test/solver/cg.cpp
+++ b/core/test/solver/cg.cpp
@@ -46,7 +46,7 @@ class Cg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Cg, CgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp
index 5dc80892a1b..cc355b58270 100644
--- a/core/test/solver/cgs.cpp
+++ b/core/test/solver/cgs.cpp
@@ -46,7 +46,7 @@ class Cgs : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Cgs, CgsFactoryKnowsItsExecutor)
diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp
index 2898a5f5c46..c92fa4bb7f1 100644
--- a/core/test/solver/fcg.cpp
+++ b/core/test/solver/fcg.cpp
@@ -44,7 +44,7 @@ class Fcg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Fcg, FcgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp
index 2d7b5ea7974..58194f6e92a 100644
--- a/core/test/solver/gcr.cpp
+++ b/core/test/solver/gcr.cpp
@@ -27,8 +27,8 @@ class Gcr : public ::testing::Test {
     using Solver = gko::solver::Gcr<value_type>;
     using Big_solver = gko::solver::Gcr<double>;
 
-    static constexpr gko::remove_complex<T> reduction_factor =
-        gko::remove_complex<T>(1e-6);
+    const gko::remove_complex<T> reduction_factor =
+        r<gko::remove_complex<T>>::value;
 
     Gcr()
         : exec(gko::ReferenceExecutor::create()),
@@ -70,10 +70,7 @@ class Gcr : public ::testing::Test {
     }
 };
 
-template <typename T>
-constexpr gko::remove_complex<T> Gcr<T>::reduction_factor;
-
-TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Gcr, GcrFactoryKnowsItsExecutor)
@@ -160,10 +157,9 @@ TYPED_TEST(Gcr, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto gcr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(this->reduction_factor))
             .with_preconditioner(Solver::build().with_criteria(
                 gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
@@ -210,10 +206,9 @@ TYPED_TEST(Gcr, CanSetKrylovDim)
     auto gcr_factory =
         Solver::build()
             .with_krylov_dim(4u)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(this->reduction_factor))
             .on(this->exec);
     auto solver = gcr_factory->generate(this->mtx);
     auto krylov_dim = solver->get_krylov_dim();
diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp
index 5d9c9e3c40e..50f505f6321 100644
--- a/core/test/solver/gmres.cpp
+++ b/core/test/solver/gmres.cpp
@@ -27,8 +27,8 @@ class Gmres : public ::testing::Test {
     using Solver = gko::solver::Gmres<value_type>;
     using Big_solver = gko::solver::Gmres<double>;
 
-    static constexpr gko::remove_complex<T> reduction_factor =
-        gko::remove_complex<T>(1e-6);
+    const gko::remove_complex<T> reduction_factor =
+        r<gko::remove_complex<T>>::value;
 
     Gmres()
         : exec(gko::ReferenceExecutor::create()),
@@ -60,10 +60,7 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<gko::LinOp> big_solver;
 };
 
-template <typename T>
-constexpr gko::remove_complex<T> Gmres<T>::reduction_factor;
-
-TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Gmres, GmresFactoryKnowsItsExecutor)
@@ -146,10 +143,9 @@ TYPED_TEST(Gmres, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(this->reduction_factor))
             .with_preconditioner(Solver::build().with_criteria(
                 gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
@@ -197,10 +193,9 @@ TYPED_TEST(Gmres, CanSetKrylovDim)
     auto gmres_factory =
         Solver::build()
             .with_krylov_dim(4u)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(this->reduction_factor))
             .on(this->exec);
     auto solver = gmres_factory->generate(this->mtx);
     auto krylov_dim = solver->get_krylov_dim();
diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp
index 9eb79356046..823327e337e 100644
--- a/core/test/solver/idr.cpp
+++ b/core/test/solver/idr.cpp
@@ -45,7 +45,7 @@ class Idr : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Idr, IdrFactoryKnowsItsExecutor)
diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp
index 1137862a395..59f85f42321 100644
--- a/core/test/solver/ir.cpp
+++ b/core/test/solver/ir.cpp
@@ -46,7 +46,7 @@ class Ir : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Ir, IrFactoryKnowsItsExecutor)
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index eb3dbee6b7b..80ceff2dacd 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -23,6 +23,17 @@ namespace cuda {
  * @ingroup curand
  */
 namespace curand {
+namespace detail {
+
+// Fallback generator: ignores its arguments and reports a cuRAND type error.
+template <typename... Args>
+inline int64 not_implemented(Args...)
+{
+    return static_cast<int64>(CURAND_STATUS_TYPE_ERROR);
+}
+
+
+}  // namespace detail
 
 
 template <typename ValueType>
@@ -77,6 +88,8 @@ GKO_BIND_CURAND_RANDOM_VECTOR(float, curandGenerateNormal);
 GKO_BIND_CURAND_RANDOM_VECTOR(double, curandGenerateNormalDouble);
 GKO_BIND_CURAND_RANDOM_VECTOR(std::complex<float>, curandGenerateNormal);
 GKO_BIND_CURAND_RANDOM_VECTOR(std::complex<double>, curandGenerateNormalDouble);
+template <typename ValueType>
+GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, detail::not_implemented);
 
 
 #undef GKO_BIND_CURAND_RANDOM_VECTOR
diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index 7ab010ba29f..e3424944309 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -285,9 +285,9 @@ void multinorminf_without_stop_kernel(
              i += default_dot_dim) {
             const auto next_krylov_idx = i * stride_next_krylov + col_idx;
             local_max =
-                (local_max >= std::abs(next_krylov_basis[next_krylov_idx]))
+                (local_max >= gko::abs(next_krylov_basis[next_krylov_idx]))
                     ? local_max
-                    : std::abs(next_krylov_basis[next_krylov_idx]);
+                    : gko::abs(next_krylov_basis[next_krylov_idx]);
         }
     }
     reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_max;
@@ -373,7 +373,7 @@ void multinorm2_inf_kernel(
             local_res += squared_norm(num);
             if (compute_inf) {
                 local_max =
-                    ((local_max >= std::abs(num)) ? local_max : std::abs(num));
+                    ((local_max >= gko::abs(num)) ? local_max : gko::abs(num));
             }
         }
     }
@@ -729,8 +729,8 @@ void check_arnoldi_norms(
         gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
 
     if (col_idx < num_rhs && !stop_status[col_idx].has_stopped()) {
-        const auto num0 = (std::sqrt(eta_squared * arnoldi_norm[col_idx]));
-        const auto num11 = std::sqrt(arnoldi_norm[col_idx + stride_norm]);
+        const auto num0 = gko::sqrt(eta_squared * arnoldi_norm[col_idx]);
+        const auto num11 = gko::sqrt(arnoldi_norm[col_idx + stride_norm]);
         const auto num2 = has_scalar ? (arnoldi_norm[col_idx + 2 * stride_norm])
                                      : remove_complex<ValueType>{};
         if (num11 < num0) {
diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc
index 0b5de8188f2..f8a54fe5116 100644
--- a/dpcpp/solver/common_gmres_kernels.dp.inc
+++ b/dpcpp/solver/common_gmres_kernels.dp.inc
@@ -72,12 +72,12 @@ void calculate_sin_and_cos_kernel(size_type col_idx, size_type num_cols,
         register_cos = zero<ValueType>();
         register_sin = one<ValueType>();
     } else {
-        const auto scale = std::abs(this_hess) + std::abs(next_hess);
+        const auto scale = gko::abs(this_hess) + gko::abs(next_hess);
         const auto hypotenuse =
             scale *
-            std::sqrt(
-                std::abs(this_hess / scale) * std::abs(this_hess / scale) +
-                std::abs(next_hess / scale) * std::abs(next_hess / scale));
+            gko::sqrt(
+                gko::abs(this_hess / scale) * gko::abs(this_hess / scale) +
+                gko::abs(next_hess / scale) * gko::abs(next_hess / scale));
         register_cos = conj(this_hess) / hypotenuse;
         register_sin = conj(next_hess) / hypotenuse;
     }
@@ -102,7 +102,7 @@ void calculate_residual_norm_kernel(size_type col_idx, size_type num_cols,
     const auto next_rnc = -conj(register_sin) * this_rnc;
     residual_norm_collection[iter * stride_residual_norm_collection + col_idx] =
         register_cos * this_rnc;
-    residual_norm[col_idx] = std::abs(next_rnc);
+    residual_norm[col_idx] = gko::abs(next_rnc);
     residual_norm_collection[(iter + 1) * stride_residual_norm_collection +
                              col_idx] = next_rnc;
 }
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index d59ada362f9..29cdd70cd64 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -8,6 +8,7 @@
 
 #include <ctime>
 #include <random>
+#include <type_traits>
 
 #include <CL/sycl.hpp>
 
@@ -127,7 +128,7 @@ void orthonormalize_subspace_vectors_kernel(
                const remove_complex<ValueType>& b) { return a + b; });
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
-        norm = std::sqrt(reduction_helper_real[0]);
+        norm = gko::sqrt(reduction_helper_real[0]);
         for (size_type j = tidx; j < num_cols; j += block_size) {
             values[row * stride + j] /= norm;
         }
@@ -542,8 +543,12 @@ void compute_omega_kernel(
     if (!stop_status[global_id].has_stopped()) {
         auto thr = omega[global_id];
         omega[global_id] /= tht[global_id];
-        auto absrho = std::abs(
-            thr / (std::sqrt(real(tht[global_id])) * residual_norm[global_id]));
+        const auto normt = sqrt(real(tht[global_id]));
+        if (normt == zero<remove_complex<ValueType>>()) {
+            omega[global_id] = zero<ValueType>();
+            return;
+        }
+        auto absrho = gko::abs(thr / (normt * residual_norm[global_id]));
 
         if (absrho < kappa) {
             omega[global_id] *= kappa / absrho;
@@ -594,18 +599,20 @@ void initialize_subspace_vectors(std::shared_ptr<const DpcppExecutor> exec,
 {
     if (!deterministic) {
         auto seed = std::random_device{}();
-        auto work = reinterpret_cast<remove_complex<ValueType>*>(
-            subspace_vectors->get_values());
+        using real_type = remove_complex<ValueType>;
+        auto work =
+            reinterpret_cast<real_type*>(subspace_vectors->get_values());
         auto n =
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride();
+        using rand_type = std::conditional_t<std::is_same_v<real_type, half>,
+                                             float, real_type>;
         n = is_complex<ValueType>() ? 2 * n : n;
         exec->get_queue()->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) {
                 std::uint64_t offset = idx.get_linear_id();
                 oneapi::dpl::minstd_rand engine(seed, offset);
-                oneapi::dpl::normal_distribution<remove_complex<ValueType>>
-                    distr(0, 1);
-                auto res = distr(engine);
+                oneapi::dpl::normal_distribution<rand_type> distr(0, 1);
+                auto res = static_cast<real_type>(distr(engine));
 
                 work[idx] = res;
             });
@@ -761,7 +768,8 @@ void initialize(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     orthonormalize_subspace_vectors(exec, subspace_vectors);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -787,7 +795,7 @@ void step_1(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
                   stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -812,7 +820,7 @@ void step_2(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
                   stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -829,7 +837,7 @@ void step_3(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -846,7 +854,8 @@ void compute_omega(
                          stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 7cd76b9d320..76a7f4e79ce 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -29,6 +29,17 @@ namespace hip {
  * @ingroup hiprand
  */
 namespace hiprand {
+namespace detail {
+
+
+template <typename... Args>
+inline int64 not_implemented(Args...)
+{
+    return static_cast<int64>(HIPRAND_STATUS_TYPE_ERROR);
+}
+
+
+}  // namespace detail
 
 
 template <typename ValueType>
@@ -83,6 +94,8 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(double, hiprandGenerateNormalDouble);
 GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex<float>, hiprandGenerateNormal);
 GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex<double>,
                                hiprandGenerateNormalDouble);
+template <typename ValueType>
+GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, detail::not_implemented);
 
 
 #undef GKO_BIND_HIPRAND_RANDOM_VECTOR
diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp
index a93002e4833..eb0eb1074e5 100644
--- a/omp/solver/idr_kernels.cpp
+++ b/omp/solver/idr_kernels.cpp
@@ -93,7 +93,7 @@ template <typename ValueType, typename Distribution, typename Generator>
 typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(Distribution&& dist, Generator&& gen)
 {
-    return dist(gen);
+    return static_cast<ValueType>(dist(gen));
 }
 
 
@@ -101,7 +101,9 @@ template <typename ValueType, typename Distribution, typename Generator>
 typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(Distribution&& dist, Generator&& gen)
 {
-    return ValueType(dist(gen), dist(gen));
+    using real_value_type = remove_complex<ValueType>;
+    return ValueType(get_rand_value<real_value_type>(dist, gen),
+                     get_rand_value<real_value_type>(dist, gen));
 }
 
 
@@ -135,7 +137,7 @@ void initialize(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     // Initialize and Orthonormalize P
     const auto num_rows = subspace_vectors->get_size()[0];
     const auto num_cols = subspace_vectors->get_size()[1];
-    auto dist = std::normal_distribution<remove_complex<ValueType>>(0.0, 1.0);
+    auto dist = std::normal_distribution<>(0.0, 1.0);
     auto seed = std::random_device{}();
     auto gen = std::default_random_engine(seed);
     for (size_type row = 0; row < num_rows; row++) {
@@ -182,7 +184,8 @@ void initialize(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -216,7 +219,7 @@ void step_1(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -242,7 +245,7 @@ void step_2(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -288,7 +291,7 @@ void step_3(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -306,6 +309,10 @@ void compute_omega(
 
         auto thr = omega->at(0, i);
         auto normt = sqrt(real(tht->at(0, i)));
+        if (normt == zero<remove_complex<ValueType>>()) {
+            omega->at(0, i) = 0;
+            continue;
+        }
         omega->at(0, i) /= tht->at(0, i);
         auto absrho = abs(thr / (normt * residual_norm->at(0, i)));
 
@@ -315,7 +322,8 @@ void compute_omega(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp
index dee2d30b8dc..511d4375ae5 100644
--- a/reference/solver/bicg_kernels.cpp
+++ b/reference/solver/bicg_kernels.cpp
@@ -46,7 +46,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -74,7 +75,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -102,7 +103,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp
index 31955a59c53..e762dc88533 100644
--- a/reference/solver/bicgstab_kernels.cpp
+++ b/reference/solver/bicgstab_kernels.cpp
@@ -57,7 +57,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -87,7 +88,8 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -115,7 +117,8 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -149,7 +152,8 @@ void step_3(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -169,7 +173,8 @@ void finalize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp
index 5af15692414..fe548b9a03a 100644
--- a/reference/solver/cg_kernels.cpp
+++ b/reference/solver/cg_kernels.cpp
@@ -42,7 +42,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -67,7 +67,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -93,7 +93,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp
index a5a5f8c5862..f2f2200b996 100644
--- a/reference/solver/cgs_kernels.cpp
+++ b/reference/solver/cgs_kernels.cpp
@@ -51,7 +51,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -83,7 +84,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -114,7 +115,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -135,7 +136,7 @@ void step_3(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp
index 4ba091e03ae..24c6135f0b1 100644
--- a/reference/solver/common_gmres_kernels.cpp
+++ b/reference/solver/common_gmres_kernels.cpp
@@ -132,7 +132,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -156,7 +157,7 @@ void hessenberg_qr(std::shared_ptr<const ReferenceExecutor> exec,
                                  residual_norm_collection, iter, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
 
 
@@ -186,7 +187,7 @@ void solve_krylov(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp
index 65b6bf27698..5ba997da941 100644
--- a/reference/solver/fcg_kernels.cpp
+++ b/reference/solver/fcg_kernels.cpp
@@ -43,7 +43,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -68,7 +69,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -96,7 +97,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
diff --git a/reference/solver/gcr_kernels.cpp b/reference/solver/gcr_kernels.cpp
index 531814c641e..d51728b15cf 100644
--- a/reference/solver/gcr_kernels.cpp
+++ b/reference/solver/gcr_kernels.cpp
@@ -37,7 +37,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GCR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -56,7 +57,7 @@ void restart(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -82,7 +83,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 
 }  // namespace gcr
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index a7f5a751a3b..6d5eaae1490 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -40,7 +40,7 @@ void restart(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -69,7 +69,8 @@ void multi_axpy(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
 template <typename ValueType>
 void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
@@ -91,7 +92,8 @@ void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 }  // namespace gmres
 }  // namespace reference
diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp
index 606def8a18b..27315da3565 100644
--- a/reference/solver/idr_kernels.cpp
+++ b/reference/solver/idr_kernels.cpp
@@ -86,7 +86,7 @@ template <typename ValueType, typename Distribution, typename Generator>
 typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(Distribution&& dist, Generator&& gen)
 {
-    return dist(gen);
+    return static_cast<ValueType>(dist(gen));
 }
 
 
@@ -94,7 +94,9 @@ template <typename ValueType, typename Distribution, typename Generator>
 typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
 get_rand_value(Distribution&& dist, Generator&& gen)
 {
-    return ValueType(dist(gen), dist(gen));
+    using real_value_type = remove_complex<ValueType>;
+    return ValueType(get_rand_value<real_value_type>(dist, gen),
+                     get_rand_value<real_value_type>(dist, gen));
 }
 
 
@@ -122,7 +124,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     // Initialize and Orthonormalize P
     const auto num_rows = subspace_vectors->get_size()[0];
     const auto num_cols = subspace_vectors->get_size()[1];
-    auto dist = std::normal_distribution<remove_complex<ValueType>>(0.0, 1.0);
+    auto dist = std::normal_distribution<>(0.0, 1.0);
     auto seed = std::random_device{}();
     auto gen = std::default_random_engine(seed);
     for (size_type row = 0; row < num_rows; row++) {
@@ -158,7 +160,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -188,7 +191,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -213,7 +216,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -256,7 +259,7 @@ void step_3(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -275,14 +278,17 @@ void compute_omega(
         auto normt = sqrt(real(tht->at(0, i)));
         omega->at(0, i) /= tht->at(0, i);
         auto absrho = abs(thr / (normt * residual_norm->at(0, i)));
-
         if (absrho < kappa) {
             omega->at(0, i) *= kappa / absrho;
         }
+        if (normt == zero<remove_complex<ValueType>>()) {
+            omega->at(0, i) = 0;
+        }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index fd24c52bcc8..13d81de0c7a 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -119,7 +119,7 @@ class Bicg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> bicg_factory_non_symmetric;
 };
 
-TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicg, KernelInitialize)
@@ -266,7 +266,8 @@ TYPED_TEST(Bicg, SolvesStencilSystem)
 
 TYPED_TEST(Bicg, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -302,8 +303,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -358,7 +359,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -400,7 +402,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->bicg_factory->generate(this->mtx);
@@ -446,6 +448,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->bicg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -463,6 +467,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->bicg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -480,6 +486,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystemImplicitResNormCrit)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->bicg_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -511,6 +519,8 @@ TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->bicg_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index f09e78137b3..ce17f25f47e 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -121,7 +121,8 @@ class Bicgstab : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> bicgstab_factory_precision;
 };
 
-TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Bicgstab, KernelInitialize)
@@ -383,7 +384,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystem)
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -419,8 +421,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex)
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -489,13 +491,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply)
 
     solver->apply(alpha, b, beta, x);
 
-    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 2 * r<value_type>::value);
 }
 
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -506,7 +509,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed)
     solver->apply(alpha, b, beta, x);
 
     GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}),
-                        (r_mixed<value_type, TypeParam>()));
+                        (2 * r_mixed<value_type, TypeParam>()));
 }
 
 
@@ -522,14 +525,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex)
         {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}},
         this->exec);
     auto x = gko::initialize<Mtx>(
-        {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}},
+        {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}},
         this->exec);
 
     solver->apply(alpha, b, beta, x);
 
     GKO_ASSERT_MTX_NEAR(x,
-                        l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0},
-                           value_type{6.0, -12.0}}),
+                        l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5},
+                           value_type{6.0, -15.0}}),
                         r<value_type>::value);
 }
 
@@ -537,7 +540,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->bicgstab_factory->generate(this->mtx);
@@ -547,14 +550,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
         {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}},
         this->exec);
     auto x = gko::initialize<Mtx>(
-        {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}},
+        {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}},
         this->exec);
 
     solver->apply(alpha, b, beta, x);
 
     GKO_ASSERT_MTX_NEAR(x,
-                        l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0},
-                           value_type{6.0, -12.0}}),
+                        l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5},
+                           value_type{6.0, -15.0}}),
                         (r_mixed<value_type, TypeParam>()));
 }
 
@@ -585,6 +588,9 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // beta reaches a huge value outside the half-precision range in the first
+    // part of the second iteration
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
@@ -613,6 +619,9 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // beta reaches a huge value outside the half-precision range in the first
+    // part of the second iteration
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
@@ -642,6 +651,9 @@ TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck)
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
     using T = value_type;
+    // beta reaches a huge value outside the half-precision range in the first
+    // part of the second iteration
+    SKIP_IF_HALF(value_type);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
                               {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0},
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index 7cbc629717c..fd708d736bc 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -107,7 +107,7 @@ class Cg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> cg_factory_big2;
 };
 
-TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Cg, KernelInitialize)
@@ -228,7 +228,8 @@ TYPED_TEST(Cg, SolvesStencilSystem)
 
 TYPED_TEST(Cg, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -264,8 +265,8 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Cg, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -320,7 +321,8 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -362,7 +364,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->cg_factory->generate(this->mtx);
@@ -408,6 +410,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -425,6 +429,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -442,6 +448,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem3)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -459,6 +467,8 @@ TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -527,6 +537,8 @@ TYPED_TEST(Cg, SolvesTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -544,6 +556,8 @@ TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->cg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index 9024623ade8..a06c087776c 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -121,7 +121,7 @@ class Cgs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> cgs_factory_big2;
 };
 
-TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Cgs, KernelInitialize)
@@ -293,7 +293,8 @@ TYPED_TEST(Cgs, SolvesDenseSystem)
 
 TYPED_TEST(Cgs, SolvesDenseSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -329,8 +330,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex)
 
 TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -386,7 +387,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply)
 
 TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -413,13 +415,13 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex)
         {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}},
         this->exec);
     auto x = gko::initialize<Mtx>(
-        {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}},
+        {value_type{-2.0, 4.0}, value_type{-0.5, 1.0}, value_type{2.0, -4.0}},
         this->exec);
 
     solver->apply(alpha, b, beta, x);
 
     GKO_ASSERT_MTX_NEAR(x,
-                        l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0},
+                        l({value_type{-6.0, 12.0}, value_type{-1.5, 3.0},
                            value_type{6.0, -12.0}}),
                         r<value_type>::value * 1e3);
 }
@@ -428,7 +430,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->cgs_factory->generate(this->mtx);
@@ -438,13 +440,14 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
         {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}},
         this->exec);
     auto x = gko::initialize<Mtx>(
-        {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}},
+        {value_type{-2.0, 4.0}, value_type{-0.5, 1.0}, value_type{2.0, -4.0}},
         this->exec);
 
+
     solver->apply(alpha, b, beta, x);
 
     GKO_ASSERT_MTX_NEAR(x,
-                        l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0},
+                        l({value_type{-6.0, 12.0}, value_type{-1.5, 3.0},
                            value_type{6.0, -12.0}}),
                         (r_mixed<value_type, TypeParam>()) * 1e3);
 }
@@ -475,6 +478,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
@@ -491,6 +496,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystemWithImplicitResNormCrit)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver = this->cgs_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec);
@@ -507,6 +514,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec);
@@ -523,6 +532,8 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystems)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver = this->cgs_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
@@ -589,6 +600,8 @@ TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose());
     auto b = gko::initialize<Mtx>(
         {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec);
@@ -605,6 +618,8 @@ TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // squared_norm of residual(=b) exceeds the range of half precision.
+    SKIP_IF_HALF(value_type);
     auto solver =
         this->cgs_factory_big->generate(this->mtx_big->conj_transpose());
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index 2b7b97ffc3b..88615921f34 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -112,7 +112,7 @@ class Fcg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> fcg_factory_big2;
 };
 
-TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Fcg, KernelInitialize)
@@ -242,7 +242,8 @@ TYPED_TEST(Fcg, SolvesStencilSystem)
 
 TYPED_TEST(Fcg, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -278,8 +279,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -334,7 +335,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -376,7 +378,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->fcg_factory->generate(this->mtx);
@@ -422,6 +424,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -439,6 +443,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystemWithImplicitResNormCrit)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -456,6 +462,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0},
@@ -473,6 +481,8 @@ TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -541,6 +551,8 @@ TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -558,6 +570,8 @@ TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->fcg_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index 7ca885cfab8..af8e74888d0 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -119,7 +119,7 @@ class Gcr : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> gcr_factory_big2;
 };
 
-TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Gcr, KernelInitialize)
@@ -225,7 +225,8 @@ TYPED_TEST(Gcr, SolvesStencilSystem)
 
 TYPED_TEST(Gcr, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
@@ -234,7 +235,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixed)
     solver->apply(b.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}),
-                        (r_mixed<value_type, TypeParam>()));
+                        (r_mixed<value_type, TypeParam>() * 1e1));
 }
 
 
@@ -256,14 +257,14 @@ TYPED_TEST(Gcr, SolvesStencilSystemComplex)
     GKO_ASSERT_MTX_NEAR(x,
                         l({value_type{1.0, -2.0}, value_type{3.0, -6.0},
                            value_type{2.0, -4.0}}),
-                        r<value_type>::value * 1e1);
+                        r<value_type>::value);
 }
 
 
 TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto b =
@@ -319,7 +320,8 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -330,7 +332,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed)
     solver->apply(alpha.get(), b.get(), beta.get(), x.get());
 
     GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}),
-                        (r_mixed<value_type, TypeParam>()) * 1e1);
+                        (r_mixed<value_type, TypeParam>() * 2e1));
 }
 
 
@@ -362,7 +364,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->gcr_factory->generate(this->mtx);
@@ -409,6 +411,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gcr_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
@@ -426,6 +430,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gcr_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
@@ -443,6 +449,8 @@ TYPED_TEST(Gcr, SolveWithImplicitResNormCritIsDisabled)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gcr_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
@@ -456,7 +464,7 @@ TYPED_TEST(Gcr, SolveWithImplicitResNormCritIsDisabled)
 template <typename T>
 gko::remove_complex<T> infNorm(gko::matrix::Dense<T>* mat, size_t col = 0)
 {
-    using std::abs;
+    using gko::abs;
     using no_cpx_t = gko::remove_complex<T>;
     no_cpx_t norm = 0.0;
     for (size_t i = 0; i < mat->get_size()[0]; ++i) {
@@ -471,6 +479,8 @@ TYPED_TEST(Gcr, SolvesMultipleDenseSystemForDivergenceCheck)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gcr_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -537,6 +547,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart)
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     auto gcr_factory_restart =
         Solver::build()
@@ -562,6 +574,8 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner)
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto gcr_factory_preconditioner =
         Solver::build()
             .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
@@ -588,6 +602,8 @@ TYPED_TEST(Gcr, SolvesTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gcr_factory_big->generate(this->mtx_big->transpose());
     auto b = gko::initialize<Mtx>(
         {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
@@ -605,6 +621,8 @@ TYPED_TEST(Gcr, SolvesConjTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver =
         this->gcr_factory_big->generate(this->mtx_big->conj_transpose());
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 3f11b087bb7..abecc6b2a79 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -136,7 +136,7 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> gmres_factory_big2;
 };
 
-TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Gmres, KernelInitialize)
@@ -434,7 +434,8 @@ TYPED_TEST(Gmres, SolvesStencilSystem)
 
 TYPED_TEST(Gmres, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
@@ -471,8 +472,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex)
 
 TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto b =
@@ -528,7 +529,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -571,7 +573,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->gmres_factory->generate(this->mtx);
@@ -618,6 +620,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
@@ -635,6 +639,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
@@ -652,6 +658,8 @@ TYPED_TEST(Gmres, SolveWithImplicitResNormCritIsDisabled)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gmres_factory_big2->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
         {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90},
@@ -666,6 +674,8 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gmres_factory_big->generate(this->mtx_big);
     auto b1 = gko::initialize<Mtx>(
         {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5},
@@ -732,6 +742,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart)
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     auto gmres_factory_restart =
         Solver::build()
@@ -759,6 +771,8 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner)
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     for (auto ortho :
          {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) {
         SCOPED_TRACE(ortho);
@@ -792,6 +806,8 @@ TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose());
     auto b = gko::initialize<Mtx>(
         {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15},
@@ -809,6 +825,8 @@ TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the system is already out of half precision range
+    SKIP_IF_HALF(value_type);
     auto solver =
         this->gmres_factory_big->generate(this->mtx_big->conj_transpose());
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp
index c3ca4fc1bd9..420a3f15684 100644
--- a/reference/test/solver/idr_kernels.cpp
+++ b/reference/test/solver/idr_kernels.cpp
@@ -2,10 +2,13 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <iostream>
+
 #include <gtest/gtest.h>
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/log/solver_progress.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/solver/idr.hpp>
 #include <ginkgo/core/stop/combined.hpp>
@@ -57,7 +60,7 @@ class Idr : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> idr_factory_precision;
 };
 
-TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Idr, SolvesDenseSystem)
@@ -76,7 +79,8 @@ TYPED_TEST(Idr, SolvesDenseSystem)
 
 TYPED_TEST(Idr, SolvesDenseSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using T = typename TestFixture::value_type;
+    using value_type = gko::next_precision_with_half<T>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -91,6 +95,7 @@ TYPED_TEST(Idr, SolvesDenseSystemMixed)
 
 TYPED_TEST(Idr, SolvesDenseSystemComplex)
 {
+    using T = typename TestFixture::value_type;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->idr_factory->generate(this->mtx);
@@ -112,8 +117,8 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex)
 
 TYPED_TEST(Idr, SolvesDenseSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using T = typename TestFixture::value_type;
+    using value_type = gko::to_complex<gko::next_precision_with_half<T>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -137,6 +142,7 @@ TYPED_TEST(Idr, SolvesDenseSystemWithComplexSubSpace)
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
     using Solver = typename TestFixture::Solver;
+    // intermediate value is too small to represent in half
     auto half_tol = std::sqrt(r<value_type>::value);
     auto solver_factory =
         Solver::build()
@@ -231,7 +237,8 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply)
 
 TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -273,7 +280,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->idr_factory->generate(this->mtx);
@@ -321,6 +328,9 @@ TYPED_TEST(Idr, SolvesBigDenseSystemForDivergenceCheck1)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the internal vector t becomes too large in the first run and then falls
+    // outside the half precision range.
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
@@ -357,6 +367,9 @@ TYPED_TEST(Idr, SolvesBigDenseSystemForDivergenceCheck2)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
+    // the internal vector t becomes too large in the first run and then falls
+    // outside the half precision range.
+    SKIP_IF_HALF(value_type);
     auto half_tol = std::sqrt(r<value_type>::value);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
@@ -386,6 +399,9 @@ TYPED_TEST(Idr, SolvesMultipleDenseSystemsDivergenceCheck)
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
     using T = value_type;
+    // the internal vector t becomes too large in the first run and then falls
+    // outside the half precision range.
+    SKIP_IF_HALF(value_type);
     std::shared_ptr<Mtx> locmtx =
         gko::initialize<Mtx>({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0},
                               {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0},
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index b0c1029f693..f329a16d932 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -47,7 +47,7 @@ class Ir : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> ir_factory;
 };
 
-TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Ir, KernelInitialize)
@@ -82,7 +82,8 @@ TYPED_TEST(Ir, SolvesTriangularSystem)
 
 TYPED_TEST(Ir, SolvesTriangularSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->ir_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
@@ -118,8 +119,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex)
 
 TYPED_TEST(Ir, SolvesTriangularSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->ir_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -244,7 +245,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->ir_factory->generate(this->mtx);
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp
index 022899d21e6..98eb295091b 100644
--- a/test/solver/cb_gmres_kernels.cpp
+++ b/test/solver/cb_gmres_kernels.cpp
@@ -146,7 +146,7 @@ class CbGmres : public CommonTestFixture {
         auto& krylov_bases = range_helper.get_bases();
         d_to_host = d_range_helper.get_bases();
         const auto tolerance = r<storage_type>::value;
-        using std::abs;
+        using gko::abs;
         for (gko::size_type i = 0; i < krylov_bases.get_size(); ++i) {
             const auto ref_value = krylov_bases.get_const_data()[i];
             const auto dev_value = d_to_host.get_const_data()[i];

From 30296aff601a076bcbf5e0985acfecfe753c0be4 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:58:01 +0200
Subject: [PATCH 375/448] solver config dispatch

---
 core/config/solver_config.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp
index b35a639b8e7..eb566986526 100644
--- a/core/config/solver_config.cpp
+++ b/core/config/solver_config.cpp
@@ -30,15 +30,15 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_TYPE(Cg, gko::solver::Cg);
-GKO_PARSE_VALUE_TYPE(Bicg, gko::solver::Bicg);
-GKO_PARSE_VALUE_TYPE(Bicgstab, gko::solver::Bicgstab);
-GKO_PARSE_VALUE_TYPE(Cgs, gko::solver::Cgs);
-GKO_PARSE_VALUE_TYPE(Fcg, gko::solver::Fcg);
-GKO_PARSE_VALUE_TYPE(Ir, gko::solver::Ir);
-GKO_PARSE_VALUE_TYPE(Idr, gko::solver::Idr);
-GKO_PARSE_VALUE_TYPE(Gcr, gko::solver::Gcr);
-GKO_PARSE_VALUE_TYPE(Gmres, gko::solver::Gmres);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Cg, gko::solver::Cg);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicg, gko::solver::Bicg);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicgstab, gko::solver::Bicgstab);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Cgs, gko::solver::Cgs);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Fcg, gko::solver::Fcg);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Ir, gko::solver::Ir);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Idr, gko::solver::Idr);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Gcr, gko::solver::Gcr);
+GKO_PARSE_VALUE_TYPE_WITH_HALF(Gmres, gko::solver::Gmres);
 GKO_PARSE_VALUE_TYPE(CbGmres, gko::solver::CbGmres);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Direct, gko::experimental::solver::Direct);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(LowerTrs, gko::solver::LowerTrs);

From 631956e9a5a20e5285689573f8dad9f4dd3213e7 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 16:11:20 +0100
Subject: [PATCH 376/448] cuda with CC<70 and hip do not support 16 bit atomic.
 throw error for idr

---
 common/cuda_hip/solver/idr_kernels.cpp | 34 +++++++++++++++++++-------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp
index 0dc310ebd2e..649d8a1769c 100644
--- a/common/cuda_hip/solver/idr_kernels.cpp
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -454,11 +454,19 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
         if (nrhs > 1 || is_complex<ValueType>()) {
             components::fill_array(exec, alpha->get_values(), nrhs,
                                    zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_values()), g_k->get_stride(),
-                as_device_type(alpha->get_values()),
-                stop_status->get_const_data());
+            // 16 bit atomics are not supported here; raise an error for half
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+            if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+                GKO_NOT_SUPPORTED(alpha);
+            } else
+#endif
+            {
+                multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                    size, nrhs, as_device_type(p_i),
+                    as_device_type(g_k->get_values()), g_k->get_stride(),
+                    as_device_type(alpha->get_values()),
+                    stop_status->get_const_data());
+            }
         } else {
             blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
                       g_k->get_stride(), alpha->get_values());
@@ -505,10 +513,18 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         auto m_i = m->get_values() + i * m_stride + k * nrhs;
         if (nrhs > 1 || is_complex<ValueType>()) {
             components::fill_array(exec, m_i, nrhs, zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_const_values()), g_k->get_stride(),
-                as_device_type(m_i), stop_status->get_const_data());
+            // 16 bit atomics are not supported here; raise an error for half
+#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+            if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+                GKO_NOT_SUPPORTED(m_i);
+            } else
+#endif
+            {
+                multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                    size, nrhs, as_device_type(p_i),
+                    as_device_type(g_k->get_const_values()), g_k->get_stride(),
+                    as_device_type(m_i), stop_status->get_const_data());
+            }
         } else {
             blas::dot(exec->get_blas_handle(), size, p_i, 1,
                       g_k->get_const_values(), g_k->get_stride(), m_i);

From 93cb4d69095ef613c14550ac813d0713417d2ea9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:47:50 +0200
Subject: [PATCH 377/448] triangular and direct solver

---
 .../cuda_hip/components/memory.nvidia.hpp.inc | 364 ++++++++++++++++++
 core/device_hooks/common_kernels.inc.cpp      |   8 +-
 core/solver/direct.cpp                        |   5 +-
 core/solver/lower_trs.cpp                     |   5 +-
 core/solver/upper_trs.cpp                     |   5 +-
 core/test/solver/direct.cpp                   |   3 +-
 core/test/solver/lower_trs.cpp                |   2 +-
 core/test/solver/upper_trs.cpp                |   2 +-
 cuda/solver/common_trs_kernels.cuh            |  15 +-
 cuda/solver/lower_trs_kernels.cu              |   4 +-
 cuda/solver/upper_trs_kernels.cu              |   4 +-
 dev_tools/scripts/generate_cuda_memory_ptx.py |  96 +++++
 dpcpp/solver/lower_trs_kernels.dp.cpp         |   4 +-
 dpcpp/solver/upper_trs_kernels.dp.cpp         |   4 +-
 hip/solver/lower_trs_kernels.hip.cpp          |   4 +-
 hip/solver/upper_trs_kernels.hip.cpp          |   4 +-
 omp/solver/lower_trs_kernels.cpp              |   4 +-
 omp/solver/upper_trs_kernels.cpp              |   4 +-
 reference/solver/lower_trs_kernels.cpp        |   4 +-
 reference/solver/upper_trs_kernels.cpp        |   4 +-
 reference/test/solver/direct.cpp              |   5 +-
 reference/test/solver/lower_trs.cpp           |   2 +-
 reference/test/solver/lower_trs_kernels.cpp   |  12 +-
 reference/test/solver/upper_trs.cpp           |   2 +-
 reference/test/solver/upper_trs_kernels.cpp   |  12 +-
 test/solver/direct.cpp                        |   6 +-
 26 files changed, 530 insertions(+), 54 deletions(-)

diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc
index a695904e82a..f39c600ce6c 100644
--- a/common/cuda_hip/components/memory.nvidia.hpp.inc
+++ b/common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -1031,3 +1031,367 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
         "d"(real_result), "d"(imag_result)
         : "memory");
 }
+
+
+__device__ __forceinline__ __half load_relaxed_shared(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.shared.b16 t, [%1];\n\t"
+#else
+        "  ld.relaxed.cta.shared.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr)))
+        : "memory");
+
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(__half* ptr, __half result)
+{
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.shared.b16 [%0], t;\n\t"
+#else
+        "  st.relaxed.cta.shared.b16 [%0], t;\n\t"
+#endif
+        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ __half load_acquire_shared(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.shared.b16 t, [%1];\n\t"
+#else
+        "  ld.acquire.cta.shared.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr)))
+        : "memory");
+    membar_acq_rel_shared();
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_release_shared(__half* ptr, __half result)
+{
+    membar_acq_rel_shared();
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.shared.b16 [%0], t;\n\t"
+#else
+        "  st.release.cta.shared.b16 [%0], t;\n\t"
+#endif
+        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ __half load_relaxed_local(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.b16 t, [%1];\n\t"
+#else
+        "  ld.relaxed.cta.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "l"(const_cast<__half*>(ptr))
+        : "memory");
+
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result)
+{
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.b16 [%0], t;\n\t"
+#else
+        "  st.relaxed.cta.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ __half load_acquire_local(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.b16 t, [%1];\n\t"
+#else
+        "  ld.acquire.cta.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "l"(const_cast<__half*>(ptr))
+        : "memory");
+    membar_acq_rel_local();
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_release_local(__half* ptr, __half result)
+{
+    membar_acq_rel_local();
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.b16 [%0], t;\n\t"
+#else
+        "  st.release.cta.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ __half load_relaxed(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.b16 t, [%1];\n\t"
+#else
+        "  ld.relaxed.gpu.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "l"(const_cast<__half*>(ptr))
+        : "memory");
+
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_relaxed(__half* ptr, __half result)
+{
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.b16 [%0], t;\n\t"
+#else
+        "  st.relaxed.gpu.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ __half load_acquire(const __half* ptr)
+{
+    float result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  ld.volatile.b16 t, [%1];\n\t"
+#else
+        "  ld.acquire.gpu.b16 t, [%1];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t;\n\t"
+        "}"
+        : "=f"(result)
+        : "l"(const_cast<__half*>(ptr))
+        : "memory");
+    membar_acq_rel();
+    return static_cast<__half>(result);
+}
+
+
+__device__ __forceinline__ void store_release(__half* ptr, __half result)
+{
+    membar_acq_rel();
+    asm volatile(
+        "{\n\t"
+        "  .reg .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t, %1;\n\t"
+#if __CUDA_ARCH__ < 700
+        "  st.volatile.b16 [%0], t;\n\t"
+#else
+        "  st.release.gpu.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(static_cast<float>(result))
+        : "memory");
+}
+
+
+__device__ __forceinline__ thrust::complex<__half> load_relaxed_shared(
+    const thrust::complex<__half>* ptr)
+{
+    float real_result;
+    float imag_result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "ld.volatile.shared.v2.b16 {t.x, t.y}, [%2];\n\t"
+#else
+        "ld.relaxed.cta.shared.v2.b16 {t.x, t.y}, [%2];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t.x;\n\t"
+        "  cvt.f32.f16 %1, t.y;\n\t"
+        "}"
+        : "=f"(real_result), "=f"(imag_result)
+        : "r"(convert_generic_ptr_to_smem_ptr(
+            const_cast<thrust::complex<__half>*>(ptr)))
+        : "memory");
+    return thrust::complex<__half>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(
+    thrust::complex<__half>* ptr, thrust::complex<__half> result)
+{
+    auto real_result = static_cast<float>(result.real());
+    auto imag_result = static_cast<float>(result.imag());
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t.x, %1;\n\t"
+        "  cvt.rn.f16.f32 t.y, %2;\n\t"
+#if __CUDA_ARCH__ < 700
+        "st.volatile.shared.v2.b16 [%0], t;\n\t"
+#else
+        "st.relaxed.cta.shared.v2.b16 [%0], t;\n\t"
+#endif
+        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
+}
+
+
+__device__ __forceinline__ thrust::complex<__half> load_relaxed_local(
+    const thrust::complex<__half>* ptr)
+{
+    float real_result;
+    float imag_result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t"
+#else
+        "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t.x;\n\t"
+        "  cvt.f32.f16 %1, t.y;\n\t"
+        "}"
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<__half>*>(ptr))
+        : "memory");
+    return thrust::complex<__half>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed_local(
+    thrust::complex<__half>* ptr, thrust::complex<__half> result)
+{
+    auto real_result = static_cast<float>(result.real());
+    auto imag_result = static_cast<float>(result.imag());
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t.x, %1;\n\t"
+        "  cvt.rn.f16.f32 t.y, %2;\n\t"
+#if __CUDA_ARCH__ < 700
+        "st.volatile.v2.b16 [%0], t;\n\t"
+#else
+        "st.relaxed.cta.v2.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
+}
+
+
+__device__ __forceinline__ thrust::complex<__half> load_relaxed(
+    const thrust::complex<__half>* ptr)
+{
+    float real_result;
+    float imag_result;
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+#if __CUDA_ARCH__ < 700
+        "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t"
+#else
+        "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t"
+#endif
+        "  cvt.f32.f16 %0, t.x;\n\t"
+        "  cvt.f32.f16 %1, t.y;\n\t"
+        "}"
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<__half>*>(ptr))
+        : "memory");
+    return thrust::complex<__half>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr,
+                                              thrust::complex<__half> result)
+{
+    auto real_result = static_cast<float>(result.real());
+    auto imag_result = static_cast<float>(result.imag());
+    asm volatile(
+        "{\n\t"
+        "  .reg .v2 .f16 t;\n\t"
+        "  cvt.rn.f16.f32 t.x, %1;\n\t"
+        "  cvt.rn.f16.f32 t.y, %2;\n\t"
+#if __CUDA_ARCH__ < 700
+        "st.volatile.v2.b16 [%0], t;\n\t"
+#else
+        "st.relaxed.gpu.v2.b16 [%0], t;\n\t"
+#endif
+        "}" ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
+}
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 1c57ca45177..f37166613b7 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -572,8 +572,8 @@ namespace lower_trs {
 
 
 GKO_STUB(GKO_DECLARE_LOWER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
 }  // namespace lower_trs
@@ -583,8 +583,8 @@ namespace upper_trs {
 
 
 GKO_STUB(GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
 }  // namespace upper_trs
diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp
index cf15bc4a9ae..69c2f9512dd 100644
--- a/core/solver/direct.cpp
+++ b/core/solver/direct.cpp
@@ -221,7 +221,7 @@ void Direct<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 #define GKO_DECLARE_DIRECT(ValueType, IndexType) \
     class Direct<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DIRECT);
 
 
 }  // namespace solver
@@ -283,7 +283,8 @@ std::vector<int> workspace_traits<gko::experimental::solver::Direct<
     struct workspace_traits<                            \
         gko::experimental::solver::Direct<ValueType, IndexType>>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_DIRECT_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp
index 3048c877dbd..da16061db03 100644
--- a/core/solver/lower_trs.cpp
+++ b/core/solver/lower_trs.cpp
@@ -248,8 +248,9 @@ std::vector<int> workspace_traits<LowerTrs<ValueType, IndexType>>::vectors(
 #define GKO_DECLARE_LOWER_TRS(_vtype, _itype) class LowerTrs<_vtype, _itype>
 #define GKO_DECLARE_LOWER_TRS_TRAITS(_vtype, _itype) \
     struct workspace_traits<LowerTrs<_vtype, _itype>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LOWER_TRS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp
index c759c119647..5e1dfb23df2 100644
--- a/core/solver/upper_trs.cpp
+++ b/core/solver/upper_trs.cpp
@@ -248,8 +248,9 @@ std::vector<int> workspace_traits<UpperTrs<ValueType, IndexType>>::vectors(
 #define GKO_DECLARE_UPPER_TRS(_vtype, _itype) class UpperTrs<_vtype, _itype>
 #define GKO_DECLARE_UPPER_TRS_TRAITS(_vtype, _itype) \
     struct workspace_traits<UpperTrs<_vtype, _itype>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_UPPER_TRS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp
index d895892a8be..43acdd0bdf1 100644
--- a/core/test/solver/direct.cpp
+++ b/core/test/solver/direct.cpp
@@ -35,7 +35,8 @@ class Direct : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> factory;
 };
 
-TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Direct, FactoryKnowsItsExecutor)
diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp
index dfcb564ca12..ae07e08c3f7 100644
--- a/core/test/solver/lower_trs.cpp
+++ b/core/test/solver/lower_trs.cpp
@@ -33,7 +33,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> lower_trs_factory;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp
index 2e84cb81e10..bc53d1a193c 100644
--- a/core/test/solver/upper_trs.cpp
+++ b/core/test/solver/upper_trs.cpp
@@ -33,7 +33,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> upper_trs_factory;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 291c842325f..66643c0aa9f 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -212,12 +212,16 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
 
         size_type work_size{};
 
+        // nullptr has type nullptr_t and is not converted automatically to
+        // the pointer type in the function signature. Explicitly cast it to
+        // `const ValueType*` to prevent compiler issues with gnu/llvm 9
         sparselib::buffer_size_ext(
             handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
             SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
-            matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
+            matrix->get_const_col_idxs(),
+            static_cast<const ValueType*>(nullptr), num_rhs, solve_info, policy,
             &work_size);
 
         // allocate workspace
@@ -228,7 +232,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
             SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
-            matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
+            matrix->get_const_col_idxs(),
+            static_cast<const ValueType*>(nullptr), num_rhs, solve_info, policy,
             work.get_data());
     }
 
@@ -357,6 +362,10 @@ struct float_to_unsigned_impl<float> {
     using type = uint32;
 };
 
+template <>
+struct float_to_unsigned_impl<__half> {
+    using type = uint16;
+};
 
 /**
  * Checks if a floating point number representation matches the representation
@@ -503,7 +512,7 @@ __global__ void sptrsv_naive_legacy_kernel(
     const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
     const int row_step = is_upper ? -1 : 1;
 
-    ValueType sum = 0.0;
+    ValueType sum = zero<ValueType>();
     auto j = row_begin;
     auto col = colidxs[j];
     while (j != row_end) {
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index b37f6536b0f..7832cf9e4c5 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -50,7 +50,7 @@ void generate(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index eb7d8386083..b6828bc0c92 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -50,7 +50,7 @@ void generate(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
index 49f99d4d96f..a408f1bb879 100755
--- a/dev_tools/scripts/generate_cuda_memory_ptx.py
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -191,3 +191,99 @@ class type_desc:
         : "memory");
 }}
 """)
+
+# since there is no inline-asm constraint for f16 registers, an intermediate conversion via float is needed
+t = type_desc(ptx_type_suffix='.f16', val_constraint='f', name='__half')
+t.parent_name = "float"
+t.ptx_parent_type_suffix = '.f32'
+t.ptx_mem_type_suffix = '.b16'
+for s in memory_spaces:
+    for o in memory_orderings:
+        membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();"
+        const_ptr_expr = s.ptr_expr.format(
+            ptr=f"const_cast<{t.name}*>(ptr)")
+        mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
+        print(f"""
+__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
+{{
+    {t.parent_name} result;
+    asm volatile("{{\\n\\t"
+        "  .reg {t.ptx_type_suffix} t;\\n\\t"
+    #if __CUDA_ARCH__ < 700
+        "  ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t"
+    #else
+        "  ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t"
+    #endif
+        "  cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t"
+        "}}"
+        : "={t.val_constraint}"(result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
+    {membar_expression}
+    return static_cast<{t.name}>(result);
+}}
+
+
+__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result)
+{{
+    {membar_expression}
+    asm volatile("{{\\n\\t"
+        "  .reg {t.ptx_type_suffix} t;\\n\\t"
+        "  cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t"
+    #if __CUDA_ARCH__ < 700
+        "  st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t"
+    #else
+        "  st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t"
+    #endif
+        "}}"
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result))
+        : "memory");
+}}
+""")
+
+for s in memory_spaces:
+    o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed",
+                 ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True)
+    const_ptr_expr = s.ptr_expr.format(
+        ptr=f"const_cast<thrust::complex<{t.name}>*>(ptr)")
+    mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
+    print(f"""
+__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr)
+{{
+    {t.parent_name} real_result;
+    {t.parent_name} imag_result;
+    asm volatile("{{\\n\\t"
+        "  .reg .v2 {t.ptx_type_suffix} t;\\n\\t"
+#if __CUDA_ARCH__ < 700
+        "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t"
+#else
+        "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t"
+#endif
+        "  cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t"
+        "  cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t"
+        "}}"
+        : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
+    return thrust::complex<{t.name}>{{real_result, imag_result}};
+}}
+
+
+__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result)
+{{
+    auto real_result = static_cast<{t.parent_name}>(result.real());
+    auto imag_result = static_cast<{t.parent_name}>(result.imag());
+    asm volatile("{{\\n\\t"
+        "  .reg .v2 {t.ptx_type_suffix} t;\\n\\t"
+        "  cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t"
+        "  cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t"
+#if __CUDA_ARCH__ < 700
+        "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t"
+#else
+        "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t"
+#endif
+        "}}"
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+        : "memory");
+}}
+""")
diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp
index 449bfe5cfcf..62cfe93a59d 100644
--- a/dpcpp/solver/lower_trs_kernels.dp.cpp
+++ b/dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -42,7 +42,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
               bool unit_diag, const solver::trisolve_algorithm algorithm,
               const size_type num_rhs) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void solve(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType>* b,
            matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp
index 7ac4950fe82..49e0a931e74 100644
--- a/dpcpp/solver/upper_trs_kernels.dp.cpp
+++ b/dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -42,7 +42,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
               bool unit_diag, const solver::trisolve_algorithm algorithm,
               const size_type num_rhs) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void solve(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType>* b,
            matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 5eab76ed5fa..6858f1eddc0 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -54,7 +54,7 @@ void generate(std::shared_ptr<const HipExecutor> exec,
                                           false, unit_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const HipExecutor> exec,
                                        trans_x, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index fb480d9b22d..f1398faeea4 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -54,7 +54,7 @@ void generate(std::shared_ptr<const HipExecutor> exec,
                                           true, unit_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const HipExecutor> exec,
                                        trans_x, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp
index 6dac6b46078..c873e5e8958 100644
--- a/omp/solver/lower_trs_kernels.cpp
+++ b/omp/solver/lower_trs_kernels.cpp
@@ -47,7 +47,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -88,7 +88,7 @@ void solve(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp
index ea05cabeb63..5014f823d35 100644
--- a/omp/solver/upper_trs_kernels.cpp
+++ b/omp/solver/upper_trs_kernels.cpp
@@ -47,7 +47,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -90,7 +90,7 @@ void solve(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp
index ba02c9c838c..49e3829d9af 100644
--- a/reference/solver/lower_trs_kernels.cpp
+++ b/reference/solver/lower_trs_kernels.cpp
@@ -44,7 +44,7 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -88,7 +88,7 @@ void solve(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp
index f0c23a9c4cc..b1d045eeadb 100644
--- a/reference/solver/upper_trs_kernels.cpp
+++ b/reference/solver/upper_trs_kernels.cpp
@@ -44,7 +44,7 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -90,7 +90,7 @@ void solve(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp
index 1fb147a7a2b..e421811382f 100644
--- a/reference/test/solver/direct.cpp
+++ b/reference/test/solver/direct.cpp
@@ -49,7 +49,7 @@ class Direct : public ::testing::Test {
                                 symmetric))
                 .on(exec);
         solver = factory->generate(mtx);
-        std::normal_distribution<gko::remove_complex<value_type>> dist(0, 1);
+        std::normal_distribution<> dist(0, 1);
         x = gko::test::generate_random_dense_matrix<value_type>(
             mtx->get_size()[0], nrhs, dist, rng, this->exec);
         x_ref = x->clone();
@@ -66,7 +66,8 @@ class Direct : public ::testing::Test {
     std::unique_ptr<solver_type> solver;
 };
 
-TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Direct, SolvesAni1SingleRhs)
diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp
index d52ee028b53..fd6fe1e4b16 100644
--- a/reference/test/solver/lower_trs.cpp
+++ b/reference/test/solver/lower_trs.cpp
@@ -45,7 +45,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<Solver> solver;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index 3680f19681f..6d54efd2913 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -75,7 +75,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> lower_trs_factory_unit;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -108,7 +108,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision<other_value_type>;
+    using value_type = gko::next_precision_with_half<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
@@ -146,7 +146,8 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
+    using Scalar =
+        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
@@ -217,7 +218,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision<other_value_type>;
+    using value_type = gko::next_precision_with_half<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
     auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
@@ -259,7 +260,8 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
+    using Scalar =
+        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto alpha = gko::initialize<Scalar>({2.0}, this->exec);
diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp
index 9980c51f9d1..b59744a0e8c 100644
--- a/reference/test/solver/upper_trs.cpp
+++ b/reference/test/solver/upper_trs.cpp
@@ -45,7 +45,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<Solver> upper_trs_solver;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index a60f3b46079..870542593ff 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -75,7 +75,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> upper_trs_factory_unit;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -108,7 +108,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision<other_value_type>;
+    using value_type = gko::next_precision_with_half<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
@@ -146,7 +146,8 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
+    using Scalar =
+        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
@@ -218,7 +219,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision<other_value_type>;
+    using value_type = gko::next_precision_with_half<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
     auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
@@ -260,7 +261,8 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
+    using Scalar =
+        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto alpha = gko::initialize<Scalar>({2.0}, this->exec);
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index da77682bcdd..1ee95806c37 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -51,9 +51,7 @@ class Direct : public CommonTestFixture {
         return gko::test::generate_random_matrix<vector_type>(
             num_rows, num_cols,
             std::uniform_int_distribution<>(num_cols, num_cols),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
     void initialize_data(const char* mtx_filename, int nrhs)
@@ -102,7 +100,7 @@ class Direct : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_OMP
-using Types = gko::test::ValueIndexTypes;
+using Types = gko::test::ValueIndexTypesWithHalf;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken

From 2acd30993768ba2396cc3be3865dea84d89cac70 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 4 Nov 2024 10:21:40 +0100
Subject: [PATCH 378/448] workaround for half precision of load/store by using
 single precision in shared memory

---
 cuda/solver/common_trs_kernels.cuh | 32 ++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 66643c0aa9f..4058112a44b 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -213,7 +213,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         size_type work_size{};
 
         // nullptr is considered nullptr_t not casted to the function signature
-        // automatically Explicitly cast `nullptr` to `const ValueType*` to
+        // automatically. Explicitly cast `nullptr` to `const ValueType*` to
         // prevent compiler issues with gnu/llvm 9
         sparselib::buffer_size_ext(
             handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
@@ -406,7 +406,16 @@ __global__ void sptrsv_naive_caching_kernel(
     const size_type nrhs, bool unit_diag, bool* nan_produced,
     IndexType* atomic_counter)
 {
-    __shared__ uninitialized_array<ValueType, default_block_size> x_s_array;
+    // TODO: needs investigation.
+    // Memory operations on half-precision shared memory seem to give wrong
+    // results, so we use single precision in shared memory instead.
+    using SharedValueType = std::conditional_t<
+        std::is_same<remove_complex<ValueType>, __half>::value,
+        std::conditional_t<is_complex<ValueType>(), thrust::complex<float>,
+                           float>,
+        ValueType>;
+    __shared__ uninitialized_array<SharedValueType, default_block_size>
+        x_s_array;
     __shared__ IndexType block_base_idx;
 
     if (threadIdx.x == 0) {
@@ -426,8 +435,8 @@ __global__ void sptrsv_naive_caching_kernel(
     const auto self_shmem_id = full_gid / default_block_size;
     const auto self_shid = full_gid % default_block_size;
 
-    ValueType* x_s = x_s_array;
-    x_s[self_shid] = nan<ValueType>();
+    SharedValueType* x_s = x_s_array;
+    x_s[self_shid] = nan<SharedValueType>();
 
     __syncthreads();
 
@@ -439,20 +448,19 @@ __global__ void sptrsv_naive_caching_kernel(
     const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
     const int row_step = is_upper ? -1 : 1;
 
-    auto sum = zero<ValueType>();
+    auto sum = zero<SharedValueType>();
     auto i = row_begin;
     for (; i != row_end; i += row_step) {
         const auto dependency = colidxs[i];
         if (is_upper ? dependency <= row : dependency >= row) {
             break;
         }
-        auto x_p = &x[dependency * x_stride + rhs];
 
         const auto dependency_gid = is_upper ? (n - 1 - dependency) * nrhs + rhs
                                              : dependency * nrhs + rhs;
         const bool shmem_possible =
             (dependency_gid / default_block_size) == self_shmem_id;
-        ValueType val{};
+        SharedValueType val{};
         if (shmem_possible) {
             const auto dependency_shid = dependency_gid % default_block_size;
             while (is_nan_exact(
@@ -464,15 +472,17 @@ __global__ void sptrsv_naive_caching_kernel(
             }
         }
 
-        sum += val * vals[i];
+        sum += val * static_cast<SharedValueType>(vals[i]);
     }
 
     // The first entry past the triangular part will be the diagonal
-    const auto diag = unit_diag ? one<ValueType>() : vals[i];
-    const auto r = (b[row * b_stride + rhs] - sum) / diag;
+    const auto diag = unit_diag ? one<SharedValueType>()
+                                : static_cast<SharedValueType>(vals[i]);
+    const auto r =
+        (static_cast<SharedValueType>(b[row * b_stride + rhs]) - sum) / diag;
 
     store_relaxed_shared(x_s + self_shid, r);
-    store_relaxed(x + row * x_stride + rhs, r);
+    store_relaxed(x + row * x_stride + rhs, static_cast<ValueType>(r));
 
     // This check to ensure no infinite loops happen.
     if (is_nan_exact(r)) {

From 95a3f21bacf01ee391998cf2a117d78308c055f2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 18 Nov 2024 12:14:07 +0100
Subject: [PATCH 379/448] delete the current unusable half memory op on shared
 memory

---
 .../cuda_hip/components/memory.nvidia.hpp.inc | 122 ------------------
 dev_tools/scripts/generate_cuda_memory_ptx.py |   6 +-
 2 files changed, 4 insertions(+), 124 deletions(-)

diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc
index f39c600ce6c..f759c613f45 100644
--- a/common/cuda_hip/components/memory.nvidia.hpp.inc
+++ b/common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -1033,83 +1033,6 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
 }
 
 
-__device__ __forceinline__ __half load_relaxed_shared(const __half* ptr)
-{
-    float result;
-    asm volatile(
-        "{\n\t"
-        "  .reg .f16 t;\n\t"
-#if __CUDA_ARCH__ < 700
-        "  ld.volatile.shared.b16 t, [%1];\n\t"
-#else
-        "  ld.relaxed.cta.shared.b16 t, [%1];\n\t"
-#endif
-        "  cvt.f32.f16 %0, t;\n\t"
-        "}"
-        : "=f"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr)))
-        : "memory");
-
-    return static_cast<__half>(result);
-}
-
-
-__device__ __forceinline__ void store_relaxed_shared(__half* ptr, __half result)
-{
-    asm volatile(
-        "{\n\t"
-        "  .reg .f16 t;\n\t"
-        "  cvt.rn.f16.f32 t, %1;\n\t"
-#if __CUDA_ARCH__ < 700
-        "  st.volatile.shared.b16 [%0], t;\n\t"
-#else
-        "  st.relaxed.cta.shared.b16 [%0], t;\n\t"
-#endif
-        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
-        "f"(static_cast<float>(result))
-        : "memory");
-}
-
-
-__device__ __forceinline__ __half load_acquire_shared(const __half* ptr)
-{
-    float result;
-    asm volatile(
-        "{\n\t"
-        "  .reg .f16 t;\n\t"
-#if __CUDA_ARCH__ < 700
-        "  ld.volatile.shared.b16 t, [%1];\n\t"
-#else
-        "  ld.acquire.cta.shared.b16 t, [%1];\n\t"
-#endif
-        "  cvt.f32.f16 %0, t;\n\t"
-        "}"
-        : "=f"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr)))
-        : "memory");
-    membar_acq_rel_shared();
-    return static_cast<__half>(result);
-}
-
-
-__device__ __forceinline__ void store_release_shared(__half* ptr, __half result)
-{
-    membar_acq_rel_shared();
-    asm volatile(
-        "{\n\t"
-        "  .reg .f16 t;\n\t"
-        "  cvt.rn.f16.f32 t, %1;\n\t"
-#if __CUDA_ARCH__ < 700
-        "  st.volatile.shared.b16 [%0], t;\n\t"
-#else
-        "  st.release.cta.shared.b16 [%0], t;\n\t"
-#endif
-        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
-        "f"(static_cast<float>(result))
-        : "memory");
-}
-
-
 __device__ __forceinline__ __half load_relaxed_local(const __half* ptr)
 {
     float result;
@@ -1264,51 +1187,6 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result)
 }
 
 
-__device__ __forceinline__ thrust::complex<__half> load_relaxed_shared(
-    const thrust::complex<__half>* ptr)
-{
-    float real_result;
-    float imag_result;
-    asm volatile(
-        "{\n\t"
-        "  .reg .v2 .f16 t;\n\t"
-#if __CUDA_ARCH__ < 700
-        "ld.volatile.shared.v2.b16 {t.x, t.y}, [%2];\n\t"
-#else
-        "ld.relaxed.cta.shared.v2.b16 {t.x, t.y}, [%2];\n\t"
-#endif
-        "  cvt.f32.f16 %0, t.x;\n\t"
-        "  cvt.f32.f16 %1, t.y;\n\t"
-        "}"
-        : "=f"(real_result), "=f"(imag_result)
-        : "r"(convert_generic_ptr_to_smem_ptr(
-            const_cast<thrust::complex<__half>*>(ptr)))
-        : "memory");
-    return thrust::complex<__half>{real_result, imag_result};
-}
-
-
-__device__ __forceinline__ void store_relaxed_shared(
-    thrust::complex<__half>* ptr, thrust::complex<__half> result)
-{
-    auto real_result = static_cast<float>(result.real());
-    auto imag_result = static_cast<float>(result.imag());
-    asm volatile(
-        "{\n\t"
-        "  .reg .v2 .f16 t;\n\t"
-        "  cvt.rn.f16.f32 t.x, %1;\n\t"
-        "  cvt.rn.f16.f32 t.y, %2;\n\t"
-#if __CUDA_ARCH__ < 700
-        "st.volatile.shared.v2.b16 [%0], t;\n\t"
-#else
-        "st.relaxed.cta.shared.v2.b16 [%0], t;\n\t"
-#endif
-        "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
-        "f"(real_result), "f"(imag_result)
-        : "memory");
-}
-
-
 __device__ __forceinline__ thrust::complex<__half> load_relaxed_local(
     const thrust::complex<__half>* ptr)
 {
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
index a408f1bb879..834c49dba46 100755
--- a/dev_tools/scripts/generate_cuda_memory_ptx.py
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -193,11 +193,13 @@ class type_desc:
 """)
 
 # since there are no constraints for f16 register an intermediate conversion needs to happen
+# There are some issues when using f16 on shared memory, so the shared-memory variants are currently disabled.
+memory_spaces_without_shared=memory_spaces[1:]
 t = type_desc(ptx_type_suffix='.f16', val_constraint='f', name='__half')
 t.parent_name = "float"
 t.ptx_parent_type_suffix = '.f32'
 t.ptx_mem_type_suffix = '.b16'
-for s in memory_spaces:
+for s in memory_spaces_without_shared:
     for o in memory_orderings:
         membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();"
         const_ptr_expr = s.ptr_expr.format(
@@ -241,7 +243,7 @@ class type_desc:
 }}
 """)
 
-for s in memory_spaces:
+for s in memory_spaces_without_shared:
     o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed",
                  ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True)
     const_ptr_expr = s.ptr_expr.format(

From d01ad04ea76d86f33b156efa06dd345d6cbe78c0 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:58:48 +0200
Subject: [PATCH 380/448] direct and tri config dispatch

---
 core/config/solver_config.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp
index eb566986526..04bf5f5fcd5 100644
--- a/core/config/solver_config.cpp
+++ b/core/config/solver_config.cpp
@@ -40,9 +40,10 @@ GKO_PARSE_VALUE_TYPE_WITH_HALF(Idr, gko::solver::Idr);
 GKO_PARSE_VALUE_TYPE_WITH_HALF(Gcr, gko::solver::Gcr);
 GKO_PARSE_VALUE_TYPE_WITH_HALF(Gmres, gko::solver::Gmres);
 GKO_PARSE_VALUE_TYPE(CbGmres, gko::solver::CbGmres);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Direct, gko::experimental::solver::Direct);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(LowerTrs, gko::solver::LowerTrs);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(UpperTrs, gko::solver::UpperTrs);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Direct,
+                                         gko::experimental::solver::Direct);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(LowerTrs, gko::solver::LowerTrs);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(UpperTrs, gko::solver::UpperTrs);
 
 
 template <>

From 7df26f0139cd702d4ab95930239f2629dc8a2cc9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 18:22:25 +0200
Subject: [PATCH 381/448] factorization

---
 .../factorization/cholesky_kernels.cpp        | 12 ++--
 .../factorization/factorization_kernels.cpp   | 10 ++--
 common/cuda_hip/factorization/ic_kernels.cpp  |  2 +-
 common/cuda_hip/factorization/ilu_kernels.cpp |  2 +-
 common/cuda_hip/factorization/lu_kernels.cpp  |  6 +-
 .../cuda_hip/factorization/par_ic_kernels.cpp | 29 +++++----
 .../factorization/par_ict_kernels.cpp         | 26 +++++---
 .../factorization/par_ilu_kernels.cpp         | 32 ++++++----
 .../par_ilut_approx_filter_kernels.cpp        |  2 +-
 .../factorization/par_ilut_filter_kernels.cpp |  2 +-
 .../factorization/par_ilut_select_common.cpp  | 16 +++--
 .../factorization/par_ilut_select_kernels.cpp | 19 ++++--
 .../factorization/par_ilut_select_kernels.hpp |  4 +-
 .../factorization/par_ilut_spgeam_kernels.cpp |  2 +-
 .../factorization/par_ilut_sweep_kernels.cpp  | 36 ++++++-----
 core/device_hooks/common_kernels.inc.cpp      | 60 +++++++++++--------
 core/factorization/cholesky.cpp               |  2 +-
 core/factorization/elimination_forest.cpp     |  3 +-
 core/factorization/factorization.cpp          |  3 +-
 core/factorization/ic.cpp                     |  2 +-
 core/factorization/ilu.cpp                    |  2 +-
 core/factorization/lu.cpp                     |  2 +-
 core/factorization/par_ic.cpp                 |  2 +-
 core/factorization/par_ict.cpp                |  2 +-
 core/factorization/par_ilu.cpp                |  2 +-
 core/factorization/par_ilut.cpp               |  2 +-
 core/factorization/symbolic.cpp               |  8 ++-
 .../test/factorization/elimination_forest.cpp |  2 +-
 core/test/factorization/par_ic.cpp            |  3 +-
 core/test/factorization/par_ict.cpp           |  3 +-
 core/test/factorization/par_ilu.cpp           |  3 +-
 core/test/factorization/par_ilut.cpp          |  2 +-
 .../factorization_kernels.dp.cpp              | 12 ++--
 dpcpp/factorization/par_ic_kernels.dp.cpp     |  8 +--
 dpcpp/factorization/par_ict_kernels.dp.cpp    |  6 +-
 .../par_ilut_filter_kernels.hpp.inc           |  4 +-
 .../par_ilut_select_kernels.hpp.inc           |  8 +--
 omp/factorization/cholesky_kernels.cpp        | 12 ++--
 omp/factorization/factorization_kernels.cpp   | 10 ++--
 omp/factorization/ic_kernels.cpp              |  2 +-
 omp/factorization/ilu_kernels.cpp             |  2 +-
 omp/factorization/lu_kernels.cpp              |  8 ++-
 omp/factorization/par_ic_kernels.cpp          |  4 +-
 omp/factorization/par_ict_kernels.cpp         |  4 +-
 omp/factorization/par_ilu_kernels.cpp         |  2 +-
 omp/factorization/par_ilut_kernels.cpp        | 17 ++++--
 reference/factorization/cholesky_kernels.cpp  | 12 ++--
 .../factorization/factorization_kernels.cpp   | 10 ++--
 reference/factorization/ic_kernels.cpp        |  2 +-
 reference/factorization/ilu_kernels.cpp       |  2 +-
 reference/factorization/lu_kernels.cpp        |  8 ++-
 reference/factorization/par_ic_kernels.cpp    |  4 +-
 reference/factorization/par_ict_kernels.cpp   |  4 +-
 reference/factorization/par_ilu_kernels.cpp   |  2 +-
 reference/factorization/par_ilut_kernels.cpp  | 17 ++++--
 .../test/factorization/cholesky_kernels.cpp   |  2 +-
 .../test/factorization/factorization.cpp      |  2 +-
 reference/test/factorization/ic_kernels.cpp   |  3 +-
 reference/test/factorization/ilu_kernels.cpp  |  3 +-
 reference/test/factorization/lu_kernels.cpp   | 11 ++--
 .../test/factorization/par_ic_kernels.cpp     |  3 +-
 .../test/factorization/par_ict_kernels.cpp    |  3 +-
 .../test/factorization/par_ilu_kernels.cpp    |  3 +-
 .../test/factorization/par_ilut_kernels.cpp   | 27 ++++++---
 test/factorization/lu_kernels.cpp             |  2 +-
 test/factorization/par_ic_kernels.cpp         |  8 ++-
 test/factorization/par_ict_kernels.cpp        | 13 ++--
 test/factorization/par_ilu_kernels.cpp        |  9 ++-
 test/factorization/par_ilut_kernels.cpp       | 53 +++++++++-------
 69 files changed, 364 insertions(+), 241 deletions(-)

diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
index 7ff1382d8c6..ef24bb47fe0 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.cpp
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -262,7 +262,7 @@ void symbolic_factorize(
             postorder, postorder_parent, out_row_ptrs, out_cols);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -321,7 +321,7 @@ void forest_from_factor(
     build_children_from_parents(exec, forest);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -355,7 +355,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                                transpose_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -390,7 +391,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -446,7 +448,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index f26ef668d34..8e8893df535 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -355,7 +355,7 @@ void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idx_array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -385,7 +385,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -418,7 +418,7 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -446,7 +446,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -483,7 +483,7 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/ic_kernels.cpp b/common/cuda_hip/factorization/ic_kernels.cpp
index e84032bac35..c2ed0b17cf0 100644
--- a/common/cuda_hip/factorization/ic_kernels.cpp
+++ b/common/cuda_hip/factorization/ic_kernels.cpp
@@ -54,7 +54,7 @@ void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
     sparselib::destroy(desc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp
index b81f8fb9092..eb7677e117f 100644
--- a/common/cuda_hip/factorization/ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -54,7 +54,7 @@ void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
     sparselib::destroy(desc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index b0d54e44217..4d98b611e28 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -253,7 +253,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -286,7 +287,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp
index ee8b7c97f64..87e2fefd823 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ic_kernels.cpp
@@ -110,7 +110,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -123,19 +123,28 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     auto nnz = l->get_num_stored_elements();
     auto num_blocks = ceildiv(nnz, default_block_size);
     if (num_blocks > 0) {
-        for (size_type i = 0; i < iterations; ++i) {
-            kernel::ic_sweep<<<num_blocks, default_block_size, 0,
-                               exec->get_stream()>>>(
-                a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(),
-                as_device_type(a_lower->get_const_values()),
-                l->get_const_row_ptrs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
+#ifdef GKO_COMPILING_HIP
+        if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+            // HIP does not support 16-bit atomic operations
+            GKO_NOT_SUPPORTED(a_lower);
+        } else
+#endif
+        {
+            for (size_type i = 0; i < iterations; ++i) {
+                kernel::ic_sweep<<<num_blocks, default_block_size, 0,
+                                   exec->get_stream()>>>(
+                    a_lower->get_const_row_idxs(),
+                    a_lower->get_const_col_idxs(),
+                    as_device_type(a_lower->get_const_values()),
+                    l->get_const_row_ptrs(), l->get_const_col_idxs(),
+                    as_device_type(l->get_values()),
+                    static_cast<IndexType>(l->get_num_stored_elements()));
+            }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp
index 3446f124123..0acf0633a2c 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ict_kernels.cpp
@@ -390,13 +390,21 @@ void compute_factor(syn::value_list<int, subwarp_size>,
     auto block_size = default_block_size / subwarp_size;
     auto num_blocks = ceildiv(total_nnz, block_size);
     if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
+#ifdef GKO_COMPILING_HIP
+        if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+            // HIP does not support 16-bit atomic operations
+            GKO_NOT_SUPPORTED(l);
+        } else
+#endif
+        {
+            kernel::ict_sweep<subwarp_size>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                    as_device_type(a->get_const_values()),
+                    l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
+                    l->get_const_col_idxs(), as_device_type(l->get_values()),
+                    static_cast<IndexType>(l->get_num_stored_elements()));
+        }
     }
 }
 
@@ -427,7 +435,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
@@ -449,7 +457,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp
index 8bf71c471a8..a22bb85275a 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp
@@ -94,21 +94,31 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_elements, static_cast<size_type>(block_size)));
     if (grid_dim > 0) {
-        for (size_type i = 0; i < iterations; ++i) {
-            kernel::compute_l_u_factors<<<grid_dim, block_size, 0,
-                                          exec->get_stream()>>>(
-                num_elements, system_matrix->get_const_row_idxs(),
-                system_matrix->get_const_col_idxs(),
-                as_device_type(system_matrix->get_const_values()),
-                l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(),
-                as_device_type(l_factor->get_values()),
-                u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(),
-                as_device_type(u_factor->get_values()));
+#ifdef GKO_COMPILING_HIP
+        if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+            // HIP does not support 16-bit atomic operations
+            GKO_NOT_SUPPORTED(system_matrix);
+        } else
+#endif
+        {
+            for (size_type i = 0; i < iterations; ++i) {
+                kernel::compute_l_u_factors<<<grid_dim, block_size, 0,
+                                              exec->get_stream()>>>(
+                    num_elements, system_matrix->get_const_row_idxs(),
+                    system_matrix->get_const_col_idxs(),
+                    as_device_type(system_matrix->get_const_values()),
+                    l_factor->get_const_row_ptrs(),
+                    l_factor->get_const_col_idxs(),
+                    as_device_type(l_factor->get_values()),
+                    u_factor->get_const_row_ptrs(),
+                    u_factor->get_const_col_idxs(),
+                    as_device_type(u_factor->get_values()));
+            }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
index 12d8da9e4f5..475d87b8bda 100644
--- a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
@@ -168,7 +168,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         &threshold, m_out, m_out_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
index 25432fb44d2..d6ad2f477eb 100644
--- a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
@@ -123,7 +123,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         m_out_coo, lower);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_select_common.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp
index fccb89fcf5a..3bb67d96e4f 100644
--- a/common/cuda_hip/factorization/par_ilut_select_common.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp
@@ -43,9 +43,17 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
     auto num_threads_total = ceildiv(size, items_per_thread);
     auto num_blocks =
         static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    // pick sample, build searchtree
-    kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>(
-        as_device_type(values), size, as_device_type(tree));
+#ifdef GKO_COMPILING_HIP
+    if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+        // HIP does not support 16-bit atomic operations
+        GKO_NOT_SUPPORTED(values);
+    } else
+#endif
+    {
+        // pick sample, build searchtree
+        kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>(
+            as_device_type(values), size, as_device_type(tree));
+    }
     // determine bucket sizes
     if (num_blocks > 0) {
         kernel::count_buckets<<<num_blocks, default_block_size, 0,
@@ -69,7 +77,7 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
                             unsigned char* oracles, IndexType* partial_counts, \
                             IndexType* total_counts)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT);
 
 
 template <typename IndexType>
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
index e03ee379977..a15adf580e8 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
@@ -141,13 +141,22 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
 
     // base case
     auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
-    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
-                              exec->get_stream()>>>(
-        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
-    threshold = exec->copy_val_to_host(out_ptr);
+
+#ifdef GKO_COMPILING_HIP
+    if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+        // HIP does not support 16-bit atomic operations
+        GKO_NOT_SUPPORTED(m);
+    } else
+#endif
+    {
+        kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
+                                  exec->get_stream()>>>(
+            as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
+        threshold = exec->copy_val_to_host(out_ptr);
+    }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
index 79a562ff834..b88c052d19a 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp
@@ -254,12 +254,12 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select(
     const ValueType* __restrict__ input, IndexType size, IndexType rank,
     ValueType* __restrict__ out)
 {
-    constexpr auto sentinel = device_numeric_limits<ValueType>::inf();
+    const auto sentinel = device_numeric_limits<ValueType>::inf();
     ValueType local[basecase_local_size];
     __shared__ ValueType sh_local[basecase_size];
     for (int i = 0; i < basecase_local_size; ++i) {
         auto idx = threadIdx.x + i * basecase_block_size;
-        local[i] = idx < size ? input[idx] : sentinel;
+        local[i] = idx < size ? input[idx] : static_cast<ValueType>(sentinel);
     }
     bitonic_sort<basecase_size, basecase_local_size>(local, sh_local);
     if (threadIdx.x == rank / basecase_local_size) {
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
index a29cf6f2cb3..8f7a8af0443 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
@@ -389,7 +389,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         u_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
index 52f62b50e6a..c0f962a89c8 100644
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
@@ -154,18 +154,26 @@ void compute_l_u_factors(syn::value_list<int, subwarp_size>,
     auto block_size = default_block_size / subwarp_size;
     auto num_blocks = ceildiv(total_nnz, block_size);
     if (num_blocks > 0) {
-        kernel::sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()),
-                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
-                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
-                u_csc->get_const_col_idxs(),
-                as_device_type(u_csc->get_values()),
-                static_cast<IndexType>(u->get_num_stored_elements()));
+#ifdef GKO_COMPILING_HIP
+        if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
+            // HIP does not support 16-bit atomic operations
+            GKO_NOT_SUPPORTED(a);
+        } else
+#endif
+        {
+            kernel::sweep<subwarp_size>
+                <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                    a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                    as_device_type(a->get_const_values()),
+                    l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
+                    l->get_const_col_idxs(), as_device_type(l->get_values()),
+                    static_cast<IndexType>(l->get_num_stored_elements()),
+                    u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
+                    as_device_type(u->get_values()),
+                    u_csc->get_const_row_ptrs(), u_csc->get_const_col_idxs(),
+                    as_device_type(u_csc->get_values()),
+                    static_cast<IndexType>(u->get_num_stored_elements()));
+        }
     }
 }
 
@@ -199,11 +207,11 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
         u_csc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
 }  // namespace par_ilut_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index f37166613b7..4e64134a9f2 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -995,11 +995,13 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 namespace cholesky {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
@@ -1008,14 +1010,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 namespace factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
 }  // namespace factorization
@@ -1024,7 +1028,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 namespace ic_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
@@ -1033,7 +1037,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 namespace ilu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
@@ -1042,8 +1046,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 namespace lu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_INITIALIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_FACTORIZE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
 
@@ -1054,8 +1058,9 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
 namespace par_ic_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
 }  // namespace par_ic_factorization
@@ -1064,8 +1069,10 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 namespace par_ict_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
 }  // namespace par_ict_factorization
@@ -1074,7 +1081,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 namespace par_ilu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
 }  // namespace par_ilu_factorization
@@ -1083,11 +1091,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 namespace par_ilut_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index 92d598f0bd7..a552ec37ec1 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -146,7 +146,7 @@ std::unique_ptr<LinOp> Cholesky<ValueType, IndexType>::generate_impl(
 #define GKO_DECLARE_CHOLESKY(ValueType, IndexType) \
     class Cholesky<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY);
 
 
 }  // namespace factorization
diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp
index 1dc8ff060a0..f8d6d861c2d 100644
--- a/core/factorization/elimination_forest.cpp
+++ b/core/factorization/elimination_forest.cpp
@@ -173,7 +173,8 @@ void compute_elim_forest(const matrix::Csr<ValueType, IndexType>* mtx,
         const matrix::Csr<ValueType, IndexType>* mtx,         \
         std::unique_ptr<elimination_forest<IndexType>>& forest)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_COMPUTE_ELIM_FOREST);
 
 
 }  // namespace factorization
diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 1df1f49aa13..e0e4ccdc3c7 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -362,7 +362,8 @@ void Factorization<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 #define GKO_DECLARE_FACTORIZATION(ValueType, IndexType) \
     class Factorization<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FACTORIZATION);
 
 
 }  // namespace factorization
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index bf9d5e7bbf4..d8706c8b8e3 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -203,7 +203,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
 
 
 #define GKO_DECLARE_IC(ValueType, IndexType) class Ic<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC);
 
 
 }  // namespace factorization
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index f7703f3d20b..1c6079700e3 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -188,7 +188,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
 
 
 #define GKO_DECLARE_ILU(ValueType, IndexType) class Ilu<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index 4feb78083d2..dfdce26f140 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -166,7 +166,7 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
 
 #define GKO_DECLARE_LU(ValueType, IndexType) class Lu<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp
index f4a4afd23d6..b310025eb8d 100644
--- a/core/factorization/par_ic.cpp
+++ b/core/factorization/par_ic.cpp
@@ -146,7 +146,7 @@ std::unique_ptr<Composition<ValueType>> ParIc<ValueType, IndexType>::generate(
 
 #define GKO_DECLARE_PAR_IC(ValueType, IndexType) \
     class ParIc<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
index a0e8a628ca8..696b185e969 100644
--- a/core/factorization/par_ict.cpp
+++ b/core/factorization/par_ict.cpp
@@ -300,7 +300,7 @@ void ParIctState<ValueType, IndexType>::iterate()
 
 #define GKO_DECLARE_PAR_ICT(ValueType, IndexType) \
     class ParIct<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ICT);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index 68c0c0c4fc6..177c150df1d 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -161,7 +161,7 @@ ParIlu<ValueType, IndexType>::generate_l_u(
 
 #define GKO_DECLARE_PAR_ILU(ValueType, IndexType) \
     class ParIlu<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
index 42e3cc03130..e90dbb8140f 100644
--- a/core/factorization/par_ilut.cpp
+++ b/core/factorization/par_ilut.cpp
@@ -352,7 +352,7 @@ void ParIlutState<ValueType, IndexType>::iterate()
 
 #define GKO_DECLARE_PAR_ILUT(ValueType, IndexType) \
     class ParIlut<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILUT);
 
 
 }  // namespace factorization
diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp
index 23f6b94cc14..495b830d7ea 100644
--- a/core/factorization/symbolic.cpp
+++ b/core/factorization/symbolic.cpp
@@ -80,7 +80,8 @@ void symbolic_cholesky(
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors,   \
         std::unique_ptr<factorization::elimination_forest<IndexType>>& forest)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SYMBOLIC_CHOLESKY);
 
 
 template <typename ValueType, typename IndexType>
@@ -158,7 +159,7 @@ void symbolic_lu_near_symm(
         const matrix::Csr<ValueType, IndexType>* mtx,           \
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SYMBOLIC_LU_NEAR_SYMM);
 
 
@@ -245,7 +246,8 @@ void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
         const matrix::Csr<ValueType, IndexType>* mtx, \
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_LU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SYMBOLIC_LU);
 
 
 }  // namespace factorization
diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp
index 292b366f50e..cf9ddb7536e 100644
--- a/core/test/factorization/elimination_forest.cpp
+++ b/core/test/factorization/elimination_forest.cpp
@@ -33,7 +33,7 @@ class EliminationForest : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/factorization/par_ic.cpp b/core/test/factorization/par_ic.cpp
index d6de0f9fc98..efd4c1e3ebd 100644
--- a/core/test/factorization/par_ic.cpp
+++ b/core/test/factorization/par_ic.cpp
@@ -29,7 +29,8 @@ class ParIc : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, SetIterations)
diff --git a/core/test/factorization/par_ict.cpp b/core/test/factorization/par_ict.cpp
index 07eec8db549..5d5ac8bc815 100644
--- a/core/test/factorization/par_ict.cpp
+++ b/core/test/factorization/par_ict.cpp
@@ -29,7 +29,8 @@ class ParIct : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, SetIterations)
diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp
index a0b8f37e3d4..e06a90741af 100644
--- a/core/test/factorization/par_ilu.cpp
+++ b/core/test/factorization/par_ilu.cpp
@@ -29,7 +29,8 @@ class ParIlu : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, SetIterations)
diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp
index ad466e62407..a2d0287d22a 100644
--- a/core/test/factorization/par_ilut.cpp
+++ b/core/test/factorization/par_ilut.cpp
@@ -30,7 +30,7 @@ class ParIlut : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 885fe481609..24736f9e00c 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -393,7 +393,7 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                                   helpers::triangular_mtx_closure(
                                       [use_sqrt](auto val) {
                                           if (use_sqrt) {
-                                              val = sqrt(val);
+                                              val = gko::sqrt(val);
                                               if (!is_finite(val)) {
                                                   val = one<ValueType>();
                                               }
@@ -482,7 +482,7 @@ void add_diagonal_elements(std::shared_ptr<const DpcppExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -509,7 +509,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -534,7 +534,7 @@ void initialize_l_u(std::shared_ptr<const DpcppExecutor> exec,
                            csr_u->get_col_idxs(), csr_u->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -559,7 +559,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -582,7 +582,7 @@ void initialize_l(std::shared_ptr<const DpcppExecutor> exec,
                          csr_l->get_values(), diag_sqrt);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 5428460fac5..91819dd98d0 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -41,7 +41,7 @@ void ic_init(const IndexType* __restrict__ l_row_ptrs,
         return;
     }
     auto l_nz = l_row_ptrs[row + 1] - 1;
-    auto diag = std::sqrt(l_vals[l_nz]);
+    auto diag = gko::sqrt(l_vals[l_nz]);
     if (is_finite(diag)) {
         l_vals[l_nz] = diag;
     } else {
@@ -93,7 +93,7 @@ void ic_sweep(const IndexType* __restrict__ a_row_idxs,
         lh_col_begin += l_col >= lh_row;
     }
     auto to_write = row == col
-                        ? std::sqrt(a_val - sum)
+                        ? gko::sqrt(a_val - sum)
                         : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1];
     if (is_finite(to_write)) {
         l_vals[l_nz] = to_write;
@@ -130,7 +130,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
                     l_row_ptrs, l_vals, num_rows);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -152,7 +152,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index fb99b662dec..6a704641252 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -356,7 +356,7 @@ void ict_sweep(const IndexType* __restrict__ a_row_ptrs,
 
     if (subwarp.thread_rank() == 0) {
         auto to_write = row == col
-                            ? std::sqrt(a_val - sum)
+                            ? gko::sqrt(a_val - sum)
                             : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1];
         if (is_finite(to_write)) {
             l_vals[l_nz] = to_write;
@@ -483,7 +483,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
@@ -505,7 +505,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc
index d2345848d1f..6081bc0f417 100644
--- a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc
+++ b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc
@@ -102,7 +102,7 @@ void threshold_filter_nnz(const IndexType* __restrict__ row_ptrs,
         row_ptrs, num_rows,
         [&](IndexType idx, IndexType row_begin, IndexType row_end) {
             auto diag_idx = lower ? row_end - 1 : row_begin;
-            return std::abs(vals[idx]) >= threshold || idx == diag_idx;
+            return gko::abs(vals[idx]) >= threshold || idx == diag_idx;
         },
         nnz, item_ct1);
 }
@@ -140,7 +140,7 @@ void threshold_filter(const IndexType* __restrict__ old_row_ptrs,
         old_row_ptrs, old_col_idxs, old_vals, num_rows,
         [&](IndexType idx, IndexType row_begin, IndexType row_end) {
             auto diag_idx = lower ? row_end - 1 : row_begin;
-            return std::abs(old_vals[idx]) >= threshold || idx == diag_idx;
+            return gko::abs(old_vals[idx]) >= threshold || idx == diag_idx;
         },
         new_row_ptrs, new_row_idxs, new_col_idxs, new_vals, item_ct1);
 }
diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc
index 67cc9cdba15..430bf650e07 100644
--- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc
+++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc
@@ -38,7 +38,7 @@ void build_searchtree(const ValueType* __restrict__ input, IndexType size,
     for (int i = 0; i < sampleselect_oversampling; ++i) {
         auto lidx = idx * sampleselect_oversampling + i;
         auto val = input[static_cast<IndexType>(lidx * stride)];
-        samples[i] = std::abs(val);
+        samples[i] = gko::abs(val);
     }
 
     bitonic_sort<sample_size, sampleselect_oversampling>(samples, sh_samples,
@@ -113,7 +113,7 @@ void count_buckets(const ValueType* __restrict__ input, IndexType size,
     auto end = min(block_end, size);
     for (IndexType i = begin; i < end; i += default_block_size) {
         // traverse the search tree with the input element
-        auto el = std::abs(input[i]);
+        auto el = gko::abs(input[i]);
         IndexType tree_idx{};
 #pragma unroll
         for (int level = 0; level < sampleselect_searchtree_height; ++level) {
@@ -297,7 +297,7 @@ void filter_bucket(const ValueType* __restrict__ input, IndexType size,
         auto found = bucket == oracles[i];
         auto ofs = atomic_add<atomic::local_space>(&*counter, IndexType{found});
         if (found) {
-            output[ofs] = std::abs(input[i]);
+            output[ofs] = gko::abs(input[i]);
         }
     }
 }
@@ -337,7 +337,7 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size,
 
     for (int i = 0; i < basecase_local_size; ++i) {
         auto idx = item_ct1.get_local_id(2) + i * basecase_block_size;
-        local[i] = idx < size ? input[idx] : sentinel;
+        local[i] = idx < size ? input[idx] : static_cast<ValueType>(sentinel);
     }
     bitonic_sort<basecase_size, basecase_local_size>(local, sh_local, item_ct1);
     if (item_ct1.get_local_id(2) == rank / basecase_local_size) {
diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp
index aa4aabfc731..0eb30441405 100644
--- a/omp/factorization/cholesky_kernels.cpp
+++ b/omp/factorization/cholesky_kernels.cpp
@@ -78,7 +78,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -126,7 +126,7 @@ void symbolic_factorize(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -169,7 +169,7 @@ void forest_from_factor(
                                      num_rows, num_rows + 1, child_ptrs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -201,7 +201,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 namespace {
@@ -282,7 +283,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
index e7b66f6f887..47cd38d89c3 100644
--- a/omp/factorization/factorization_kernels.cpp
+++ b/omp/factorization/factorization_kernels.cpp
@@ -180,7 +180,7 @@ void add_diagonal_elements(std::shared_ptr<const OmpExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -215,7 +215,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -233,7 +233,7 @@ void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
                                         helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -264,7 +264,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -287,7 +287,7 @@ void initialize_l(std::shared_ptr<const OmpExecutor> exec,
                               helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/omp/factorization/ic_kernels.cpp b/omp/factorization/ic_kernels.cpp
index c071ba2ca87..313bf8c7982 100644
--- a/omp/factorization/ic_kernels.cpp
+++ b/omp/factorization/ic_kernels.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp
index b88e6a77900..db3fd5ef7a8 100644
--- a/omp/factorization/ilu_kernels.cpp
+++ b/omp/factorization/ilu_kernels.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index 4b13f9a352c..5f766a7208a 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -59,7 +59,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_INITIALIZE);
 
 
 namespace {
@@ -126,7 +127,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
@@ -215,4 +217,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace lu_factorization
 }  // namespace omp
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp
index 93093783acc..9488c448519 100644
--- a/omp/factorization/par_ic_kernels.cpp
+++ b/omp/factorization/par_ic_kernels.cpp
@@ -42,7 +42,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp
index b5546e1a644..a67ad860965 100644
--- a/omp/factorization/par_ict_kernels.cpp
+++ b/omp/factorization/par_ict_kernels.cpp
@@ -91,7 +91,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
@@ -166,7 +166,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index da42a631b81..0504bca8b1d 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -88,7 +88,7 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
index a24709e4f1a..af9229f3509 100644
--- a/omp/factorization/par_ilut_kernels.cpp
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -54,7 +54,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = abs(*target);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -144,7 +144,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -181,7 +181,12 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
     // pick splitters
     for (IndexType i = 0; i < bucket_count - 1; ++i) {
         // shift by one so we get upper bounds for the buckets
-        sample[i] = sample[(i + 1) * sampleselect_oversampling];
+        // NVHPC 23.3 appears to miscompile this indexed assignment under
+        // optimization for a custom class when IndexType is long. Storing the
+        // index in a volatile variable works around it; NVHPC 24.1 fixed the
+        // issue. See https://godbolt.org/z/srYhGndKn
+        volatile auto index = (i + 1) * sampleselect_oversampling;
+        sample[i] = sample[index];
     }
     // count elements per bucket
     auto total_histogram = reinterpret_cast<IndexType*>(sample + bucket_count);
@@ -228,7 +233,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -312,7 +317,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -428,7 +433,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp
index e4d7112a15f..199cae4c8fa 100644
--- a/reference/factorization/cholesky_kernels.cpp
+++ b/reference/factorization/cholesky_kernels.cpp
@@ -63,7 +63,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -102,7 +102,7 @@ void symbolic_factorize(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -140,7 +140,7 @@ void forest_from_factor(
                                      num_rows + 1, child_ptrs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -172,7 +172,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 namespace {
@@ -254,7 +255,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp
index 99b522ffba9..15d778c2235 100644
--- a/reference/factorization/factorization_kernels.cpp
+++ b/reference/factorization/factorization_kernels.cpp
@@ -127,7 +127,7 @@ void add_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs_array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -159,7 +159,7 @@ void initialize_row_ptrs_l_u(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -177,7 +177,7 @@ void initialize_l_u(std::shared_ptr<const ReferenceExecutor> exec,
                                         helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -204,7 +204,7 @@ void initialize_row_ptrs_l(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -227,7 +227,7 @@ void initialize_l(std::shared_ptr<const ReferenceExecutor> exec,
                               helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/reference/factorization/ic_kernels.cpp b/reference/factorization/ic_kernels.cpp
index 93945c2da14..3557ee0b978 100644
--- a/reference/factorization/ic_kernels.cpp
+++ b/reference/factorization/ic_kernels.cpp
@@ -69,7 +69,7 @@ void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
index 3323e0b6cef..2eedd988929 100644
--- a/reference/factorization/ilu_kernels.cpp
+++ b/reference/factorization/ilu_kernels.cpp
@@ -65,7 +65,7 @@ void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index c72b14456e1..d8bb8c427ef 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -58,7 +58,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_INITIALIZE);
 
 
 namespace {
@@ -124,7 +125,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
@@ -212,4 +214,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
 }  // namespace lu_factorization
 }  // namespace reference
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/reference/factorization/par_ic_kernels.cpp b/reference/factorization/par_ic_kernels.cpp
index 4da317cf201..e8f3a9273f4 100644
--- a/reference/factorization/par_ic_kernels.cpp
+++ b/reference/factorization/par_ic_kernels.cpp
@@ -46,7 +46,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp
index 684158d380c..c6b192b328b 100644
--- a/reference/factorization/par_ict_kernels.cpp
+++ b/reference/factorization/par_ict_kernels.cpp
@@ -89,7 +89,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
@@ -167,7 +167,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index 44c2e5f66bc..ddcc41d1070 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -86,7 +86,7 @@ void compute_l_u_factors(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp
index abef6e9b5f2..c22c6924d6c 100644
--- a/reference/factorization/par_ilut_kernels.cpp
+++ b/reference/factorization/par_ilut_kernels.cpp
@@ -58,7 +58,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = abs(*target);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -150,7 +150,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -191,7 +191,12 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
     // pick splitters
     for (IndexType i = 0; i < bucket_count - 1; ++i) {
         // shift by one so we get upper bounds for the buckets
-        sample[i] = sample[(i + 1) * sampleselect_oversampling];
+        // NVHPC 23.3 appears to miscompile this indexed assignment under
+        // optimization for a custom class when IndexType is long. Storing the
+        // index in a volatile variable works around it; NVHPC 24.1 fixed the
+        // issue. See https://godbolt.org/z/srYhGndKn
+        volatile auto index = (i + 1) * sampleselect_oversampling;
+        sample[i] = sample[index];
     }
     // count elements per bucket
     auto histogram = reinterpret_cast<IndexType*>(sample + bucket_count);
@@ -221,7 +226,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -309,7 +314,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -432,7 +437,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index b4c33d76ab9..671630c99eb 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -245,7 +245,7 @@ class Cholesky : public ::testing::Test {
     std::shared_ptr<matrix_type> combined_ref;
 };
 
-TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index 2ded81d4867..73bf8cdc321 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -70,7 +70,7 @@ class Factorization : public ::testing::Test {
     std::shared_ptr<vector_type> beta;
 };
 
-TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 1593da136a4..84faa3c3b45 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -80,7 +80,8 @@ class Ic : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ic, ThrowNotSupportedForWrongLinOp)
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index aaeb44382f1..1ba1fedf13f 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -170,7 +170,8 @@ class Ilu : public ::testing::Test {
     std::unique_ptr<typename ilu_type::Factory> ilu_factory_sort;
 };
 
-TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ilu, ThrowNotSupportedForWrongLinOp1)
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index c10937ac486..7b4a860b0d5 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -98,7 +98,8 @@ class Lu : public ::testing::Test {
     gko::array<gko::int64> row_descs;
 };
 
-TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Lu, SymbolicCholeskyWorks)
@@ -219,7 +220,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks)
             diag_idxs.get_const_data(), this->mtx_lu.get(), true, tmp);
 
         GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref,
-                            15 * r<value_type>::value);
+                            30 * r<value_type>::value);
     });
 }
 
@@ -268,7 +269,7 @@ TYPED_TEST(Lu, FactorizeNonsymmetricWorks)
 
         GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu);
         GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu,
-                            15 * r<value_type>::value);
+                            30 * r<value_type>::value);
         ASSERT_EQ(lu->get_storage_type(),
                   gko::experimental::factorization::storage_type::combined_lu);
         ASSERT_EQ(lu->get_lower_factor(), nullptr);
@@ -294,7 +295,7 @@ TYPED_TEST(Lu, FactorizeNearSymmetricWorks)
 
         GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu);
         GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu,
-                            15 * r<value_type>::value);
+                            30 * r<value_type>::value);
         ASSERT_EQ(lu->get_storage_type(),
                   gko::experimental::factorization::storage_type::combined_lu);
         ASSERT_EQ(lu->get_lower_factor(), nullptr);
@@ -321,7 +322,7 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks)
         auto lu = factory->generate(this->mtx);
 
         GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu,
-                            15 * r<value_type>::value);
+                            30 * r<value_type>::value);
         ASSERT_EQ(lu->get_storage_type(),
                   gko::experimental::factorization::storage_type::combined_lu);
         ASSERT_EQ(lu->get_lower_factor(), nullptr);
diff --git a/reference/test/factorization/par_ic_kernels.cpp b/reference/test/factorization/par_ic_kernels.cpp
index b9caf8c9e5e..481e89bb744 100644
--- a/reference/test/factorization/par_ic_kernels.cpp
+++ b/reference/test/factorization/par_ic_kernels.cpp
@@ -104,7 +104,8 @@ class ParIc : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, KernelCompute)
diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp
index 55ac5771732..d3b6df59f42 100644
--- a/reference/test/factorization/par_ict_kernels.cpp
+++ b/reference/test/factorization/par_ict_kernels.cpp
@@ -137,7 +137,8 @@ class ParIct : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, KernelInitializeRowPtrsL)
diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp
index bf4e422f640..3d590c1a6d6 100644
--- a/reference/test/factorization/par_ilu_kernels.cpp
+++ b/reference/test/factorization/par_ilu_kernels.cpp
@@ -180,7 +180,8 @@ class ParIlu : public ::testing::Test {
     std::unique_ptr<typename par_ilu_type::Factory> ilu_factory_sort;
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, KernelAddDiagonalElementsEmpty)
diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp
index 59805f246f8..3a6ba9232da 100644
--- a/reference/test/factorization/par_ilut_kernels.cpp
+++ b/reference/test/factorization/par_ilut_kernels.cpp
@@ -54,6 +54,7 @@ class ParIlut : public ::testing::Test {
     using ComplexCsr =
         gko::matrix::Csr<std::complex<gko::remove_complex<value_type>>,
                          index_type>;
+    using complex_value_type = std::complex<gko::remove_complex<value_type>>;
 
     ParIlut()
         : ref(gko::ReferenceExecutor::create()),
@@ -75,16 +76,24 @@ class ParIlut : public ::testing::Test {
                                                   {0., -3., 0., 1.}},
                                                  ref)),
           mtx1_complex(gko::initialize<ComplexCsr>(
-              {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}},
-               {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}},
-               {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}},
-               {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}},
+              {{complex_value_type{.1, 0.}, complex_value_type{0., 0.},
+                complex_value_type{0., 0.}, complex_value_type{0., 0.}},
+               {complex_value_type{-1., .1}, complex_value_type{.1, -1.},
+                complex_value_type{0., 0.}, complex_value_type{0., 0.}},
+               {complex_value_type{-1., 1.}, complex_value_type{-2., .2},
+                complex_value_type{-1., -.3}, complex_value_type{0., 0.}},
+               {complex_value_type{1., -2.}, complex_value_type{-3., -.1},
+                complex_value_type{-1., .1}, complex_value_type{.1, 2.}}},
               ref)),
           mtx1_expect_complex_thrm(gko::initialize<ComplexCsr>(
-              {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}},
-               {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}},
-               {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}},
-               {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}},
+              {{complex_value_type{.1, 0.}, complex_value_type{0., 0.},
+                complex_value_type{0., 0.}, complex_value_type{0., 0.}},
+               {complex_value_type{0., 0.}, complex_value_type{.1, -1.},
+                complex_value_type{0., 0.}, complex_value_type{0., 0.}},
+               {complex_value_type{-1., 1.}, complex_value_type{-2., .2},
+                complex_value_type{-1., -.3}, complex_value_type{0., 0.}},
+               {complex_value_type{1., -2.}, complex_value_type{-3., -.1},
+                complex_value_type{0., 0.}, complex_value_type{.1, 2.}}},
               ref)),
           identity(gko::initialize<Csr>(
               {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)),
@@ -268,7 +277,7 @@ class ParIlut : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };  // namespace
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index b9b8bbf00ee..e9f64bb1152 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -129,7 +129,7 @@ class Lu : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_OMP
-using Types = gko::test::ValueIndexTypes;
+using Types = gko::test::ValueIndexTypesWithHalf;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index de2342a28db..bb53a454e21 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -41,8 +41,7 @@ class ParIc : public CommonTestFixture {
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(0, 10.0),
-            rand_engine, ref);
+            std::normal_distribution<>(0, 10.0), rand_engine, ref);
         dmtx_ani = Csr::create(exec);
         dmtx_l_ani = Csr::create(exec);
         dmtx_l_ani_init = Csr::create(exec);
@@ -87,7 +86,8 @@ class ParIc : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_l_ani_init;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef)
@@ -107,6 +107,8 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     this->mtx_l_ani->convert_to(mtx_l_coo);
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 3b33e52630c..945f874ef26 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -47,15 +47,11 @@ class ParIct : public CommonTestFixture {
         mtx = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[1],
             std::uniform_int_distribution<index_type>(10, mtx_size[1]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
 
         dmtx_ani = Csr::create(exec);
         dmtx_l_ani = Csr::create(exec);
@@ -97,7 +93,8 @@ class ParIct : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_l;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
@@ -127,6 +124,8 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     this->mtx_l_ani->convert_to(mtx_l_coo);
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 88f5ecff0d9..216a4f597cb 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -59,8 +59,7 @@ class ParIlu : public CommonTestFixture {
         return gko::test::generate_random_matrix<Mtx>(
             num_rows, num_cols,
             std::uniform_int_distribution<index_type>(0, num_cols - 1),
-            std::normal_distribution<gko::remove_complex<value_type>>(0.0, 1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
     }
 
     std::unique_ptr<Csr> gen_unsorted_mtx(index_type num_rows,
@@ -145,7 +144,8 @@ class ParIlu : public CommonTestFixture {
     }
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef)
@@ -237,6 +237,8 @@ TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef)
 TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
     std::unique_ptr<Csr> l_mtx{};
     std::unique_ptr<Csr> u_mtx{};
     std::unique_ptr<Csr> dl_mtx{};
@@ -255,6 +257,7 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
     std::unique_ptr<Csr> l_mtx{};
     std::unique_ptr<Csr> u_mtx{};
     std::unique_ptr<Csr> dl_mtx{};
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index dff3cc702c1..6804a3edcce 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -48,39 +48,27 @@ class ParIlut : public CommonTestFixture {
         mtx1 = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[1],
             std::uniform_int_distribution<index_type>(10, mtx_size[1]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx2 = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[1],
             std::uniform_int_distribution<index_type>(0, mtx_size[1]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx_square = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[0],
             std::uniform_int_distribution<index_type>(1, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx_l2 = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], true,
             std::uniform_int_distribution<index_type>(1, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
         mtx_u = gko::test::generate_random_upper_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
-                                                                      1.0),
-            rand_engine, ref);
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
 
         dmtx1 = gko::clone(exec, mtx1);
         dmtx2 = gko::clone(exec, mtx2);
@@ -134,7 +122,7 @@ class ParIlut : public CommonTestFixture {
                      const std::unique_ptr<Mtx>& dmtx, index_type rank)
     {
         double tolerance =
-            gko::is_complex<value_type>() ? r<value_type>::value : 0.0;
+            gko::is_complex<value_type>() ? double(r<value_type>::value) : 0.0;
         auto size = index_type(mtx->get_num_stored_elements());
         using ValueType = typename Mtx::value_type;
 
@@ -189,7 +177,7 @@ class ParIlut : public CommonTestFixture {
                             const std::unique_ptr<Mtx>& dmtx, index_type rank)
     {
         double tolerance =
-            gko::is_complex<value_type>() ? r<value_type>::value : 0.0;
+            gko::is_complex<value_type>() ? double(r<value_type>::value) : 0.0;
         auto res = Mtx::create(ref, mtx_size);
         auto dres = Mtx::create(exec, mtx_size);
         auto res_coo = Coo::create(ref, mtx_size);
@@ -245,12 +233,15 @@ class ParIlut : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_u;
 };
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_select(this->mtx_l, this->dmtx_l,
                       this->mtx_l->get_num_stored_elements() / 3);
 }
@@ -258,12 +249,18 @@ TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef)
 
 TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_select(this->mtx_l, this->dmtx_l, 0);
 }
 
 
 TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_select(this->mtx_l, this->dmtx_l,
                       this->mtx_l->get_num_stored_elements() - 1);
 }
@@ -330,6 +327,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
     using Coo = typename TestFixture::Coo;
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
+    SKIP_IF_HALF(value_type);
     this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true);
     auto res = Csr::create(this->ref, this->mtx_size);
     auto dres = Csr::create(this->exec, this->mtx_size);
@@ -355,6 +353,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
 
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_filter_approx(this->mtx_l, this->dmtx_l,
                              this->mtx_l->get_num_stored_elements() / 2);
 }
@@ -362,12 +363,18 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
 
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_filter_approx(this->mtx_l, this->dmtx_l, 0);
 }
 
 
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef)
 {
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
+
     this->test_filter_approx(this->mtx_l, this->dmtx_l,
                              this->mtx_l->get_num_stored_elements() - 1);
 }
@@ -377,6 +384,8 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+    // there's one value larger than half range
+    SKIP_IF_HALF(value_type);
     auto square_size = this->mtx_square->get_size();
     auto mtx_lu = Csr::create(this->ref, square_size);
     this->mtx_l2->apply(this->mtx_u, mtx_lu);
@@ -405,6 +414,8 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
+    using value_type = typename TestFixture::value_type;
+    SKIP_IF_HALF(value_type);
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     auto mtx_u_coo = Coo::create(this->ref, square_size);

From ab14d4c3f120f2923e73e16502e932366c805357 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 18:23:26 +0200
Subject: [PATCH 382/448] factorization config dispatch

---
 core/config/factorization_config.cpp | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/core/config/factorization_config.cpp b/core/config/factorization_config.cpp
index 259d32cb872..dae4072cce8 100644
--- a/core/config/factorization_config.cpp
+++ b/core/config/factorization_config.cpp
@@ -23,15 +23,18 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ic, gko::factorization::Ic);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ilu, gko::factorization::Ilu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Cholesky,
-                               gko::experimental::factorization::Cholesky);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Lu, gko::experimental::factorization::Lu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlu, gko::factorization::ParIlu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlut, gko::factorization::ParIlut);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIc, gko::factorization::ParIc);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIct, gko::factorization::ParIct);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ic,
+                                         gko::factorization::Ic);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ilu,
+                                         gko::factorization::Ilu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    Cholesky, gko::experimental::factorization::Cholesky);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Lu,
+                                         gko::experimental::factorization::Lu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlu, gko::factorization::ParIlu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlut, gko::factorization::ParIlut);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIc, gko::factorization::ParIc);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIct, gko::factorization::ParIct);
 
 
 }  // namespace config

From e2c4b21fdf78082e4fbf671c416ad9749a11ae2e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 5 Nov 2024 16:28:10 +0100
Subject: [PATCH 383/448] cmake cuda test with cuda arch and fix is_finite

---
 cmake/create_test.cmake       |  4 ----
 common/cuda_hip/base/math.hpp | 12 ++++++++++++
 cuda/test/base/math.cu        | 24 ++++++++++++++++++++----
 hip/test/base/math.hip.cpp    | 24 ++++++++++++++++++++----
 4 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 20f074778a1..c540d6e2cf7 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -171,10 +171,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
             PRIVATE
                 $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda --expt-relaxed-constexpr>)
     endif()
-    # we handle CUDA architecture flags for now, disable CMake handling
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-        set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF)
-    endif()
     ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu)
 endfunction(ginkgo_create_cuda_test_internal)
diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index f83533d8f0d..51a7fedf0c4 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -162,6 +162,18 @@ __device__ __forceinline__ __half sqrt(const __half& val)
 }
 
 
+// Use an overload here. Otherwise, the compiler still treats the is_finite
+// specialization as a __host__ __device__ function.
+__device__ __forceinline__ bool is_finite(const __half& value)
+{
+    return abs(value) < device_numeric_limits<__half>::inf();
+}
+
+__device__ __forceinline__ bool is_finite(const thrust::complex<__half>& value)
+{
+    return is_finite(value.real()) && is_finite(value.imag());
+}
+
 #endif
 
 
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index 1025c3cc489..27a35b2421a 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -10,6 +10,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -26,8 +27,8 @@ namespace kernel {
 template <typename T, typename FuncType>
 __device__ bool test_real_is_finite_function(FuncType isfin)
 {
-    constexpr T inf = gko::device_numeric_limits<T>::inf();
-    constexpr T quiet_nan = NAN;
+    const T inf = gko::device_numeric_limits<T>::inf();
+    const auto quiet_nan = static_cast<T>(NAN);
     bool test_true{};
     bool test_false{};
 
@@ -46,8 +47,8 @@ __device__ bool test_complex_is_finite_function(FuncType isfin)
                   "Template type must be a complex type.");
     using T = gko::remove_complex<ComplexType>;
     using c_type = gko::kernels::cuda::cuda_type<ComplexType>;
-    constexpr T inf = gko::device_numeric_limits<T>::inf();
-    constexpr T quiet_nan = NAN;
+    const T inf = gko::device_numeric_limits<T>::inf();
+    const auto quiet_nan = static_cast<T>(NAN);
     bool test_true{};
     bool test_false{};
 
@@ -109,6 +110,21 @@ TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel<float>()); }
 TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel<double>()); }
 
 
+#if GINKGO_ENABLE_HALF
+
+
+TEST_F(IsFinite, Half) { ASSERT_TRUE(test_real_is_finite_kernel<__half>()); }
+
+
+TEST_F(IsFinite, HalfComplex)
+{
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<__half>>());
+}
+
+
+#endif  // GINKGO_ENABLE_HALF
+
+
 TEST_F(IsFinite, FloatComplex)
 {
     ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<float>>());
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index f69ca804aa9..ef25220957b 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -16,6 +16,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/config.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -32,8 +33,8 @@ namespace kernel {
 template <typename T, typename FuncType>
 __device__ bool test_real_is_finite_function(FuncType isfin)
 {
-    constexpr T inf = gko::device_numeric_limits<T>::inf();
-    constexpr T quiet_nan = NAN;
+    const T inf = gko::device_numeric_limits<T>::inf();
+    const auto quiet_nan = static_cast<T>(NAN);
     bool test_true{};
     bool test_false{};
 
@@ -52,8 +53,8 @@ __device__ bool test_complex_is_finite_function(FuncType isfin)
                   "Template type must be a complex type.");
     using T = gko::remove_complex<ComplexType>;
     using c_type = gko::kernels::hip::hip_type<ComplexType>;
-    constexpr T inf = gko::device_numeric_limits<T>::inf();
-    constexpr T quiet_nan = NAN;
+    const T inf = gko::device_numeric_limits<T>::inf();
+    const auto quiet_nan = static_cast<T>(NAN);
     bool test_true{};
     bool test_false{};
 
@@ -115,6 +116,21 @@ TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel<float>()); }
 TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel<double>()); }
 
 
+#if GINKGO_ENABLE_HALF
+
+
+TEST_F(IsFinite, Half) { ASSERT_TRUE(test_real_is_finite_kernel<__half>()); }
+
+
+TEST_F(IsFinite, HalfComplex)
+{
+    ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<__half>>());
+}
+
+
+#endif  // GINKGO_ENABLE_HALF
+
+
 TEST_F(IsFinite, FloatComplex)
 {
     ASSERT_TRUE(test_complex_is_finite_kernel<thrust::complex<float>>());

From 4ad4404ecd3be55df64afaeb4e2a29ae45eb89c5 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 5 Nov 2024 17:02:00 +0100
Subject: [PATCH 384/448] figure out factorization test

---
 core/test/utils/assertions.hpp          |  5 ++-
 test/factorization/cholesky_kernels.cpp |  2 +-
 test/factorization/lu_kernels.cpp       |  2 +-
 test/factorization/par_ic_kernels.cpp   | 15 +++++++-
 test/factorization/par_ict_kernels.cpp  |  3 ++
 test/factorization/par_ilu_kernels.cpp  |  6 ++++
 test/factorization/par_ilut_kernels.cpp | 46 ++++++++++++++++++++-----
 7 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp
index 7bdc71ea94e..3dae62151b3 100644
--- a/core/test/utils/assertions.hpp
+++ b/core/test/utils/assertions.hpp
@@ -259,9 +259,12 @@ template <typename MatrixData1, typename MatrixData2>
 double get_relative_error(const MatrixData1& first, const MatrixData2& second)
 {
     using std::abs;
-    using vt = typename detail::biggest_valuetype<
+    using biggest_vt = typename detail::biggest_valuetype<
         typename MatrixData1::value_type,
         typename MatrixData2::value_type>::type;
+    // use double or complex<double> to compute the error
+    using vt = std::conditional_t<is_complex<biggest_vt>(),
+                                  std::complex<double>, double>;
     using real_vt = remove_complex<vt>;
 
     real_vt diff = 0.0;
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index 61bc86bbf43..007f3cbf6fd 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -115,7 +115,7 @@ using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA doesn't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypesWithHalf,
                                                   ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index e9f64bb1152..59f3cb30327 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -133,7 +133,7 @@ using Types = gko::test::ValueIndexTypesWithHalf;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypesWithHalf,
                                                   ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index bb53a454e21..10eccb83f10 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -108,16 +108,29 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef)
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     this->mtx_l_ani->convert_to(mtx_l_coo);
     auto dmtx_l_coo = gko::clone(this->exec, mtx_l_coo);
+    // If we compute the mtx_near in half, we still get less than 1e-4 in half
+    // precision. By using double in mtx_near, we get around 2.4e-4.
+    // TODO: when gko::half supports subnormal values, revisit this.
+    // Using the reference result as initial values in device::compute_factor
+    // still converges to the same result, which gives around 2.4e-4 against the
+    // reference result. Applying more iterations on the device side does not
+    // change the result. It might mean some values are subnormal such that both
+    // converge to different stable results.
+    auto tol = std::max(
+        1e-4, static_cast<double>(r<gko::remove_complex<value_type>>::value));
 
     gko::kernels::reference::par_ic_factorization::compute_factor(
         this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get());
     gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor(
         this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get());
 
-    GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4);
+    GKO_EXPECT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, tol);
 }
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 945f874ef26..07a4ddc63ff 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -125,7 +125,10 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef)
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     this->mtx_l_ani->convert_to(mtx_l_coo);
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 216a4f597cb..8c3ab20a674 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -238,7 +238,10 @@ TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
     std::unique_ptr<Csr> l_mtx{};
     std::unique_ptr<Csr> u_mtx{};
     std::unique_ptr<Csr> dl_mtx{};
@@ -257,7 +260,10 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
     std::unique_ptr<Csr> l_mtx{};
     std::unique_ptr<Csr> u_mtx{};
     std::unique_ptr<Csr> dl_mtx{};
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index 6804a3edcce..ba2d84b4cc7 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -48,27 +48,27 @@ class ParIlut : public CommonTestFixture {
         mtx1 = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[1],
             std::uniform_int_distribution<index_type>(10, mtx_size[1]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
         mtx2 = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[1],
             std::uniform_int_distribution<index_type>(0, mtx_size[1]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
         mtx_square = gko::test::generate_random_matrix<Csr>(
             mtx_size[0], mtx_size[0],
             std::uniform_int_distribution<index_type>(1, mtx_size[0]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
         mtx_l2 = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], true,
             std::uniform_int_distribution<index_type>(1, mtx_size[0]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
         mtx_u = gko::test::generate_random_upper_triangular_matrix<Csr>(
             mtx_size[0], false,
             std::uniform_int_distribution<index_type>(10, mtx_size[0]),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+            std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
 
         dmtx1 = gko::clone(exec, mtx1);
         dmtx2 = gko::clone(exec, mtx2);
@@ -240,7 +240,10 @@ TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
 TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_select(this->mtx_l, this->dmtx_l,
                       this->mtx_l->get_num_stored_elements() / 3);
@@ -250,7 +253,10 @@ TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef)
 TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_select(this->mtx_l, this->dmtx_l, 0);
 }
@@ -259,7 +265,10 @@ TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef)
 TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_select(this->mtx_l, this->dmtx_l,
                       this->mtx_l->get_num_stored_elements() - 1);
@@ -327,7 +336,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
     using Coo = typename TestFixture::Coo;
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
+    // threshold_filter_approx calls sampleselect_count which needs 16 bits
+    // memory operation
     SKIP_IF_HALF(value_type);
+#endif
     this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true);
     auto res = Csr::create(this->ref, this->mtx_size);
     auto dres = Csr::create(this->exec, this->mtx_size);
@@ -354,7 +368,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
+    // threshold_filter_approx calls sampleselect_count which needs 16 bits
+    // memory operation
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_filter_approx(this->mtx_l, this->dmtx_l,
                              this->mtx_l->get_num_stored_elements() / 2);
@@ -364,7 +383,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef)
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
+    // threshold_filter_approx calls sampleselect_count which needs 16 bits
+    // memory operation
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_filter_approx(this->mtx_l, this->dmtx_l, 0);
 }
@@ -373,7 +397,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef)
 TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
+    // threshold_filter_approx calls sampleselect_count which needs 16 bits
+    // memory operation
     SKIP_IF_HALF(value_type);
+#endif
 
     this->test_filter_approx(this->mtx_l, this->dmtx_l,
                              this->mtx_l->get_num_stored_elements() - 1);
@@ -384,8 +413,6 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
-    // there's one value larger than half range
-    SKIP_IF_HALF(value_type);
     auto square_size = this->mtx_square->get_size();
     auto mtx_lu = Csr::create(this->ref, square_size);
     this->mtx_l2->apply(this->mtx_u, mtx_lu);
@@ -415,7 +442,10 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef)
     using Csr = typename TestFixture::Csr;
     using Coo = typename TestFixture::Coo;
     using value_type = typename TestFixture::value_type;
+#ifdef GKO_COMPILING_HIP
+    // hip does not support memory operation in 16bit
     SKIP_IF_HALF(value_type);
+#endif
     auto square_size = this->mtx_ani->get_size();
     auto mtx_l_coo = Coo::create(this->ref, square_size);
     auto mtx_u_coo = Coo::create(this->ref, square_size);

From 8ce50bac5e3afbfd39b2d90cfaa539297537c019 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 13 Nov 2024 19:14:11 +0100
Subject: [PATCH 385/448] change the diagonal to reduce random on
 parilut/parict

---
 test/factorization/par_ict_kernels.cpp  | 13 +++++++++++++
 test/factorization/par_ilut_kernels.cpp | 14 +++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 07a4ddc63ff..8d6579d584e 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -101,6 +101,19 @@ TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+    if (std::is_same_v<gko::remove_complex<value_type>, gko::half>) {
+        // We set the diagonal larger than 1 in half precision to reduce the
+        // possibility of resulting inf. It might introduce (a - llh)/diag when
+        // the entry is not present in the original matrix
+        auto dist = std::uniform_real_distribution<>(1.0, 10.0);
+        for (gko::size_type i = 0; i < this->mtx_l->get_size()[0]; i++) {
+            this->mtx_l
+                ->get_values()[this->mtx_l->get_const_row_ptrs()[i + 1] - 1] =
+                gko::detail::get_rand_value<value_type>(dist,
+                                                        this->rand_engine);
+        }
+        this->dmtx_l->copy_from(this->mtx_l);
+    }
     auto mtx_llh = Csr::create(this->ref, this->mtx_size);
     this->mtx_l->apply(this->mtx_l->conj_transpose(), mtx_llh);
     auto dmtx_llh = Csr::create(this->exec, this->mtx_size);
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index ba2d84b4cc7..b1af2b4c748 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -413,6 +413,18 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
 {
     using Csr = typename TestFixture::Csr;
     using value_type = typename TestFixture::value_type;
+    if (std::is_same_v<gko::remove_complex<value_type>, gko::half>) {
+        // We set the diagonal larger than 1 in half precision to reduce the
+        // possibility of resulting inf. It might introduce (a - lu)/u_diag when
+        // the entry is not present in the original matrix
+        auto dist = std::uniform_real_distribution<>(1.0, 10.0);
+        for (gko::size_type i = 0; i < this->mtx_u->get_size()[0]; i++) {
+            this->mtx_u->get_values()[this->mtx_u->get_const_row_ptrs()[i]] =
+                gko::detail::get_rand_value<value_type>(dist,
+                                                        this->rand_engine);
+        }
+        this->dmtx_u->copy_from(this->mtx_u);
+    }
     auto square_size = this->mtx_square->get_size();
     auto mtx_lu = Csr::create(this->ref, square_size);
     this->mtx_l2->apply(this->mtx_u, mtx_lu);
@@ -422,7 +434,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
     auto res_mtx_u = Csr::create(this->ref, square_size);
     auto dres_mtx_l = Csr::create(this->exec, square_size);
     auto dres_mtx_u = Csr::create(this->exec, square_size);
-
+    // gko::write(std::cout, mtx_lu);
     gko::kernels::reference::par_ilut_factorization::add_candidates(
         this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(),
         this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get());

From 0a7e869de5a13c8590ba90d091a8c7435424f76d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:47:14 +0200
Subject: [PATCH 386/448] multigrid and the multigridlevel

---
 common/cuda_hip/multigrid/pgm_kernels.cpp     |  5 +++--
 common/cuda_hip/solver/multigrid_kernels.cpp  | 12 +++++-----
 common/unified/multigrid/pgm_kernels.cpp      |  4 ++--
 core/device_hooks/common_kernels.inc.cpp      | 16 ++++++++------
 core/multigrid/fixed_coarsening.cpp           |  3 ++-
 core/multigrid/pgm.cpp                        |  2 +-
 core/solver/multigrid.cpp                     | 22 +++++++++++++++++++
 core/test/multigrid/fixed_coarsening.cpp      |  2 +-
 core/test/multigrid/pgm.cpp                   |  2 +-
 core/test/solver/multigrid.cpp                |  7 +++---
 dpcpp/multigrid/pgm_kernels.dp.cpp            |  5 +++--
 dpcpp/solver/multigrid_kernels.dp.cpp         |  8 ++++---
 omp/multigrid/pgm_kernels.cpp                 |  5 +++--
 omp/solver/multigrid_kernels.cpp              |  8 ++++---
 reference/multigrid/pgm_kernels.cpp           |  9 ++++----
 reference/solver/multigrid_kernels.cpp        |  8 ++++---
 .../multigrid/fixed_coarsening_kernels.cpp    |  2 +-
 reference/test/multigrid/pgm_kernels.cpp      |  3 ++-
 reference/test/solver/multigrid_kernels.cpp   |  6 ++---
 19 files changed, 83 insertions(+), 46 deletions(-)

diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp
index d3c44cf540e..0077b801e46 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.cpp
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -54,7 +54,8 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -78,7 +79,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
                           vals_it, coarse_key_it, coarse_vals_it);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp
index 9b22e457203..b5d8a0f77b9 100644
--- a/common/cuda_hip/solver/multigrid_kernels.cpp
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -141,7 +141,8 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -174,7 +175,8 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -192,13 +194,13 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
         kernel::kcycle_check_stop_kernel<<<grid, default_block_size, 0,
                                            exec->get_stream()>>>(
             nrhs, as_device_type(old_norm->get_const_values()),
-            as_device_type(new_norm->get_const_values()), rel_tol,
-            as_device_type(dis_stop.get_data()));
+            as_device_type(new_norm->get_const_values()),
+            as_device_type(rel_tol), as_device_type(dis_stop.get_data()));
     }
     is_stop = get_element(dis_stop, 0);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 9ba144cba2e..9e59671a821 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -217,7 +217,7 @@ void find_strongest_neighbor(
         strongest_neighbor.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
 
 template <typename ValueType, typename IndexType>
@@ -305,7 +305,7 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
 
 
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 4e64134a9f2..9c492871a84 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -694,9 +694,10 @@ GKO_STUB(GKO_DECLARE_IR_INITIALIZE_KERNEL);
 namespace multigrid {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
-GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
 }  // namespace multigrid
@@ -1125,11 +1126,12 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_SORT_AGG_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_ROW_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_COL_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_COUNT_UNREPEATED_NNZ_KERNEL);
-GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
-GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
+GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_GATHER_INDEX);
 
 
diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp
index 1cbdd557fb4..f62ce746d6b 100644
--- a/core/multigrid/fixed_coarsening.cpp
+++ b/core/multigrid/fixed_coarsening.cpp
@@ -90,7 +90,8 @@ void FixedCoarsening<ValueType, IndexType>::generate()
 
 #define GKO_DECLARE_FIXED_COARSENING(_vtype, _itype) \
     class FixedCoarsening<_vtype, _itype>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FIXED_COARSENING);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_FIXED_COARSENING);
 
 
 }  // namespace multigrid
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index 9f1f5b50ba6..e531fb2b996 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -541,7 +541,7 @@ void Pgm<ValueType, IndexType>::generate()
 
 
 #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM);
 
 
 }  // namespace multigrid
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 2ecd3dd74c4..0b918a13897 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/half.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
@@ -318,6 +319,9 @@ void MultigridState::generate(const LinOp* system_matrix_in,
         auto mg_level = mg_level_list.at(i);
 
         run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+            half, std::complex<half>,
+#endif
             std::complex<float>, std::complex<double>>(
             mg_level,
             [&, this](auto mg_level, auto i, auto cycle, auto current_nrows,
@@ -456,6 +460,9 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level,
     }
     auto mg_level = multigrid->get_mg_level_list().at(level);
     run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+        half, std::complex<half>,
+#endif
         std::complex<float>, std::complex<double>>(
         mg_level, [&, this](auto mg_level) {
 #if GINKGO_BUILD_MPI
@@ -705,6 +712,9 @@ void Multigrid::generate()
         }
 
         run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+            half, std::complex<half>,
+#endif
             std::complex<float>, std::complex<double>>(
             mg_level,
             [this](auto mg_level, auto index, auto matrix) {
@@ -743,6 +753,9 @@ void Multigrid::generate()
 
     // generate coarsest solver
     run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+        half, std::complex<half>,
+#endif
         std::complex<float>, std::complex<double>>(
         last_mg_level,
         [this](auto mg_level, auto level, auto matrix) {
@@ -860,6 +873,9 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x,
     };
     auto first_mg_level = this->get_mg_level_list().front();
     run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+        half, std::complex<half>,
+#endif
         std::complex<float>, std::complex<double>>(first_mg_level, lambda, b,
                                                    x);
 }
@@ -899,6 +915,9 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha,
     };
     auto first_mg_level = this->get_mg_level_list().front();
     run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+        half, std::complex<half>,
+#endif
         std::complex<float>, std::complex<double>>(first_mg_level, lambda,
                                                    alpha, b, beta, x);
 }
@@ -964,6 +983,9 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x,
     auto first_mg_level = this->get_mg_level_list().front();
 
     run<gko::multigrid::EnableMultigridLevel, float, double,
+#if GINKGO_ENABLE_HALF
+        half, std::complex<half>,
+#endif
         std::complex<float>, std::complex<double>>(first_mg_level, lambda, b,
                                                    x);
 }
diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp
index 5cab7282b5d..35bd04bb067 100644
--- a/core/test/multigrid/fixed_coarsening.cpp
+++ b/core/test/multigrid/fixed_coarsening.cpp
@@ -38,7 +38,7 @@ class FixedCoarseningFactory : public ::testing::Test {
     std::unique_ptr<typename MgLevel::Factory> fixed_coarsening_factory;
 };
 
-TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/multigrid/pgm.cpp b/core/test/multigrid/pgm.cpp
index 7798e97f5d6..c06edda60a0 100644
--- a/core/test/multigrid/pgm.cpp
+++ b/core/test/multigrid/pgm.cpp
@@ -40,7 +40,7 @@ class PgmFactory : public ::testing::Test {
     std::unique_ptr<typename MgLevel::Factory> pgm_factory;
 };
 
-TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp
index 8cb545f6cb2..54c4a18b8d3 100644
--- a/core/test/solver/multigrid.cpp
+++ b/core/test/solver/multigrid.cpp
@@ -75,9 +75,7 @@ class DummyLinOpWithFactory
             std::make_shared<DummyLinOp>(this->get_executor(),
                                          gko::dim<2>{n_, n_ - 1}),
             gko::share(gko::test::generate_random_dense_matrix<ValueType>(
-                n_ - 1, n_ - 1,
-                std::uniform_real_distribution<gko::remove_complex<ValueType>>(
-                    0, 1),
+                n_ - 1, n_ - 1, std::uniform_real_distribution<>(0, 1),
                 std::default_random_engine{}, factory->get_executor())),
             std::make_shared<DummyLinOp>(this->get_executor(),
                                          gko::dim<2>{n_ - 1, n_}));
@@ -166,7 +164,8 @@ class Multigrid : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor)
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index a9148c54ff4..e645ba3bc6e 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -56,7 +56,8 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -89,7 +90,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
         [](auto a, auto b) { return a + b; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/dpcpp/solver/multigrid_kernels.dp.cpp b/dpcpp/solver/multigrid_kernels.dp.cpp
index aaf0ab63354..cdbcb39d043 100644
--- a/dpcpp/solver/multigrid_kernels.dp.cpp
+++ b/dpcpp/solver/multigrid_kernels.dp.cpp
@@ -31,7 +31,8 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* d,
                    matrix::Dense<ValueType>* e) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -44,7 +45,8 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* d,
                    matrix::Dense<ValueType>* e) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -54,7 +56,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
                        const ValueType rel_tol,
                        bool& is_stop) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp
index 4c824a0140b..bfe95291f2e 100644
--- a/omp/multigrid/pgm_kernels.cpp
+++ b/omp/multigrid/pgm_kernels.cpp
@@ -47,7 +47,8 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -83,7 +84,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
     coarse_val[coarse_idxs] = temp_val;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp
index 12e5bad8577..509ecf51828 100644
--- a/omp/solver/multigrid_kernels.cpp
+++ b/omp/solver/multigrid_kernels.cpp
@@ -44,7 +44,8 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -74,7 +75,8 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -92,7 +94,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp
index bff2a776c6b..2b4298377cb 100644
--- a/reference/multigrid/pgm_kernels.cpp
+++ b/reference/multigrid/pgm_kernels.cpp
@@ -208,7 +208,7 @@ void find_strongest_neighbor(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
 
 
@@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
 
 
@@ -274,7 +274,8 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -311,7 +312,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
     coarse_val[coarse_idxs] = temp_val;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/reference/solver/multigrid_kernels.cpp b/reference/solver/multigrid_kernels.cpp
index b08c9857d3a..4ce4491c990 100644
--- a/reference/solver/multigrid_kernels.cpp
+++ b/reference/solver/multigrid_kernels.cpp
@@ -43,7 +43,8 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -72,7 +73,8 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -89,7 +91,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp
index 582950b4e17..001e23d6124 100644
--- a/reference/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp
@@ -143,7 +143,7 @@ class FixedCoarsening : public ::testing::Test {
     std::unique_ptr<MgLevel> mg_level;
 };
 
-TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/multigrid/pgm_kernels.cpp b/reference/test/multigrid/pgm_kernels.cpp
index 2fc754f23b3..e715b2175d3 100644
--- a/reference/test/multigrid/pgm_kernels.cpp
+++ b/reference/test/multigrid/pgm_kernels.cpp
@@ -187,7 +187,8 @@ class Pgm : public ::testing::Test {
     std::unique_ptr<MgLevel> mg_level;
 };
 
-TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Pgm, CanBeCopied)
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index 57ba8fba84d..7b79ba98ad2 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -154,7 +154,7 @@ class DummyLinOpWithFactory
     {
         auto alpha_value =
             gko::as<gko::matrix::Dense<ValueType>>(alpha)->at(0, 0);
-        gko::remove_complex<ValueType> scale = std::real(alpha_value);
+        gko::remove_complex<ValueType> scale = gko::real(alpha_value);
         global_step *= static_cast<int>(scale);
         step.push_back(global_step);
         global_step++;
@@ -233,7 +233,7 @@ class Multigrid : public ::testing::Test {
     using Smoother = gko::solver::Ir<value_type>;
     using InnerSolver = gko::preconditioner::Jacobi<value_type>;
     using CoarsestSolver = gko::solver::Cg<value_type>;
-    using CoarsestNextSolver = gko::solver::Cg<gko::next_precision<value_type>>;
+    using CoarsestNextSolver = gko::solver::Cg<next_precision<value_type>>;
     using DummyRPFactory = DummyMultigridLevelWithFactory<value_type>;
     using DummyFactory = DummyLinOpWithFactory<value_type>;
     Multigrid()
@@ -415,7 +415,7 @@ class Multigrid : public ::testing::Test {
     std::shared_ptr<Mtx> x2;
 };
 
-TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 

From 29ae3904bfe2bd1e92a0fb63b377beda90922233 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 28 Oct 2024 17:07:11 +0100
Subject: [PATCH 387/448] pgm uses gko::max to avoid ambiguity in hip

---
 common/unified/multigrid/pgm_kernels.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 9e59671a821..2b0c04592a7 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -183,7 +183,7 @@ void find_strongest_neighbor(
                     continue;
                 }
                 auto weight =
-                    weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
+                    weight_vals[idx] / gko::max(abs(diag[row]), abs(diag[col]));
                 if (agg[col] == -1 &&
                     device_std::tie(weight, col) >
                         device_std::tie(max_weight_unagg, strongest_unagg)) {
@@ -246,8 +246,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                     if (col == row) {
                         continue;
                     }
-                    auto weight =
-                        weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
+                    auto weight = weight_vals[idx] /
+                                  gko::max(abs(diag[row]), abs(diag[col]));
                     if (agg_const_val[col] != -1 &&
                         device_std::tie(weight, col) >
                             device_std::tie(max_weight_agg, strongest_agg)) {
@@ -284,8 +284,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                     if (col == row) {
                         continue;
                     }
-                    auto weight =
-                        weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
+                    auto weight = weight_vals[idx] /
+                                  gko::max(abs(diag[row]), abs(diag[col]));
                     if (agg_val[col] != -1 &&
                         device_std::tie(weight, col) >
                             device_std::tie(max_weight_agg, strongest_agg)) {

From 2666ae1936aa76e0872f8d6d59eff5e1651068fb Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 16:59:11 +0200
Subject: [PATCH 388/448] multigrid config dispatch

---
 core/config/multigrid_config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp
index 83be1a1742b..8cc4b4e1ca3 100644
--- a/core/config/multigrid_config.cpp
+++ b/core/config/multigrid_config.cpp
@@ -10,7 +10,7 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Pgm, gko::multigrid::Pgm);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Pgm, gko::multigrid::Pgm);
 
 
 }  // namespace config

From 58d5d0399df5ed3898f25a970e1f5571fe442225 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 18:54:05 +0200
Subject: [PATCH 389/448] preconditioner with half

---
 common/cuda_hip/components/warp_blas.hpp      |  2 +-
 .../cuda_hip/preconditioner/isai_kernels.cpp  | 10 ++--
 .../jacobi_advanced_apply_kernels.cpp         |  3 +-
 ...obi_advanced_apply_kernels.instantiate.cpp |  2 +-
 .../jacobi_generate_kernels.cpp               |  2 +-
 .../jacobi_generate_kernels.instantiate.cpp   |  2 +-
 .../preconditioner/jacobi_kernels.cpp         |  8 ++--
 .../jacobi_simple_apply_kernels.cpp           |  2 +-
 ...acobi_simple_apply_kernels.instantiate.cpp |  2 +-
 .../cuda_hip/preconditioner/sor_kernels.cpp   | 17 +++----
 .../unified/preconditioner/jacobi_kernels.cpp | 13 +++--
 core/device_hooks/common_kernels.inc.cpp      | 47 +++++++++++--------
 core/preconditioner/gauss_seidel.cpp          |  3 +-
 core/preconditioner/ic.cpp                    | 12 +++--
 core/preconditioner/ilu.cpp                   | 23 +++++----
 core/preconditioner/isai.cpp                  |  9 ++--
 core/preconditioner/jacobi.cpp                | 11 +++--
 core/preconditioner/jacobi_utils.hpp          |  4 +-
 core/preconditioner/sor.cpp                   |  2 +-
 core/test/preconditioner/isai.cpp             |  2 +-
 core/test/preconditioner/jacobi.cpp           |  2 +-
 dpcpp/preconditioner/isai_kernels.dp.cpp      | 14 +++---
 ...cobi_advanced_apply_instantiate.inc.dp.cpp |  2 +-
 .../jacobi_advanced_apply_kernel.dp.cpp       |  3 +-
 .../jacobi_generate_instantiate.inc.dp.cpp    |  2 +-
 .../jacobi_generate_kernel.dp.cpp             |  2 +-
 dpcpp/preconditioner/jacobi_kernels.dp.cpp    |  8 ++--
 ...jacobi_simple_apply_instantiate.inc.dp.cpp |  2 +-
 .../jacobi_simple_apply_kernel.dp.cpp         |  2 +-
 dpcpp/preconditioner/sor_kernels.dp.cpp       |  4 +-
 .../test/preconditioner/jacobi_kernels.dp.cpp | 21 ++++-----
 include/ginkgo/core/preconditioner/ic.hpp     |  2 +-
 include/ginkgo/core/preconditioner/ilu.hpp    |  3 +-
 omp/preconditioner/isai_kernels.cpp           | 10 ++--
 omp/preconditioner/jacobi_kernels.cpp         | 15 +++---
 omp/preconditioner/sor_kernels.cpp            |  4 +-
 reference/preconditioner/isai_kernels.cpp     | 10 ++--
 reference/preconditioner/jacobi_kernels.cpp   | 28 ++++++-----
 reference/preconditioner/sor_kernels.cpp      |  4 +-
 .../test/preconditioner/gauss_seidel.cpp      |  2 +-
 reference/test/preconditioner/ic.cpp          | 13 ++---
 reference/test/preconditioner/ilu.cpp         | 14 +++---
 .../test/preconditioner/isai_kernels.cpp      | 35 ++++++++++++--
 reference/test/preconditioner/jacobi.cpp      |  5 +-
 .../test/preconditioner/jacobi_kernels.cpp    | 32 ++++++++-----
 reference/test/preconditioner/sor_kernels.cpp |  3 +-
 reference/test/solver/multigrid_kernels.cpp   |  6 ++-
 47 files changed, 250 insertions(+), 174 deletions(-)

diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp
index 116b963ad11..0df0612152c 100644
--- a/common/cuda_hip/components/warp_blas.hpp
+++ b/common/cuda_hip/components/warp_blas.hpp
@@ -425,7 +425,7 @@ __device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
         }
     }
     return reduce(group, sum,
-                  [](result_type x, result_type y) { return max(x, y); });
+                  [](result_type x, result_type y) { return gko::max(x, y); });
 }
 
 
diff --git a/common/cuda_hip/preconditioner/isai_kernels.cpp b/common/cuda_hip/preconditioner/isai_kernels.cpp
index d6fdd6389fc..77fdb3c0e23 100644
--- a/common/cuda_hip/preconditioner/isai_kernels.cpp
+++ b/common/cuda_hip/preconditioner/isai_kernels.cpp
@@ -487,7 +487,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -516,7 +516,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -548,7 +548,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -568,7 +568,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -593,7 +593,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
index 27b4f57eb6c..fcd86bdba29 100644
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
@@ -66,7 +66,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 }  // namespace jacobi
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
index 131c530d2ee..62d9c1ece43 100644
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
@@ -160,7 +160,7 @@ void advanced_apply(
         const preconditioner::block_interleaved_storage_scheme<IndexType>&, \
         const ValueType*, const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
index 207550ff6b1..7c37e578045 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
@@ -68,7 +68,7 @@ void generate(std::shared_ptr<const DefaultExecutor> exec,
         block_pointers.get_const_data(), num_blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
index fdb0ad11e9e..5efd0c40632 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
@@ -268,7 +268,7 @@ void generate(syn::value_list<int, max_block_size>,
         remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
         size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_GENERATE_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index 6f2d4ae3974..adcc08e37e9 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -297,7 +297,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
         exec, max_block_size, num_natural_blocks, block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -364,7 +364,7 @@ void transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -388,7 +388,7 @@ void conj_transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -401,7 +401,7 @@ void convert_to_dense(
         storage_scheme,
     ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
index e9b7b10fd88..fb73c22ccef 100644
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
@@ -57,7 +57,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
index faf869718a6..3a35fbe3f04 100644
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
@@ -151,7 +151,7 @@ void apply(syn::value_list<int, max_block_size>,
         const preconditioner::block_interleaved_storage_scheme<IndexType>&,   \
         const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
index 4805eca3ab3..f75a52b3af2 100644
--- a/common/cuda_hip/preconditioner/sor_kernels.cpp
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -26,7 +26,7 @@ void initialize_weighted_l(
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
-    auto inv_weight = one(weight) / weight;
+    auto inv_weight = as_device_type(one(weight) / weight);
 
     if (grid_dim > 0) {
         using namespace gko::factorization;
@@ -46,7 +46,7 @@ void initialize_weighted_l(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -62,9 +62,10 @@ void initialize_weighted_l_u(
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
-    auto inv_weight = one(weight) / weight;
-    auto inv_two_minus_weight =
-        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+    auto inv_weight = as_device_type(one(weight) / weight);
+    auto inv_two_minus_weight = as_device_type(
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight));
+    auto d_weight = as_device_type(weight);
 
     if (grid_dim > 0) {
         using namespace gko::factorization;
@@ -87,13 +88,13 @@ void initialize_weighted_l_u(
                     [inv_two_minus_weight] __device__(auto val) {
                         return val * inv_two_minus_weight;
                     },
-                    [weight, inv_two_minus_weight] __device__(auto val) {
-                        return val * weight * inv_two_minus_weight;
+                    [d_weight, inv_two_minus_weight] __device__(auto val) {
+                        return val * d_weight * inv_two_minus_weight;
                     }));
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp
index dce00fd1366..00f3d62f312 100644
--- a/common/unified/preconditioner/jacobi_kernels.cpp
+++ b/common/unified/preconditioner/jacobi_kernels.cpp
@@ -32,7 +32,8 @@ void scalar_conj(std::shared_ptr<const DefaultExecutor> exec,
         diag.get_size(), diag, conj_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
 
 
 template <typename ValueType>
@@ -49,7 +50,8 @@ void invert_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         diag.get_size(), diag, inv_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType>
@@ -83,7 +85,8 @@ void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -100,7 +103,7 @@ void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size(), diag, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
 
 
@@ -120,7 +123,7 @@ void scalar_convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         result->get_size(), blocks, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 9c492871a84..7215a17aec5 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -952,18 +952,21 @@ GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 namespace jacobi {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL);
 
 
@@ -973,8 +976,9 @@ GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL);
 namespace sor {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
 }  // namespace sor
@@ -983,11 +987,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 namespace isai {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
 }  // namespace isai
diff --git a/core/preconditioner/gauss_seidel.cpp b/core/preconditioner/gauss_seidel.cpp
index aec7a4ff827..f4735cff5bc 100644
--- a/core/preconditioner/gauss_seidel.cpp
+++ b/core/preconditioner/gauss_seidel.cpp
@@ -71,7 +71,8 @@ std::unique_ptr<LinOp> GaussSeidel<ValueType, IndexType>::generate_impl(
 #define GKO_DECLARE_GAUSS_SEIDEL(ValueType, IndexType) \
     class GaussSeidel<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GAUSS_SEIDEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_GAUSS_SEIDEL);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp
index 691795ad60b..2e9833c21f7 100644
--- a/core/preconditioner/ic.cpp
+++ b/core/preconditioner/ic.cpp
@@ -50,28 +50,32 @@ typename Ic::parameters_type ic_parse(
     ic_parse<Ic<solver::LowerTrs<ValueType, IndexType>, IndexType>>( \
         const config::pnode&, const config::registry&,               \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERTRS_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LOWERTRS_IC_PARSE);
 
 #define GKO_DECLARE_IR_IC_PARSE(ValueType, IndexType)              \
     typename Ic<solver::Ir<ValueType>, IndexType>::parameters_type \
     ic_parse<Ic<solver::Ir<ValueType>, IndexType>>(                \
         const config::pnode&, const config::registry&,             \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_IR_IC_PARSE);
 
 #define GKO_DECLARE_GMRES_IC_PARSE(ValueType, IndexType)              \
     typename Ic<solver::Gmres<ValueType>, IndexType>::parameters_type \
     ic_parse<Ic<solver::Gmres<ValueType>, IndexType>>(                \
         const config::pnode&, const config::registry&,                \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_IC_PARSE);
 
 #define GKO_DECLARE_LOWERISAI_IC_PARSE(ValueType, IndexType)                 \
     typename Ic<LowerIsai<ValueType, IndexType>, IndexType>::parameters_type \
     ic_parse<Ic<LowerIsai<ValueType, IndexType>, IndexType>>(                \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERISAI_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LOWERISAI_IC_PARSE);
 
 }  // namespace detail
 }  // namespace preconditioner
diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp
index d6f49e49588..dae6cf97829 100644
--- a/core/preconditioner/ilu.cpp
+++ b/core/preconditioner/ilu.cpp
@@ -59,7 +59,8 @@ typename Ilu::parameters_type ilu_parse(
                   solver::UpperTrs<ValueType, IndexType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                        \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_TRS_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_TRS_ILU_TRUE_PARSE(ValueType, IndexType)                 \
     typename Ilu<solver::LowerTrs<ValueType, IndexType>,                     \
@@ -69,7 +70,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_FALSE_PARSE);
                   solver::UpperTrs<ValueType, IndexType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_TRS_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_GMRES_ILU_FALSE_PARSE(ValueType, IndexType)              \
     typename Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, false,  \
@@ -77,7 +79,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_TRUE_PARSE);
     ilu_parse<Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, false, \
                   IndexType>>(const config::pnode&, const config::registry&, \
                               const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_GMRES_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_GMRES_ILU_TRUE_PARSE(ValueType, IndexType)               \
@@ -86,7 +88,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     ilu_parse<Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, true,  \
                   IndexType>>(const config::pnode&, const config::registry&, \
                               const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_GMRES_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_IR_ILU_FALSE_PARSE(ValueType, IndexType)                  \
     typename Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, false,         \
@@ -95,7 +98,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_ILU_TRUE_PARSE);
         Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                        \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_IR_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_IR_ILU_TRUE_PARSE(ValueType, IndexType)                  \
     typename Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, true,         \
@@ -104,7 +108,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_FALSE_PARSE);
         Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_IR_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_ISAI_ILU_FALSE_PARSE(ValueType, IndexType)         \
     typename Ilu<LowerIsai<ValueType, IndexType>,                      \
@@ -114,7 +119,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_TRUE_PARSE);
                   UpperIsai<ValueType, IndexType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                 \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_ISAI_ILU_TRUE_PARSE(ValueType, IndexType)         \
     typename Ilu<LowerIsai<ValueType, IndexType>,                     \
@@ -124,7 +130,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_FALSE_PARSE);
                   UpperIsai<ValueType, IndexType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_ISAI_ILU_TRUE_PARSE);
 
 
 }  // namespace detail
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
index 9684f1bdb27..ec0ef365592 100644
--- a/core/preconditioner/isai.cpp
+++ b/core/preconditioner/isai.cpp
@@ -358,19 +358,20 @@ std::unique_ptr<LinOp> Isai<IsaiType, ValueType, IndexType>::conj_transpose()
 
 #define GKO_DECLARE_LOWER_ISAI(ValueType, IndexType) \
     class Isai<isai_type::lower, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_ISAI);
 
 #define GKO_DECLARE_UPPER_ISAI(ValueType, IndexType) \
     class Isai<isai_type::upper, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_ISAI);
 
 #define GKO_DECLARE_GENERAL_ISAI(ValueType, IndexType) \
     class Isai<isai_type::general, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GENERAL_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_GENERAL_ISAI);
 
 #define GKO_DECLARE_SPD_ISAI(ValueType, IndexType) \
     class Isai<isai_type::spd, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPD_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SPD_ISAI);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index f6d5b042a23..3f773710ceb 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -328,10 +328,11 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
     if (parameters_.max_block_size == 1) {
         auto diag = share(as<DiagonalLinOpExtractable>(system_matrix)
                               ->extract_diagonal_linop());
-        auto diag_vt =
-            ::gko::detail::temporary_conversion<matrix::Diagonal<ValueType>>::
-                template create<matrix::Diagonal<next_precision<ValueType>>>(
-                    diag.get());
+        auto diag_vt = ::gko::detail::
+            temporary_conversion<matrix::Diagonal<ValueType>>::template create<
+                matrix::Diagonal<previous_precision_with_half<ValueType>>,
+                matrix::Diagonal<previous_precision_with_half<
+                    previous_precision_with_half<ValueType>>>>(diag.get());
         if (!diag_vt) {
             GKO_NOT_SUPPORTED(system_matrix);
         }
@@ -374,7 +375,7 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
 
 #define GKO_DECLARE_JACOBI(ValueType, IndexType) \
     class Jacobi<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp
index e159fd15776..36e7d1ccd75 100644
--- a/core/preconditioner/jacobi_utils.hpp
+++ b/core/preconditioner/jacobi_utils.hpp
@@ -108,8 +108,8 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions(
     using gko::detail::float_traits;
     using type = remove_complex<ValueType>;
     using prd = precision_reduction_descriptor;
-    auto accurate = [&cond, &accuracy](type eps) {
-        return cond * eps < accuracy;
+    auto accurate = [&cond, &accuracy](auto eps) {
+        return cond * static_cast<type>(eps) < accuracy;
     };
     uint8 is_verified1 = 2;
     auto supported = static_cast<uint32>(prd::p0n0);
diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp
index c9905c5f73c..b671a99c6fb 100644
--- a/core/preconditioner/sor.cpp
+++ b/core/preconditioner/sor.cpp
@@ -161,7 +161,7 @@ std::unique_ptr<LinOp> Sor<ValueType, IndexType>::generate_impl(
 
 #define GKO_DECLARE_SOR(ValueType, IndexType) class Sor<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR);
 
 
 }  // namespace preconditioner
diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp
index b5e7400d0e8..b2ee8175d49 100644
--- a/core/test/preconditioner/isai.cpp
+++ b/core/test/preconditioner/isai.cpp
@@ -64,7 +64,7 @@ class IsaiFactory : public ::testing::Test {
     std::unique_ptr<typename UpperIsai::Factory> upper_isai_factory;
 };
 
-TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp
index 8813b4c3c4d..40bc9e8d494 100644
--- a/core/test/preconditioner/jacobi.cpp
+++ b/core/test/preconditioner/jacobi.cpp
@@ -43,7 +43,7 @@ class JacobiFactory : public ::testing::Test {
     std::shared_ptr<gko::matrix::Csr<value_type, index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 4082035ff9f..8d5429b088a 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -365,7 +365,7 @@ void generate_general_inverse(
 
         if (spd) {
             auto diag = subwarp.shfl(sol, num_elems - 1);
-            sol /= std::sqrt(diag);
+            sol /= gko::sqrt(diag);
         }
 
         return sol;
@@ -531,7 +531,7 @@ void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs,
         return;
     }
     const auto diag = excess_solution[block_end - 1];
-    const ValueType scal = one<ValueType>() / std::sqrt(diag);
+    const ValueType scal = one<ValueType>() / gko::sqrt(diag);
 
     for (size_type i = block_begin + local_id; i < block_end;
          i += subwarp_size) {
@@ -642,7 +642,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -669,7 +669,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -699,7 +699,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -718,7 +718,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -742,7 +742,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index e8c086ec0a6..4b9077d5ec5 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -197,7 +197,7 @@ void advanced_apply(
         const preconditioner::block_interleaved_storage_scheme<IndexType>&, \
         const ValueType*, const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 0e26989808e..72a32c2d5cb 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -65,7 +65,8 @@ void apply(std::shared_ptr<const DpcppExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 }  // namespace jacobi
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index d957ea2c5be..fe0973a9f21 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -388,7 +388,7 @@ void generate(syn::value_list<int, max_block_size>,
         remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
         size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_GENERATE_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
index 62ff7fdbb51..826509be1df 100644
--- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
@@ -61,7 +61,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
         block_pointers.get_const_data(), num_blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
index 886f96e88e3..63449ba5b4b 100644
--- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
@@ -389,7 +389,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
         exec, max_block_size, num_natural_blocks, block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -452,7 +452,7 @@ void transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -476,7 +476,7 @@ void conj_transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -489,7 +489,7 @@ void convert_to_dense(
         storage_scheme,
     ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index c088ae8e986..8eafc3af69d 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -190,7 +190,7 @@ void apply(syn::value_list<int, max_block_size>,
         const preconditioner::block_interleaved_storage_scheme<IndexType>&,   \
         const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index 25701c6dc55..3d6ebe76226 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -61,7 +61,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
index 4af676288bd..aed20ab8c8a 100644
--- a/dpcpp/preconditioner/sor_kernels.dp.cpp
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -50,7 +50,7 @@ void initialize_weighted_l(
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -100,7 +100,7 @@ void initialize_weighted_l_u(
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
index b8950ed2d2a..36179402262 100644
--- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
@@ -62,7 +62,7 @@ class Jacobi : public ::testing::Test {
         if (condition_numbers.size() == 0) {
             mtx = gko::test::generate_random_matrix<Mtx>(
                 dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz),
-                std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
+                std::normal_distribution<>(0.0, 1.0), engine, ref);
         } else {
             std::vector<mtx_data> blocks;
             for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) {
@@ -70,8 +70,7 @@ class Jacobi : public ::testing::Test {
                     begin(block_pointers)[i + 1] - begin(block_pointers)[i];
                 const auto cond = begin(condition_numbers)[i];
                 blocks.push_back(mtx_data::cond(
-                    size, cond, std::normal_distribution<value_type>(-1, 1),
-                    engine));
+                    size, cond, std::normal_distribution<>(-1, 1), engine));
             }
             mtx = Mtx::create(ref);
             mtx->read(mtx_data::diag(begin(blocks), end(blocks)));
@@ -107,11 +106,11 @@ class Jacobi : public ::testing::Test {
         }
         b = gko::test::generate_random_matrix<Vec>(
             dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
-            std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
+            std::normal_distribution<>(0.0, 1.0), engine, ref);
         d_b = gko::clone(dpcpp, b);
         x = gko::test::generate_random_matrix<Vec>(
             dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
-            std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
+            std::normal_distribution<>(0.0, 1.0), engine, ref);
         d_x = gko::clone(dpcpp, x);
     }
 
@@ -409,7 +408,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef)
     smtx->copy_from(dense_smtx);
     auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<value_type>(0.0, 1.0), engine, ref));
+        std::normal_distribution<>(0.0, 1.0), engine, ref));
     auto sx = Vec::create(ref, sb->get_size());
 
     auto d_smtx = gko::share(Mtx::create(dpcpp));
@@ -453,7 +452,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef)
     auto dense_data =
         gko::test::generate_random_matrix_data<value_type, index_type>(
             dim, dim, std::uniform_int_distribution<>(1, dim),
-            std::normal_distribution<value_type>(1.0, 2.0), engine);
+            std::normal_distribution<>(1.0, 2.0), engine);
     gko::utils::make_diag_dominant(dense_data);
     auto dense_smtx = gko::share(Vec::create(ref));
     dense_smtx->read(dense_data);
@@ -461,12 +460,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef)
     smtx->copy_from(dense_smtx);
     auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<value_type>(0.0, 1.0), engine, ref,
-        gko::dim<2>(dim, 3), 4));
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
     auto sx = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<value_type>(0.0, 1.0), engine, ref,
-        gko::dim<2>(dim, 3), 4));
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
 
     auto d_smtx = gko::share(gko::clone(dpcpp, smtx));
     auto d_sb = gko::share(gko::clone(dpcpp, sb));
diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp
index aea43af3cf1..9260bfbb891 100644
--- a/include/ginkgo/core/preconditioner/ic.hpp
+++ b/include/ginkgo/core/preconditioner/ic.hpp
@@ -441,7 +441,7 @@ class Ic : public EnableLinOp<Ic<LSolverType, IndexType>>, public Transposable {
     generate_default_solver(const std::shared_ptr<const Executor>& exec,
                             const std::shared_ptr<const LinOp>& mtx)
     {
-        constexpr gko::remove_complex<value_type> default_reduce_residual{1e-4};
+        const gko::remove_complex<value_type> default_reduce_residual{1e-4};
         const unsigned int default_max_iters{
             static_cast<unsigned int>(mtx->get_size()[0])};
 
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index 1f4be3e3046..98aa3ce70c1 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -498,7 +498,8 @@ class Ilu : public EnableLinOp<
     generate_default_solver(const std::shared_ptr<const Executor>& exec,
                             const std::shared_ptr<const LinOp>& mtx)
     {
-        constexpr gko::remove_complex<value_type> default_reduce_residual{1e-4};
+        // half has no constexpr constructor, so this cannot be constexpr
+        const gko::remove_complex<value_type> default_reduce_residual{1e-4};
         const unsigned int default_max_iters{
             static_cast<unsigned int>(mtx->get_size()[0])};
 
diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp
index 6f2fe4838d9..61a2193a2b3 100644
--- a/omp/preconditioner/isai_kernels.cpp
+++ b/omp/preconditioner/isai_kernels.cpp
@@ -230,7 +230,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      trs_solve, true);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -324,7 +324,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      general_solve, false);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -388,7 +388,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
     e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -415,7 +415,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -441,7 +441,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index 76224f97a2f..ee51f7adb40 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -132,7 +132,7 @@ void find_blocks(std::shared_ptr<const OmpExecutor> exec,
                                             block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -436,7 +436,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
@@ -514,7 +514,8 @@ void apply(std::shared_ptr<const OmpExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -548,7 +549,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
@@ -585,7 +586,7 @@ void transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -622,7 +623,7 @@ void conj_transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -661,7 +662,7 @@ void convert_to_dense(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp
index 509946ac15a..670277b6ebd 100644
--- a/omp/preconditioner/sor_kernels.cpp
+++ b/omp/preconditioner/sor_kernels.cpp
@@ -29,7 +29,7 @@ void initialize_weighted_l(
             [](auto val) { return val; }));
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -57,7 +57,7 @@ void initialize_weighted_l_u(
             }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp
index 55f56b5705e..6114d3d8e3c 100644
--- a/reference/preconditioner/isai_kernels.cpp
+++ b/reference/preconditioner/isai_kernels.cpp
@@ -219,7 +219,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      trs_solve, true);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -314,7 +314,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      general_solve, false);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -377,7 +377,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
     e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -405,7 +405,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -430,7 +430,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index 4eaf0988a00..52e3666ca30 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -116,7 +116,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
                                             block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -406,7 +406,7 @@ void generate(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
@@ -494,7 +494,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -527,7 +528,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
@@ -547,7 +548,8 @@ void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -563,7 +565,7 @@ void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
 
 
@@ -576,7 +578,8 @@ void scalar_conj(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
 
 
 template <typename ValueType>
@@ -591,7 +594,8 @@ void invert_diagonal(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -626,7 +630,7 @@ void transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -662,7 +666,7 @@ void conj_transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -682,7 +686,7 @@ void scalar_convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
 
 
@@ -720,7 +724,7 @@ void convert_to_dense(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/reference/preconditioner/sor_kernels.cpp b/reference/preconditioner/sor_kernels.cpp
index 88ac422dd02..b5ada476f13 100644
--- a/reference/preconditioner/sor_kernels.cpp
+++ b/reference/preconditioner/sor_kernels.cpp
@@ -32,7 +32,7 @@ void initialize_weighted_l(
             [](auto val) { return val; }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -60,7 +60,7 @@ void initialize_weighted_l_u(
             }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/reference/test/preconditioner/gauss_seidel.cpp b/reference/test/preconditioner/gauss_seidel.cpp
index 2b67b665d77..53db7f0781e 100644
--- a/reference/test/preconditioner/gauss_seidel.cpp
+++ b/reference/test/preconditioner/gauss_seidel.cpp
@@ -47,7 +47,7 @@ class GaussSeidel : public ::testing::Test {
     std::shared_ptr<csr_type> mtx = csr_type::create(exec);
 };
 
-TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp
index 16ffc8d7b3c..aabd6c64d73 100644
--- a/reference/test/preconditioner/ic.cpp
+++ b/reference/test/preconditioner/ic.cpp
@@ -67,7 +67,8 @@ class Ic : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ic, BuildsTwoFactorComposition)
@@ -245,7 +246,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using T = typename TestFixture::value_type;
-    using Vec = gko::matrix::Dense<gko::next_precision<T>>;
+    using Vec = gko::matrix::Dense<gko::next_precision_with_half<T>>;
     const auto b = gko::initialize<Vec>({1.0, 3.0, 6.0}, this->exec);
     auto x = Vec::create(this->exec, gko::dim<2>{3, 1});
     auto preconditioner =
@@ -278,8 +279,8 @@ TYPED_TEST(Ic, SolvesSingleRhsComplex)
 TYPED_TEST(Ic, SolvesSingleRhsComplexMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
-    using Vec = gko::matrix::Dense<
-        gko::next_precision<gko::to_complex<typename TestFixture::value_type>>>;
+    using Vec = gko::matrix::Dense<gko::next_precision_with_half<
+        gko::to_complex<typename TestFixture::value_type>>>;
     using T = typename Vec::value_type;
     const auto b = gko::initialize<Vec>(
         {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec);
@@ -315,7 +316,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using T = typename TestFixture::value_type;
-    using Vec = gko::matrix::Dense<gko::next_precision<T>>;
+    using Vec = gko::matrix::Dense<gko::next_precision_with_half<T>>;
     const auto b = gko::initialize<Vec>({1.0, 3.0, 6.0}, this->exec);
     const auto alpha = gko::initialize<Vec>({2.0}, this->exec);
     const auto beta = gko::initialize<Vec>({-1.0}, this->exec);
@@ -355,7 +356,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using MixedDense = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using T = typename MixedDenseComplex::value_type;
     const auto b = gko::initialize<MixedDenseComplex>(
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index 180b92be9ec..e4c4809f084 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -84,7 +84,7 @@ class Ilu : public ::testing::Test {
     std::shared_ptr<typename ilu_rev_prec_type::Factory> ilu_rev_pre_factory;
 };
 
-TYPED_TEST_SUITE(Ilu, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ilu, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Ilu, BuildsDefaultWithoutThrowing)
@@ -316,7 +316,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx)
 TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx)
 {
     using Mtx = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
     auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b);
@@ -349,8 +349,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx)
 
 TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx)
 {
-    using Mtx = gko::matrix::Dense<
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>>;
+    using Mtx = gko::matrix::Dense<gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>>;
     using T = typename Mtx::value_type;
     const auto b = gko::initialize<Mtx>(
         {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec);
@@ -403,7 +403,8 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs)
 
 TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     const value_type alpha{2.0};
     const auto alpha_linop = gko::initialize<Mtx>({alpha}, this->exec);
@@ -453,7 +454,8 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex)
 
 TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using complex_type = gko::to_complex<value_type>;
     using MixedDense = gko::matrix::Dense<value_type>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
index e989125c61d..f55d7e12b87 100644
--- a/reference/test/preconditioner/isai_kernels.cpp
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -186,8 +186,20 @@ class Isai : public ::testing::Test {
     {
         lower_isai_factory = LowerIsai::build().on(exec);
         upper_isai_factory = UpperIsai::build().on(exec);
-        general_isai_factory = GeneralIsai::build().on(exec);
-        spd_isai_factory = SpdIsai::build().on(exec);
+        if (std::is_same_v<gko::remove_complex<value_type>, gko::half>) {
+            general_isai_factory =
+                GeneralIsai::build()
+                    .with_excess_solver_reduction(
+                        gko::remove_complex<value_type>{1e-3})
+                    .on(exec);
+            spd_isai_factory = SpdIsai::build()
+                                   .with_excess_solver_reduction(
+                                       gko::remove_complex<value_type>{1e-3})
+                                   .on(exec);
+        } else {
+            general_isai_factory = GeneralIsai::build().on(exec);
+            spd_isai_factory = SpdIsai::build().on(exec);
+        }
         a_dense->convert_to(a_csr);
         a_dense_inv->convert_to(a_csr_inv);
         l_dense->convert_to(l_csr);
@@ -310,7 +322,8 @@ class Isai : public ::testing::Test {
     std::shared_ptr<Csr> spd_sparse_inv;
 };
 
-TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Isai, KernelGenerateA)
@@ -1021,6 +1034,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseALongrowWithExcessSolver)
 {
     using value_type = typename TestFixture::value_type;
     using GeneralIsai = typename TestFixture::GeneralIsai;
+    // When using the other precision, we already need to drastically reduce the
+    // precision, so it is hard to work with half.
+    SKIP_IF_HALF(value_type);
     auto general_isai_factory =
         GeneralIsai::build()
             .with_excess_solver_factory(this->excess_solver_factory)
@@ -1068,6 +1084,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseLLongrowWithExcessSolver)
     using Csr = typename TestFixture::Csr;
     using LowerIsai = typename TestFixture::LowerIsai;
     using value_type = typename TestFixture::value_type;
+    // When using the other precision, we already need to drastically reduce the
+    // precision, so it is hard to work with half.
+    SKIP_IF_HALF(value_type);
     auto lower_isai_factory =
         LowerIsai::build()
             .with_excess_solver_factory(this->excess_solver_factory)
@@ -1115,6 +1134,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseULongrowWithExcessSolver)
     using Csr = typename TestFixture::Csr;
     using UpperIsai = typename TestFixture::UpperIsai;
     using value_type = typename TestFixture::value_type;
+    // When using the other precision, we already need to drastically reduce the
+    // precision, so it is hard to work with half.
+    SKIP_IF_HALF(value_type);
     auto upper_isai_factory =
         UpperIsai::build()
             .with_excess_solver_factory(this->excess_solver_factory)
@@ -1228,8 +1250,8 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrow)
     // need to reduce precision due to spd ISAI using GMRES instead of
     // direct solve
     GKO_ASSERT_MTX_NEAR(lower, this->spd_csr_longrow_inv,
-                        10 * r<value_type>::value);
-    GKO_ASSERT_MTX_NEAR(lower_t, expected_transpose, 10 * r<value_type>::value);
+                        30 * r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(lower_t, expected_transpose, 30 * r<value_type>::value);
 }
 
 
@@ -1238,6 +1260,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrowWithExcessSolver)
     using Csr = typename TestFixture::Csr;
     using SpdIsai = typename TestFixture::SpdIsai;
     using value_type = typename TestFixture::value_type;
+    // When using the other precision, we already need to drastically reduce the
+    // precision, so it is hard to work with half.
+    SKIP_IF_HALF(value_type);
     const auto expected_transpose =
         gko::as<Csr>(this->spd_csr_longrow_inv->transpose());
     auto spd_isai_factory =
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 79c276579ad..1bc0aa37470 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -144,7 +144,8 @@ class Jacobi : public ::testing::Test {
     std::unique_ptr<Bj> adaptive_bj;
 };
 
-TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Jacobi, GeneratesCorrectStorageScheme)
@@ -477,7 +478,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    using next_type = gko::next_precision<value_type>;
+    using next_type = gko::next_precision_with_half<value_type>;
     using Bj = typename TestFixture::Bj;
     auto csr =
         gko::share(gko::matrix::Csr<next_type, index_type>::create(this->exec));
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index 97d9951be7a..2b75c2a5590 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -86,7 +86,8 @@ class Jacobi : public ::testing::Test {
     std::shared_ptr<gko::matrix::Csr<value_type, index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Jacobi, CanBeGenerated)
@@ -561,11 +562,14 @@ TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions)
 
     auto prec =
         bj->get_parameters().storage_optimization.block_wise.get_const_data();
-    auto precision2 = std::is_same<gko::remove_complex<T>, float>::value
-                          ? gko::precision_reduction(0, 0)   // float
-                          : gko::precision_reduction(0, 1);  // double
-    EXPECT_EQ(prec[0], gko::precision_reduction(0, 2));  // u * cond = ~1.2e-3
-    ASSERT_EQ(prec[1], precision2);                      // u * cond = ~2.0e-3
+    auto precision1 = std::is_same<gko::remove_complex<T>, gko::half>::value
+                          ? gko::precision_reduction(2, 0)
+                          : gko::precision_reduction(0, 2);
+    auto precision2 = std::is_same<gko::remove_complex<T>, double>::value
+                          ? gko::precision_reduction(0, 1)   // double
+                          : gko::precision_reduction(0, 0);  // float, half
+    EXPECT_EQ(prec[0], precision1);  // u * cond = ~1.2e-3
+    ASSERT_EQ(prec[1], precision2);  // u * cond = ~2.0e-3
 }
 
 
@@ -606,6 +610,9 @@ TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow)
     auto precision = std::is_same<gko::remove_complex<T>, float>::value
                          ? gko::precision_reduction(0, 2)   // float
                          : gko::precision_reduction(1, 1);  // double
+    if (std::is_same<gko::remove_complex<T>, gko::half>::value) {
+        precision = gko::precision_reduction(2, 0);
+    }
     EXPECT_EQ(prec[0], precision);
     ASSERT_EQ(prec[1], precision);
 }
@@ -642,7 +649,8 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector)
 
 TYPED_TEST(Jacobi, AppliesToMixedVector)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
     auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
@@ -682,8 +690,8 @@ TYPED_TEST(Jacobi, AppliesToComplexVector)
 
 TYPED_TEST(Jacobi, AppliesToMixedComplexVector)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_with_half<typename TestFixture::value_type>>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>(
         {value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0},
@@ -888,7 +896,8 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector)
 
 TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
     auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
@@ -931,7 +940,8 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector)
 
 TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_with_half<typename TestFixture::value_type>;
     using MixedDense = gko::matrix::Dense<value_type>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using T = gko::to_complex<value_type>;
diff --git a/reference/test/preconditioner/sor_kernels.cpp b/reference/test/preconditioner/sor_kernels.cpp
index 18c055aa6d9..cd2fa9af364 100644
--- a/reference/test/preconditioner/sor_kernels.cpp
+++ b/reference/test/preconditioner/sor_kernels.cpp
@@ -55,7 +55,8 @@ class Sor : public ::testing::Test {
                                   exec);
 };
 
-TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sor, CanInitializeLFactor)
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index 7b79ba98ad2..8aad93a1efb 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -229,11 +229,13 @@ class Multigrid : public ::testing::Test {
     using Mtx = gko::matrix::Dense<value_type>;
     using Solver = gko::solver::Multigrid;
     using Coarse = gko::multigrid::Pgm<value_type>;
-    using CoarseNext = gko::multigrid::Pgm<gko::next_precision<value_type>>;
+    using CoarseNext =
+        gko::multigrid::Pgm<gko::next_precision_with_half<value_type>>;
     using Smoother = gko::solver::Ir<value_type>;
     using InnerSolver = gko::preconditioner::Jacobi<value_type>;
     using CoarsestSolver = gko::solver::Cg<value_type>;
-    using CoarsestNextSolver = gko::solver::Cg<next_precision<value_type>>;
+    using CoarsestNextSolver =
+        gko::solver::Cg<gko::next_precision_with_half<value_type>>;
     using DummyRPFactory = DummyMultigridLevelWithFactory<value_type>;
     using DummyFactory = DummyLinOpWithFactory<value_type>;
     Multigrid()

From 2980e46e631020eb5091f186691427e24daf2a84 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 19:02:11 +0200
Subject: [PATCH 390/448] preconditioner config dispatch

---
 core/config/preconditioner_config.cpp | 30 +++++++++++++++------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index 68cbf8595ba..a5669902d00 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -117,24 +117,28 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ic>(
         return dispatch<gko::LinOpFactory,
                         IcHelper2<solver::LowerTrs>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "solver::Ir") {
         return dispatch<gko::LinOpFactory, IcHelper1<solver::Ir>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "preconditioner::LowerIsai") {
         return dispatch<gko::LinOpFactory,
                         IcHelper2<preconditioner::LowerIsai>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "solver::Gmres") {
         return dispatch<gko::LinOpFactory,
                         IcHelper1<solver::Gmres>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else {
         GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
@@ -194,7 +198,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "solver::Ir") {
@@ -204,7 +208,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "preconditioner::LowerIsai") {
@@ -214,7 +218,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "solver::Gmres") {
@@ -224,7 +228,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else {
@@ -256,7 +260,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::lower>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "upper") {
@@ -265,7 +269,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::upper>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "general") {
@@ -274,7 +278,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::general>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "spd") {
@@ -283,7 +287,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::spd>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list()),
+                                   value_type_list_with_half()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else {
@@ -296,7 +300,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
 
 
 GKO_PARSE_VALUE_AND_INDEX_TYPE(GaussSeidel, gko::preconditioner::GaussSeidel);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Jacobi, gko::preconditioner::Jacobi);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Jacobi, gko::preconditioner::Jacobi);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor);
 
 

From 32115ca3ebd9bb5ac0e718e7de355af3a8f40404 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 19:20:59 +0200
Subject: [PATCH 391/448] reorder with half

---
 core/reorder/mc64.cpp                       | 32 ++++++++++-----------
 core/reorder/rcm.cpp                        |  2 +-
 core/reorder/scaled_reordered.cpp           |  3 +-
 core/test/reorder/amd.cpp                   |  3 +-
 reference/test/reorder/mc64.cpp             |  7 +++--
 reference/test/reorder/mc64_kernels.cpp     | 27 +++++++++++++----
 reference/test/reorder/rcm.cpp              |  3 +-
 reference/test/reorder/scaled_reordered.cpp | 26 +++++++++++++----
 test/reorder/amd.cpp                        |  3 +-
 9 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp
index 97dd37b90fc..1319dea252a 100644
--- a/core/reorder/mc64.cpp
+++ b/core/reorder/mc64.cpp
@@ -37,8 +37,7 @@ void initialize_weights(const matrix::Csr<ValueType, IndexType>* host_mtx,
                         array<remove_complex<ValueType>>& row_maxima_array,
                         gko::experimental::reorder::mc64_strategy strategy)
 {
-    constexpr auto inf =
-        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    const auto inf = std::numeric_limits<remove_complex<ValueType>>::infinity();
     const auto num_rows = host_mtx->get_size()[0];
     const auto row_ptrs = host_mtx->get_const_row_ptrs();
     const auto col_idxs = host_mtx->get_const_col_idxs();
@@ -67,11 +66,13 @@ void initialize_weights(const matrix::Csr<ValueType, IndexType>* host_mtx,
             }
         }
     };
-    if (strategy ==
-        gko::experimental::reorder::mc64_strategy::max_diagonal_sum) {
-        run_computation([](ValueType a) { return abs(a); });
+    if (strategy == mc64_strategy::max_diagonal_sum) {
+        run_computation(
+            [](ValueType a) -> remove_complex<ValueType> { return abs(a); });
     } else {
-        run_computation([](ValueType a) { return std::log2(abs(a)); });
+        run_computation([](ValueType a) -> remove_complex<ValueType> {
+            return std::log2(abs(a));
+        });
     }
 }
 
@@ -179,7 +180,7 @@ void shortest_augmenting_path(
     addressable_priority_queue<ValueType, IndexType>& queue,
     std::vector<IndexType>& q_j, ValueType tolerance)
 {
-    constexpr auto inf = std::numeric_limits<ValueType>::infinity();
+    const auto inf = std::numeric_limits<ValueType>::infinity();
     auto weights = weights_array.get_data();
     auto dual_u = dual_u_array.get_data();
     auto distance = distance_array.get_data();
@@ -433,8 +434,7 @@ void compute_scaling(const matrix::Csr<ValueType, IndexType>* host_mtx,
                      mc64_strategy strategy, ValueType* row_scaling,
                      ValueType* col_scaling)
 {
-    constexpr auto inf =
-        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    const auto inf = std::numeric_limits<remove_complex<ValueType>>::infinity();
     const auto num_rows = host_mtx->get_size()[0];
     const auto weights = weights_array.get_const_data();
     const auto dual_u = dual_u_array.get_const_data();
@@ -459,13 +459,14 @@ void compute_scaling(const matrix::Csr<ValueType, IndexType>* host_mtx,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_MC64_INITIALIZE_WEIGHTS);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_MC64_INITIAL_MATCHING);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64_COMPUTE_SCALING);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_MC64_COMPUTE_SCALING);
 
 
 }  // namespace mc64
@@ -538,8 +539,7 @@ std::unique_ptr<LinOp> Mc64<ValueType, IndexType>::generate_impl(
     marked_cols.fill(0);
     matched_idxs.fill(0);
     unmatched_rows.fill(0);
-    constexpr auto inf =
-        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    const auto inf = std::numeric_limits<remove_complex<ValueType>>::infinity();
     dual_u.fill(inf);
     distance.fill(inf);
 
@@ -588,7 +588,7 @@ std::unique_ptr<LinOp> Mc64<ValueType, IndexType>::generate_impl(
 
 
 #define GKO_DECLARE_MC64(ValueType, IndexType) class Mc64<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_MC64);
 
 
 }  // namespace reorder
diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp
index 1acf4d97f1f..0d2bae4d7dc 100644
--- a/core/reorder/rcm.cpp
+++ b/core/reorder/rcm.cpp
@@ -114,7 +114,7 @@ Rcm<ValueType, IndexType>::Rcm(const Factory* factory,
 
 
 #define GKO_DECLARE_RCM(ValueType, IndexType) class Rcm<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_RCM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_RCM);
 
 
 }  // namespace reorder
diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp
index 264122c0b8f..210e513841b 100644
--- a/core/reorder/scaled_reordered.cpp
+++ b/core/reorder/scaled_reordered.cpp
@@ -84,7 +84,8 @@ void ScaledReordered<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 
 #define GKO_DECLARE_SCALED_REORDERED(ValueType, IndexType) \
     class ScaledReordered<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_REORDERED);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_SCALED_REORDERED);
 
 
 }  // namespace reorder
diff --git a/core/test/reorder/amd.cpp b/core/test/reorder/amd.cpp
index 9eecf3777e1..b97201e929e 100644
--- a/core/test/reorder/amd.cpp
+++ b/core/test/reorder/amd.cpp
@@ -177,7 +177,8 @@ class Amd : public ::testing::Test {
     std::shared_ptr<gko::experimental::reorder::Amd<index_type>> amd;
 };
 
-TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Amd, WorksAndReducesFillIn)
diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp
index 2c64538e9b2..028c093c5f3 100644
--- a/reference/test/reorder/mc64.cpp
+++ b/reference/test/reorder/mc64.cpp
@@ -70,7 +70,8 @@ class Mc64 : public ::testing::Test {
     std::unique_ptr<reorder_type> mc64_factory;
 };
 
-TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Mc64, HasSensibleDefaults)
@@ -86,11 +87,13 @@ TYPED_TEST(Mc64, HasSensibleDefaults)
 TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy)
 {
     using reorder_type = typename TestFixture::reorder_type;
+    using real_type = typename TestFixture::real_type;
 
     auto mc64 =
         reorder_type::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
+            .with_tolerance(real_type{1e-4})
             .on(this->exec)
             ->generate(this->not_id3_mtx);
 
@@ -123,7 +126,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithTolerance)
     using real_type = typename TestFixture::real_type;
 
     auto mc64 = reorder_type::build()
-                    .with_tolerance(real_type{1e-10})
+                    .with_tolerance(real_type{1e-4})
                     .on(this->exec)
                     ->generate(this->id3_mtx);
 
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
index fb20d4af2c8..81dd1aa59a1 100644
--- a/reference/test/reorder/mc64_kernels.cpp
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -12,6 +12,7 @@
 
 #include <gtest/gtest.h>
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
@@ -118,7 +119,7 @@ class Mc64 : public ::testing::Test {
                                             {0., 0., 0., 4., 2., 0.},
                                             {0., 5., 8., 0., 0., 0.}},
                                            ref)),
-          zero_tol{1e-14}
+          zero_tol{1e-4}
     {}
 
     std::pair<std::shared_ptr<const perm_type>,
@@ -134,8 +135,8 @@ class Mc64 : public ::testing::Test {
     {
         ASSERT_EQ(a.get_size(), b.get_size());
         for (gko::size_type i = 0; i < a.get_size(); i++) {
-            if (std::isfinite(a.get_const_data()[i]) ||
-                std::isfinite(b.get_const_data()[i])) {
+            if (gko::is_finite(a.get_const_data()[i]) ||
+                gko::is_finite(b.get_const_data()[i])) {
                 ASSERT_NEAR(a.get_const_data()[i], b.get_const_data()[i],
                             r<value_type>::value)
                     << name << '[' << i << ']';
@@ -180,7 +181,8 @@ class Mc64 : public ::testing::Test {
     const real_type zero_tol;
 };
 
-TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Mc64, InitializeWeightsSum)
@@ -284,6 +286,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum)
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
+            .with_tolerance(real_type{1e-4})
             .on(this->ref);
 
     auto mc64 = mc64_factory->generate(this->mtx);
@@ -303,10 +306,12 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
+    using real_type = typename TestFixture::real_type;
     auto mc64_factory =
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_product)
+            .with_tolerance(real_type{1e-4})
             .on(this->ref);
     auto mc64 = mc64_factory->generate(this->mtx);
 
@@ -344,6 +349,12 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct)
     using value_type = typename TestFixture::value_type;
     using matrix_type = typename TestFixture::matrix_type;
     using perm_type = typename TestFixture::perm_type;
+    // A few scaling factors are zero, which gives (inf, -nan) in inv_scaling
+    // for complex values. Depending on the compiler and optimization level,
+    // value / (inf, -nan) gives either (0, 0), which passes the test under the
+    // threshold, or (nan, nan), which fails. We skip not only complex<half>
+    // but also half, because it relies on value / inf in half precision.
+    SKIP_IF_HALF(value_type);
     // read input data
     std::ifstream mtx_stream{gko::matrices::location_1138_bus_mtx};
     auto mtx = gko::share(gko::read<matrix_type>(mtx_stream, this->ref));
@@ -354,6 +365,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct)
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_product)
+            .with_tolerance(real_type{1e-4})
             .on(this->ref);
     auto mc64 = mc64_factory->generate(mtx);
     // get components
@@ -362,7 +374,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct)
 
     mtx = mtx->scale_permute(row_perm, col_perm);
 
-    GKO_ASSERT_MTX_NEAR(mtx, expected_result, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(mtx, expected_result, 20 * r<value_type>::value);
 }
 
 
@@ -373,6 +385,11 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeExampleProduct)
     using value_type = typename TestFixture::value_type;
     using matrix_type = typename TestFixture::matrix_type;
     using perm_type = typename TestFixture::perm_type;
+    // Some values are so small that log2(abs(v)) -> -inf, and some values are
+    // out of the half-precision range -> inf. This leads to some permutation
+    // values being invalid_index after the kernel, such that scale_permute
+    // causes a segmentation fault.
+    SKIP_IF_HALF(value_type);
     // read input data
     std::ifstream mtx_stream{gko::matrices::location_nontrivial_mc64_example};
     auto mtx = gko::share(gko::read<matrix_type>(mtx_stream, this->ref));
diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp
index ec547c141e3..ae63ca504bb 100644
--- a/reference/test/reorder/rcm.cpp
+++ b/reference/test/reorder/rcm.cpp
@@ -54,7 +54,8 @@ class Rcm : public ::testing::Test {
     std::unique_ptr<reorder_type> reorder_op;
 };
 
-TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Rcm, CanBeCleared)
diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp
index 75ab3728a30..b9924cd9418 100644
--- a/reference/test/reorder/scaled_reordered.cpp
+++ b/reference/test/reorder/scaled_reordered.cpp
@@ -132,7 +132,7 @@ class ScaledReordered : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
@@ -364,6 +364,9 @@ TYPED_TEST(ScaledReordered, AppliesWithRcmReordering)
 TYPED_TEST(ScaledReordered, SolvesSingleRhsWithOnlyInnerOperator)
 {
     using SR = typename TestFixture::SR;
+    using value_type = typename TestFixture::value_type;
+    // Need to solve them with scaling when using half
+    SKIP_IF_HALF(value_type);
     auto scaled_reordered_fact =
         SR::build().with_inner_operator(this->solver_factory).on(this->exec);
     auto scaled_reordered = scaled_reordered_fact->generate(this->rcm_mtx);
@@ -410,6 +413,9 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithColScaling)
 TYPED_TEST(ScaledReordered, SolvesSingleRhsWithRcmReordering)
 {
     using SR = typename TestFixture::SR;
+    using value_type = typename TestFixture::value_type;
+    // Need to solve them with scaling when using half
+    SKIP_IF_HALF(value_type);
     auto scaled_reordered_fact = SR::build()
                                      .with_reordering(this->rcm_factory)
                                      .with_inner_operator(this->solver_factory)
@@ -445,7 +451,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed)
 {
     using SR = typename TestFixture::SR;
     using T = typename TestFixture::value_type;
-    using Vec = gko::matrix::Dense<gko::next_precision<T>>;
+    using OtherT = gko::next_precision_with_half<T>;
+    using Vec = gko::matrix::Dense<OtherT>;
     auto scaled_reordered_fact = SR::build()
                                      .with_row_scaling(this->diag2)
                                      .with_col_scaling(this->diag3)
@@ -459,7 +466,10 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed)
 
     scaled_reordered->apply(b, res);
 
-    GKO_ASSERT_MTX_NEAR(res, x, 1e-5);
+    auto tol = std::max(static_cast<double>(r<OtherT>::value),
+                        static_cast<double>(r<T>::value)) *
+               15;
+    GKO_ASSERT_MTX_NEAR(res, x, tol);
 }
 
 
@@ -467,6 +477,7 @@ TYPED_TEST(ScaledReordered, AdvancedSolvesSingleRhsWithScalingAndRcmReordering)
 {
     using SR = typename TestFixture::SR;
     using Vec = typename TestFixture::Vec;
+    using T = typename TestFixture::value_type;
     const auto alpha = gko::initialize<Vec>({2.0}, this->exec);
     const auto beta = gko::initialize<Vec>({-1.0}, this->exec);
     auto scaled_reordered_fact = SR::build()
@@ -489,8 +500,8 @@ TYPED_TEST(ScaledReordered,
 {
     using SR = typename TestFixture::SR;
     using T = typename TestFixture::value_type;
-    using value_type = gko::next_precision<T>;
-    using Vec = gko::matrix::Dense<value_type>;
+    using OtherT = gko::next_precision_with_half<T>;
+    using Vec = gko::matrix::Dense<OtherT>;
     auto scaled_reordered_fact = SR::build()
                                      .with_row_scaling(this->diag2)
                                      .with_col_scaling(this->diag3)
@@ -506,7 +517,10 @@ TYPED_TEST(ScaledReordered,
 
     scaled_reordered->apply(alpha, b, beta, res);
 
-    GKO_ASSERT_MTX_NEAR(res, l({-8.3, -12.5, -5.9, -2., 2.9}), 1e-5);
+    auto tol = std::max(static_cast<double>(r<OtherT>::value),
+                        static_cast<double>(r<T>::value)) *
+               15;
+    GKO_ASSERT_MTX_NEAR(res, l({-8.3, -12.5, -5.9, -2., 2.9}), tol);
 }
 
 
diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp
index a1ca7c09359..f5a17e943e1 100644
--- a/test/reorder/amd.cpp
+++ b/test/reorder/amd.cpp
@@ -40,7 +40,8 @@ class Amd : public CommonTestFixture {
     std::shared_ptr<matrix_type> dmtx;
 };
 
-TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(Amd, IsEquivalentToRef)

From e9ec66b0db3f06327382c43a0e2d5e1403920e7d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 12 Nov 2024 16:32:21 +0100
Subject: [PATCH 392/448] change the default mc64 tolerance with respect to
 precision

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 include/ginkgo/core/reorder/mc64.hpp    |  6 ++++--
 reference/test/reorder/mc64.cpp         | 10 ++++------
 reference/test/reorder/mc64_kernels.cpp |  5 +----
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/include/ginkgo/core/reorder/mc64.hpp b/include/ginkgo/core/reorder/mc64.hpp
index b2c1fd1a644..82b4f8f5be5 100644
--- a/include/ginkgo/core/reorder/mc64.hpp
+++ b/include/ginkgo/core/reorder/mc64.hpp
@@ -6,6 +6,7 @@
 #define GKO_PUBLIC_CORE_REORDER_MC64_HPP_
 
 
+#include <limits>
 #include <memory>
 
 #include <ginkgo/core/base/abstract_factory.hpp>
@@ -100,8 +101,9 @@ class Mc64 final
          * This parameter controls the tolerance below which a weight is
          * considered to be zero.
          */
-        remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(tolerance,
-                                                               1e-14);
+        remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(
+            tolerance,
+            50 * std::numeric_limits<remove_complex<ValueType>>::epsilon());
     };
 
     /**
diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp
index 028c093c5f3..0670c77f6e2 100644
--- a/reference/test/reorder/mc64.cpp
+++ b/reference/test/reorder/mc64.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <fstream>
+#include <limits>
 #include <memory>
 
 #include <gtest/gtest.h>
@@ -80,7 +81,8 @@ TYPED_TEST(Mc64, HasSensibleDefaults)
 
     ASSERT_EQ(this->mc64_factory->get_parameters().strategy,
               gko::experimental::reorder::mc64_strategy::max_diagonal_product);
-    ASSERT_EQ(this->mc64_factory->get_parameters().tolerance, real_type{1e-14});
+    ASSERT_EQ(this->mc64_factory->get_parameters().tolerance,
+              50 * std::numeric_limits<real_type>::epsilon());
 }
 
 
@@ -93,7 +95,6 @@ TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy)
         reorder_type::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
-            .with_tolerance(real_type{1e-4})
             .on(this->exec)
             ->generate(this->not_id3_mtx);
 
@@ -125,10 +126,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithTolerance)
     using reorder_type = typename TestFixture::reorder_type;
     using real_type = typename TestFixture::real_type;
 
-    auto mc64 = reorder_type::build()
-                    .with_tolerance(real_type{1e-4})
-                    .on(this->exec)
-                    ->generate(this->id3_mtx);
+    auto mc64 = reorder_type::build().on(this->exec)->generate(this->id3_mtx);
 
     this->assert_correct_permutation(mc64.get());
 }
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
index 81dd1aa59a1..f31bf7ba658 100644
--- a/reference/test/reorder/mc64_kernels.cpp
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -119,7 +119,7 @@ class Mc64 : public ::testing::Test {
                                             {0., 0., 0., 4., 2., 0.},
                                             {0., 5., 8., 0., 0., 0.}},
                                            ref)),
-          zero_tol{1e-4}
+          zero_tol{50 * std::numeric_limits<real_type>::epsilon()}
     {}
 
     std::pair<std::shared_ptr<const perm_type>,
@@ -286,7 +286,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum)
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
-            .with_tolerance(real_type{1e-4})
             .on(this->ref);
 
     auto mc64 = mc64_factory->generate(this->mtx);
@@ -311,7 +310,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct)
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_product)
-            .with_tolerance(real_type{1e-4})
             .on(this->ref);
     auto mc64 = mc64_factory->generate(this->mtx);
 
@@ -365,7 +363,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct)
         gko::experimental::reorder::Mc64<value_type, index_type>::build()
             .with_strategy(
                 gko::experimental::reorder::mc64_strategy::max_diagonal_product)
-            .with_tolerance(real_type{1e-4})
             .on(this->ref);
     auto mc64 = mc64_factory->generate(mtx);
     // get components

From 38003495c5ee9be4611b234ce6e2688265401b41 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 28 Oct 2024 18:46:18 +0100
Subject: [PATCH 393/448] log with half

---
 core/log/convergence.cpp           |  2 +-
 core/log/papi.cpp                  |  2 +-
 core/log/solver_progress.cpp       |  8 ++++
 core/log/stream.cpp                |  2 +-
 core/test/log/convergence.cpp      |  3 +-
 core/test/log/papi.cpp             |  2 +-
 core/test/log/solver_progress.cpp  |  3 +-
 core/test/log/stream.cpp           | 72 +++++++++++++++---------------
 reference/test/log/convergence.cpp |  3 +-
 reference/test/log/papi.cpp        |  2 +-
 10 files changed, 55 insertions(+), 44 deletions(-)

diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
index 7cfa764dfd1..78f004226cb 100644
--- a/core/log/convergence.cpp
+++ b/core/log/convergence.cpp
@@ -110,7 +110,7 @@ void Convergence<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_CONVERGENCE(_type) class Convergence<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONVERGENCE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CONVERGENCE);
 
 
 }  // namespace log
diff --git a/core/log/papi.cpp b/core/log/papi.cpp
index 5ced377ca38..b5c56527687 100644
--- a/core/log/papi.cpp
+++ b/core/log/papi.cpp
@@ -279,7 +279,7 @@ void Papi<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_PAPI(_type) class Papi<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PAPI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PAPI);
 
 
 }  // namespace log
diff --git a/core/log/solver_progress.cpp b/core/log/solver_progress.cpp
index effa0279bba..4d1566e159f 100644
--- a/core/log/solver_progress.cpp
+++ b/core/log/solver_progress.cpp
@@ -247,6 +247,14 @@ class SolverProgressStore : public SolverProgress {
         run<gko::matrix::Dense<double>, gko::matrix::Dense<float>,
             gko::matrix::Dense<std::complex<double>>,
             gko::matrix::Dense<std::complex<float>>,
+#if GINKGO_ENABLE_HALF
+            gko::matrix::Dense<gko::half>,
+            gko::matrix::Dense<std::complex<gko::half>>,
+            gko::WritableToMatrixData<gko::half, int32>,
+            gko::WritableToMatrixData<std::complex<gko::half>, int32>,
+            gko::WritableToMatrixData<gko::half, int64>,
+            gko::WritableToMatrixData<std::complex<gko::half>, int64>,
+#endif
             // fallback for other matrix types
             gko::WritableToMatrixData<double, int32>,
             gko::WritableToMatrixData<float, int32>,
diff --git a/core/log/stream.cpp b/core/log/stream.cpp
index 5e510d409e2..69eef2e0949 100644
--- a/core/log/stream.cpp
+++ b/core/log/stream.cpp
@@ -482,7 +482,7 @@ void Stream<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_STREAM(_type) class Stream<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_STREAM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_STREAM);
 
 
 }  // namespace log
diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp
index 8fff0c17b8e..64ec37e8942 100644
--- a/core/test/log/convergence.cpp
+++ b/core/test/log/convergence.cpp
@@ -45,7 +45,8 @@ class Convergence : public ::testing::Test {
     gko::array<gko::stopping_status> status = {exec, 1};
 };
 
-TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Convergence, CanGetEmptyData)
diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp
index 8278120cc49..e0404b04d90 100644
--- a/core/test/log/papi.cpp
+++ b/core/test/log/papi.cpp
@@ -91,7 +91,7 @@ class Papi : public ::testing::Test {
     int eventset;
 };
 
-TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Papi, CatchesAllocationStarted)
diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp
index e00044a908d..2b4a6ac599c 100644
--- a/core/test/log/solver_progress.cpp
+++ b/core/test/log/solver_progress.cpp
@@ -68,7 +68,8 @@ class SolverProgress : public ::testing::Test {
     std::unique_ptr<Cg> solver;
 };
 
-TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(SolverProgress, TableWorks)
diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp
index 995a9975b89..7f4b41e5cc3 100644
--- a/core/test/log/stream.cpp
+++ b/core/test/log/stream.cpp
@@ -26,7 +26,7 @@ constexpr int num_iters = 10;
 template <typename T>
 class Stream : public ::testing::Test {};
 
-TYPED_TEST_SUITE(Stream, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Stream, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Stream, CatchesAllocationStarted)
@@ -380,17 +380,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose)
     std::stringstream out;
     auto logger = gko::log::Stream<TypeParam>::create(
         gko::log::Logger::linop_apply_started_mask, out, true);
-    auto A = gko::initialize<Dense>({1.1}, exec);
-    auto b = gko::initialize<Dense>({-2.2}, exec);
-    auto x = gko::initialize<Dense>({3.3}, exec);
+    auto A = gko::initialize<Dense>({1.5}, exec);
+    auto b = gko::initialize<Dense>({-2.25}, exec);
+    auto x = gko::initialize<Dense>({3.125}, exec);
 
     logger->template on<gko::log::Logger::linop_apply_started>(A.get(), b.get(),
                                                                x.get());
 
     auto os = out.str();
-    GKO_ASSERT_STR_CONTAINS(os, "1.1");
-    GKO_ASSERT_STR_CONTAINS(os, "-2.2");
-    GKO_ASSERT_STR_CONTAINS(os, "3.3");
+    GKO_ASSERT_STR_CONTAINS(os, "1.5");
+    GKO_ASSERT_STR_CONTAINS(os, "-2.25");
+    GKO_ASSERT_STR_CONTAINS(os, "3.125");
 }
 
 
@@ -429,17 +429,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose)
     std::stringstream out;
     auto logger = gko::log::Stream<TypeParam>::create(
         gko::log::Logger::linop_apply_completed_mask, out, true);
-    auto A = gko::initialize<Dense>({1.1}, exec);
-    auto b = gko::initialize<Dense>({-2.2}, exec);
-    auto x = gko::initialize<Dense>({3.3}, exec);
+    auto A = gko::initialize<Dense>({1.5}, exec);
+    auto b = gko::initialize<Dense>({-2.25}, exec);
+    auto x = gko::initialize<Dense>({3.125}, exec);
 
     logger->template on<gko::log::Logger::linop_apply_completed>(
         A.get(), b.get(), x.get());
 
     auto os = out.str();
-    GKO_ASSERT_STR_CONTAINS(os, "1.1");
-    GKO_ASSERT_STR_CONTAINS(os, "-2.2");
-    GKO_ASSERT_STR_CONTAINS(os, "3.3");
+    GKO_ASSERT_STR_CONTAINS(os, "1.5");
+    GKO_ASSERT_STR_CONTAINS(os, "-2.25");
+    GKO_ASSERT_STR_CONTAINS(os, "3.125");
 }
 
 
@@ -486,21 +486,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose)
     std::stringstream out;
     auto logger = gko::log::Stream<TypeParam>::create(
         gko::log::Logger::linop_advanced_apply_started_mask, out, true);
-    auto A = gko::initialize<Dense>({1.1}, exec);
-    auto alpha = gko::initialize<Dense>({-4.4}, exec);
-    auto b = gko::initialize<Dense>({-2.2}, exec);
+    auto A = gko::initialize<Dense>({1.5}, exec);
+    auto alpha = gko::initialize<Dense>({-4.75}, exec);
+    auto b = gko::initialize<Dense>({-2.25}, exec);
     auto beta = gko::initialize<Dense>({-5.5}, exec);
-    auto x = gko::initialize<Dense>({3.3}, exec);
+    auto x = gko::initialize<Dense>({3.125}, exec);
 
     logger->template on<gko::log::Logger::linop_advanced_apply_started>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
-    GKO_ASSERT_STR_CONTAINS(os, "1.1");
-    GKO_ASSERT_STR_CONTAINS(os, "-4.4");
-    GKO_ASSERT_STR_CONTAINS(os, "-2.2");
+    GKO_ASSERT_STR_CONTAINS(os, "1.5");
+    GKO_ASSERT_STR_CONTAINS(os, "-4.75");
+    GKO_ASSERT_STR_CONTAINS(os, "-2.25");
     GKO_ASSERT_STR_CONTAINS(os, "-5.5");
-    GKO_ASSERT_STR_CONTAINS(os, "3.3");
+    GKO_ASSERT_STR_CONTAINS(os, "3.125");
 }
 
 
@@ -547,21 +547,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose)
     std::stringstream out;
     auto logger = gko::log::Stream<TypeParam>::create(
         gko::log::Logger::linop_advanced_apply_completed_mask, out, true);
-    auto A = gko::initialize<Dense>({1.1}, exec);
-    auto alpha = gko::initialize<Dense>({-4.4}, exec);
-    auto b = gko::initialize<Dense>({-2.2}, exec);
+    auto A = gko::initialize<Dense>({1.5}, exec);
+    auto alpha = gko::initialize<Dense>({-4.75}, exec);
+    auto b = gko::initialize<Dense>({-2.25}, exec);
     auto beta = gko::initialize<Dense>({-5.5}, exec);
-    auto x = gko::initialize<Dense>({3.3}, exec);
+    auto x = gko::initialize<Dense>({3.125}, exec);
 
     logger->template on<gko::log::Logger::linop_advanced_apply_completed>(
         A.get(), alpha.get(), b.get(), beta.get(), x.get());
 
     auto os = out.str();
-    GKO_ASSERT_STR_CONTAINS(os, "1.1");
-    GKO_ASSERT_STR_CONTAINS(os, "-4.4");
-    GKO_ASSERT_STR_CONTAINS(os, "-2.2");
+    GKO_ASSERT_STR_CONTAINS(os, "1.5");
+    GKO_ASSERT_STR_CONTAINS(os, "-4.75");
+    GKO_ASSERT_STR_CONTAINS(os, "-2.25");
     GKO_ASSERT_STR_CONTAINS(os, "-5.5");
-    GKO_ASSERT_STR_CONTAINS(os, "3.3");
+    GKO_ASSERT_STR_CONTAINS(os, "3.125");
 }
 
 
@@ -782,11 +782,11 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose)
         gko::solver::Bicgstab<TypeParam>::build()
             .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
-    auto solver = factory->generate(gko::initialize<Dense>({1.1}, exec));
+    auto solver = factory->generate(gko::initialize<Dense>({1.25}, exec));
     auto right_hand_side = gko::initialize<Dense>({-5.5}, exec);
-    auto residual = gko::initialize<Dense>({-4.4}, exec);
-    auto solution = gko::initialize<Dense>({-2.2}, exec);
-    auto residual_norm = gko::initialize<Dense>({-3.3}, exec);
+    auto residual = gko::initialize<Dense>({-4.5}, exec);
+    auto solution = gko::initialize<Dense>({-2.25}, exec);
+    auto residual_norm = gko::initialize<Dense>({-3.125}, exec);
     gko::array<gko::stopping_status> stop_status(exec, 1);
 
     logger->template on<gko::log::Logger::iteration_complete>(
@@ -795,9 +795,9 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose)
 
     auto os = out.str();
     GKO_ASSERT_STR_CONTAINS(os, "-5.5");
-    GKO_ASSERT_STR_CONTAINS(os, "-4.4");
-    GKO_ASSERT_STR_CONTAINS(os, "-2.2");
-    GKO_ASSERT_STR_CONTAINS(os, "-3.3");
+    GKO_ASSERT_STR_CONTAINS(os, "-4.5");
+    GKO_ASSERT_STR_CONTAINS(os, "-2.25");
+    GKO_ASSERT_STR_CONTAINS(os, "-3.125");
     GKO_ASSERT_STR_CONTAINS(os, "Finalized:")
 }
 
diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp
index 50db0db49c4..70fc004c030 100644
--- a/reference/test/log/convergence.cpp
+++ b/reference/test/log/convergence.cpp
@@ -19,7 +19,8 @@ namespace {
 template <typename T>
 class Convergence : public ::testing::Test {};
 
-TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Convergence, CatchesCriterionCheckCompleted)
diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp
index 4f1d9e469f1..647a14af9b2 100644
--- a/reference/test/log/papi.cpp
+++ b/reference/test/log/papi.cpp
@@ -83,7 +83,7 @@ class Papi : public ::testing::Test {
     int eventset;
 };
 
-TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Papi, CatchesCriterionCheckCompleted)

From cb41bcb78b7ed5d4bdaa7f2bd1f9c11229805071 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 16:58:44 +0100
Subject: [PATCH 394/448] dispatch with distributed needs to throw with half

---
 core/distributed/helpers.hpp                  |  14 +-
 core/multigrid/pgm.cpp                        | 266 +++++++++---------
 core/solver/multigrid.cpp                     | 174 ++++++++----
 .../ginkgo/core/base/precision_dispatch.hpp   | 143 ++++++----
 4 files changed, 350 insertions(+), 247 deletions(-)

diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp
index 5536dbe32f0..9ce7d3b6ab4 100644
--- a/core/distributed/helpers.hpp
+++ b/core/distributed/helpers.hpp
@@ -122,11 +122,15 @@ void vector_dispatch(T* linop, F&& f, Args&&... args)
 {
 #if GINKGO_BUILD_MPI
     if (is_distributed(linop)) {
-        using type = std::conditional_t<
-            std::is_const<T>::value,
-            const experimental::distributed::Vector<ValueType>,
-            experimental::distributed::Vector<ValueType>>;
-        f(dynamic_cast<type*>(linop), std::forward<Args>(args)...);
+        if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+            GKO_NOT_SUPPORTED(linop);
+        } else {
+            using type = std::conditional_t<
+                std::is_const<T>::value,
+                const experimental::distributed::Vector<ValueType>,
+                experimental::distributed::Vector<ValueType>>;
+            f(dynamic_cast<type*>(linop), std::forward<Args>(args)...);
+        }
     } else
 #endif
     {
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index e531fb2b996..d4e4ffde4de 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -389,137 +389,147 @@ void Pgm<ValueType, IndexType>::generate()
 #if GINKGO_BUILD_MPI
     if (std::dynamic_pointer_cast<
             const experimental::distributed::DistributedBase>(system_matrix_)) {
-        auto convert_fine_op = [&](auto matrix) {
-            using global_index_type = typename std::decay_t<
-                decltype(*matrix)>::result_type::global_index_type;
-            auto exec = as<LinOp>(matrix)->get_executor();
-            auto comm = as<experimental::distributed::DistributedBase>(matrix)
-                            ->get_communicator();
-            auto fine = share(
-                experimental::distributed::
-                    Matrix<ValueType, IndexType, global_index_type>::create(
-                        exec, comm,
-                        matrix::Csr<ValueType, IndexType>::create(exec),
-                        matrix::Csr<ValueType, IndexType>::create(exec)));
-            matrix->convert_to(fine);
-            this->set_fine_op(fine);
-        };
-        auto setup_fine_op = [&](auto matrix) {
-            // Only support csr matrix currently.
-            auto local_csr = std::dynamic_pointer_cast<const csr_type>(
-                matrix->get_local_matrix());
-            auto non_local_csr = std::dynamic_pointer_cast<const csr_type>(
-                matrix->get_non_local_matrix());
-            // If system matrix is not csr or need sorting, generate the csr.
-            if (!parameters_.skip_sorting || !local_csr || !non_local_csr) {
+        if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+            GKO_NOT_SUPPORTED(nullptr);
+        } else {
+            auto convert_fine_op = [&](auto matrix) {
+                using global_index_type = typename std::decay_t<
+                    decltype(*matrix)>::result_type::global_index_type;
+                auto exec = as<LinOp>(matrix)->get_executor();
+                auto comm =
+                    as<experimental::distributed::DistributedBase>(matrix)
+                        ->get_communicator();
+                auto fine = share(
+                    experimental::distributed::
+                        Matrix<ValueType, IndexType, global_index_type>::create(
+                            exec, comm,
+                            matrix::Csr<ValueType, IndexType>::create(exec),
+                            matrix::Csr<ValueType, IndexType>::create(exec)));
+                matrix->convert_to(fine);
+                this->set_fine_op(fine);
+            };
+            auto setup_fine_op = [&](auto matrix) {
+                // Only support csr matrix currently.
+                auto local_csr = std::dynamic_pointer_cast<const csr_type>(
+                    matrix->get_local_matrix());
+                auto non_local_csr = std::dynamic_pointer_cast<const csr_type>(
+                    matrix->get_non_local_matrix());
+                // If system matrix is not csr or need sorting, generate the
+                // csr.
+                if (!parameters_.skip_sorting || !local_csr || !non_local_csr) {
+                    using global_index_type = typename std::decay_t<
+                        decltype(*matrix)>::global_index_type;
+                    convert_fine_op(
+                        as<ConvertibleTo<experimental::distributed::Matrix<
+                            ValueType, IndexType, global_index_type>>>(matrix));
+                }
+            };
+
+            using fst_mtx_type =
+                experimental::distributed::Matrix<ValueType, IndexType,
+                                                  IndexType>;
+            using snd_mtx_type =
+                experimental::distributed::Matrix<ValueType, IndexType, int64>;
+            // setup the fine op using Csr with current ValueType
+            // we do not use dispatcher run in the first place because we have
+            // the fallback option for that.
+            if (auto obj = std::dynamic_pointer_cast<const fst_mtx_type>(
+                    system_matrix_)) {
+                setup_fine_op(obj);
+            } else if (auto obj = std::dynamic_pointer_cast<const snd_mtx_type>(
+                           system_matrix_)) {
+                setup_fine_op(obj);
+            } else {
+                // handle other ValueTypes.
+                run<ConvertibleTo, fst_mtx_type, snd_mtx_type>(obj,
+                                                               convert_fine_op);
+            }
+
+            auto distributed_setup = [&](auto matrix) {
+                auto exec = gko::as<LinOp>(matrix)->get_executor();
+                auto comm =
+                    gko::as<experimental::distributed::DistributedBase>(matrix)
+                        ->get_communicator();
+                auto num_rank = comm.size();
+                auto pgm_local_op =
+                    gko::as<const csr_type>(matrix->get_local_matrix());
+                auto result = this->generate_local(pgm_local_op);
+
+                auto non_local_csr =
+                    as<const csr_type>(matrix->get_non_local_matrix());
+                auto non_local_size = non_local_csr->get_size()[1];
+                array<IndexType> non_local_agg(exec, non_local_size);
+                // get agg information (prolong_row_gather row idx)
+                communicate(matrix, agg_, non_local_agg);
+                // generate non_local_col_map
+                non_local_agg.set_executor(exec->get_master());
+                array<IndexType> non_local_col_map(exec->get_master(),
+                                                   non_local_size);
+                // add additional entry in tail such that the offset easily
+                // handle it.
+                array<IndexType> renumber(exec->get_master(),
+                                          non_local_size + 1);
+                auto recv_offsets = matrix->recv_offsets_;
+                generate_non_local_map(recv_offsets, non_local_agg,
+                                       non_local_col_map, renumber);
+
+                // get new recv_size and recv_offsets
+                std::vector<experimental::distributed::comm_index_type>
+                    new_recv_size(num_rank);
+                std::vector<experimental::distributed::comm_index_type>
+                    new_recv_offsets(num_rank + 1);
+                array<IndexType> new_recv_gather_idxs(exec->get_master());
+                compute_communication(recv_offsets, non_local_agg, renumber,
+                                      new_recv_size, new_recv_offsets,
+                                      new_recv_gather_idxs);
+
+                non_local_col_map.set_executor(exec);
+                IndexType non_local_num_agg = new_recv_gather_idxs.get_size();
+                // build csr from row and col map
+                // unlike non-distributed version, generate_coarse uses
+                // different row and col maps.
+                auto result_non_local_csr = generate_coarse(
+                    exec, non_local_csr.get(),
+                    static_cast<IndexType>(std::get<1>(result)->get_size()[0]),
+                    agg_, non_local_num_agg, non_local_col_map);
+                // use local and non-local to build coarse matrix
+                // also restriction and prolongation (Local-only-global matrix)
+                auto coarse_size =
+                    static_cast<int64>(std::get<1>(result)->get_size()[0]);
+                comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM);
+                new_recv_gather_idxs.set_executor(exec);
+
+                // setup the generated linop.
                 using global_index_type =
                     typename std::decay_t<decltype(*matrix)>::global_index_type;
-                convert_fine_op(
-                    as<ConvertibleTo<experimental::distributed::Matrix<
-                        ValueType, IndexType, global_index_type>>>(matrix));
-            }
-        };
-
-        using fst_mtx_type =
-            experimental::distributed::Matrix<ValueType, IndexType, IndexType>;
-        using snd_mtx_type =
-            experimental::distributed::Matrix<ValueType, IndexType, int64>;
-        // setup the fine op using Csr with current ValueType
-        // we do not use dispatcher run in the first place because we have the
-        // fallback option for that.
-        if (auto obj =
-                std::dynamic_pointer_cast<const fst_mtx_type>(system_matrix_)) {
-            setup_fine_op(obj);
-        } else if (auto obj = std::dynamic_pointer_cast<const snd_mtx_type>(
-                       system_matrix_)) {
-            setup_fine_op(obj);
-        } else {
-            // handle other ValueTypes.
-            run<ConvertibleTo, fst_mtx_type, snd_mtx_type>(obj,
-                                                           convert_fine_op);
+                auto coarse = share(
+                    experimental::distributed::
+                        Matrix<ValueType, IndexType, global_index_type>::create(
+                            exec, comm, gko::dim<2>(coarse_size, coarse_size),
+                            std::get<1>(result), result_non_local_csr,
+                            new_recv_size, new_recv_offsets,
+                            new_recv_gather_idxs));
+                auto restrict_op = share(
+                    experimental::distributed::
+                        Matrix<ValueType, IndexType, global_index_type>::create(
+                            exec, comm,
+                            dim<2>(coarse_size,
+                                   gko::as<LinOp>(matrix)->get_size()[0]),
+                            std::get<2>(result)));
+                auto prolong_op = share(
+                    experimental::distributed::
+                        Matrix<ValueType, IndexType, global_index_type>::create(
+                            exec, comm,
+                            dim<2>(gko::as<LinOp>(matrix)->get_size()[0],
+                                   coarse_size),
+                            std::get<0>(result)));
+                this->set_multigrid_level(prolong_op, coarse, restrict_op);
+            };
+
+            // the fine op is using csr with the current ValueType
+            run<fst_mtx_type, snd_mtx_type>(this->get_fine_op(),
+                                            distributed_setup);
         }
-
-        auto distributed_setup = [&](auto matrix) {
-            auto exec = gko::as<LinOp>(matrix)->get_executor();
-            auto comm =
-                gko::as<experimental::distributed::DistributedBase>(matrix)
-                    ->get_communicator();
-            auto num_rank = comm.size();
-            auto pgm_local_op =
-                gko::as<const csr_type>(matrix->get_local_matrix());
-            auto result = this->generate_local(pgm_local_op);
-
-            auto non_local_csr =
-                as<const csr_type>(matrix->get_non_local_matrix());
-            auto non_local_size = non_local_csr->get_size()[1];
-            array<IndexType> non_local_agg(exec, non_local_size);
-            // get agg information (prolong_row_gather row idx)
-            communicate(matrix, agg_, non_local_agg);
-            // generate non_local_col_map
-            non_local_agg.set_executor(exec->get_master());
-            array<IndexType> non_local_col_map(exec->get_master(),
-                                               non_local_size);
-            // add additional entry in tail such that the offset easily handle
-            // it.
-            array<IndexType> renumber(exec->get_master(), non_local_size + 1);
-            auto recv_offsets = matrix->recv_offsets_;
-            generate_non_local_map(recv_offsets, non_local_agg,
-                                   non_local_col_map, renumber);
-
-            // get new recv_size and recv_offsets
-            std::vector<experimental::distributed::comm_index_type>
-                new_recv_size(num_rank);
-            std::vector<experimental::distributed::comm_index_type>
-                new_recv_offsets(num_rank + 1);
-            array<IndexType> new_recv_gather_idxs(exec->get_master());
-            compute_communication(recv_offsets, non_local_agg, renumber,
-                                  new_recv_size, new_recv_offsets,
-                                  new_recv_gather_idxs);
-
-            non_local_col_map.set_executor(exec);
-            IndexType non_local_num_agg = new_recv_gather_idxs.get_size();
-            // build csr from row and col map
-            // unlike non-distributed version, generate_coarse uses different
-            // row and col maps.
-            auto result_non_local_csr = generate_coarse(
-                exec, non_local_csr.get(),
-                static_cast<IndexType>(std::get<1>(result)->get_size()[0]),
-                agg_, non_local_num_agg, non_local_col_map);
-            // use local and non-local to build coarse matrix
-            // also restriction and prolongation (Local-only-global matrix)
-            auto coarse_size =
-                static_cast<int64>(std::get<1>(result)->get_size()[0]);
-            comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM);
-            new_recv_gather_idxs.set_executor(exec);
-
-            // setup the generated linop.
-            using global_index_type =
-                typename std::decay_t<decltype(*matrix)>::global_index_type;
-            auto coarse = share(
-                experimental::distributed::
-                    Matrix<ValueType, IndexType, global_index_type>::create(
-                        exec, comm, gko::dim<2>(coarse_size, coarse_size),
-                        std::get<1>(result), result_non_local_csr,
-                        new_recv_size, new_recv_offsets, new_recv_gather_idxs));
-            auto restrict_op = share(
-                experimental::distributed::
-                    Matrix<ValueType, IndexType, global_index_type>::create(
-                        exec, comm,
-                        dim<2>(coarse_size,
-                               gko::as<LinOp>(matrix)->get_size()[0]),
-                        std::get<2>(result)));
-            auto prolong_op = share(
-                experimental::distributed::
-                    Matrix<ValueType, IndexType, global_index_type>::create(
-                        exec, comm,
-                        dim<2>(gko::as<LinOp>(matrix)->get_size()[0],
-                               coarse_size),
-                        std::get<0>(result)));
-            this->set_multigrid_level(prolong_op, coarse, restrict_op);
-        };
-
-        // the fine op is using csr with the current ValueType
-        run<fst_mtx_type, snd_mtx_type>(this->get_fine_op(), distributed_setup);
     } else
 #endif  // GINKGO_BUILD_MPI
     {
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 0b918a13897..9b2a4a814e1 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -101,11 +101,16 @@ void handle_list(
         auto exec = matrix->get_executor();
 #if GINKGO_BUILD_MPI
         if (gko::detail::is_distributed(matrix.get())) {
-            using experimental::distributed::Matrix;
-            return run<Matrix<ValueType, int32, int32>,
-                       Matrix<ValueType, int32, int64>,
-                       Matrix<ValueType, int64, int64>>(
-                matrix, [exec, iteration, relaxation_factor](auto matrix) {
+            if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+                GKO_NOT_SUPPORTED(matrix);
+            } else {
+                using experimental::distributed::Matrix;
+                return run<Matrix<ValueType, int32, int32>,
+                           Matrix<ValueType, int32, int64>,
+                           Matrix<ValueType, int64,
+                                  int64>>(matrix, [exec, iteration,
+                                                   relaxation_factor](
+                                                      auto matrix) {
                     using Mtx = typename decltype(matrix)::element_type;
                     return share(
                         build_smoother(
@@ -119,6 +124,7 @@ void handle_list(
                             iteration, casting<ValueType>(relaxation_factor))
                             ->generate(matrix));
                 });
+            }
         }
 #endif
         return share(build_smoother(preconditioner::Jacobi<ValueType>::build()
@@ -330,30 +336,37 @@ void MultigridState::generate(const LinOp* system_matrix_in,
                 if (gko::detail::is_distributed(system_matrix_in)) {
                     using value_type =
                         typename std::decay_t<decltype(*mg_level)>::value_type;
-                    using VectorType =
-                        experimental::distributed::Vector<value_type>;
-                    auto fine = mg_level->get_fine_op().get();
-                    auto coarse = mg_level->get_coarse_op().get();
-                    auto distributed_fine = dynamic_cast<
-                        const experimental::distributed::DistributedBase*>(
-                        fine);
-                    auto distributed_coarse = dynamic_cast<
-                        const experimental::distributed::DistributedBase*>(
-                        coarse);
-                    auto current_comm = distributed_fine->get_communicator();
-                    auto next_comm = distributed_coarse->get_communicator();
-                    auto current_local_nrows =
-                        ::gko::detail::run_matrix(fine, [](auto* fine_mat) {
-                            return fine_mat->get_local_matrix()->get_size()[0];
-                        });
-                    auto next_local_nrows =
-                        ::gko::detail::run_matrix(coarse, [](auto* coarse_mat) {
-                            return coarse_mat->get_non_local_matrix()
-                                ->get_size()[0];
-                        });
-                    this->allocate_memory<VectorType>(
-                        i, cycle, current_comm, next_comm, current_nrows,
-                        next_nrows, current_local_nrows, next_local_nrows);
+                    if constexpr (std::is_same_v<remove_complex<value_type>,
+                                                 half>) {
+                        GKO_NOT_SUPPORTED(system_matrix_in);
+                    } else {
+                        using VectorType =
+                            experimental::distributed::Vector<value_type>;
+                        auto fine = mg_level->get_fine_op().get();
+                        auto coarse = mg_level->get_coarse_op().get();
+                        auto distributed_fine = dynamic_cast<
+                            const experimental::distributed::DistributedBase*>(
+                            fine);
+                        auto distributed_coarse = dynamic_cast<
+                            const experimental::distributed::DistributedBase*>(
+                            coarse);
+                        auto current_comm =
+                            distributed_fine->get_communicator();
+                        auto next_comm = distributed_coarse->get_communicator();
+                        auto current_local_nrows =
+                            ::gko::detail::run_matrix(fine, [](auto* fine_mat) {
+                                return fine_mat->get_local_matrix()
+                                    ->get_size()[0];
+                            });
+                        auto next_local_nrows = ::gko::detail::run_matrix(
+                            coarse, [](auto* coarse_mat) {
+                                return coarse_mat->get_non_local_matrix()
+                                    ->get_size()[0];
+                            });
+                        this->allocate_memory<VectorType>(
+                            i, cycle, current_comm, next_comm, current_nrows,
+                            next_nrows, current_local_nrows, next_local_nrows);
+                    }
                 } else
 #endif
                 {
@@ -446,6 +459,32 @@ void MultigridState::allocate_memory(
         initialize<dense_vec>({-one<value_type>()}, exec));
 }
 
+#if GINKGO_ENABLE_HALF
+template <>
+void MultigridState::allocate_memory<
+    gko::experimental::distributed::Vector<gko::half>>(
+    int level, multigrid::cycle cycle,
+    const experimental::mpi::communicator& current_comm,
+    const experimental::mpi::communicator& next_comm, size_type current_nrows,
+    size_type next_nrows, size_type current_local_nrows,
+    size_type next_local_nrows)
+{
+    GKO_NOT_SUPPORTED(nullptr);
+}
+
+template <>
+void MultigridState::allocate_memory<
+    gko::experimental::distributed::Vector<std::complex<gko::half>>>(
+    int level, multigrid::cycle cycle,
+    const experimental::mpi::communicator& current_comm,
+    const experimental::mpi::communicator& next_comm, size_type current_nrows,
+    size_type next_nrows, size_type current_local_nrows,
+    size_type next_local_nrows)
+{
+    GKO_NOT_SUPPORTED(nullptr);
+}
+#endif
+
 
 #endif
 
@@ -594,6 +633,27 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level,
     }
 }
 
+template <>
+void MultigridState::run_cycle<
+    gko::experimental::distributed::Vector<gko::half>>(
+    multigrid::cycle cycle, size_type level,
+    const std::shared_ptr<const LinOp>& matrix, const LinOp* b, LinOp* x,
+    cycle_mode mode)
+{
+    GKO_NOT_SUPPORTED(nullptr);
+}
+
+template <>
+void MultigridState::run_cycle<
+    gko::experimental::distributed::Vector<std::complex<gko::half>>>(
+    multigrid::cycle cycle, size_type level,
+    const std::shared_ptr<const LinOp>& matrix, const LinOp* b, LinOp* x,
+    cycle_mode mode)
+{
+    GKO_NOT_SUPPORTED(nullptr);
+}
+
+
 }  // namespace detail
 }  // namespace multigrid
 
@@ -770,35 +830,41 @@ void Multigrid::generate()
                 if (gko::detail::is_distributed(matrix.get())) {
                     using absolute_value_type = remove_complex<value_type>;
                     using experimental::distributed::Matrix;
-                    return run<Matrix<value_type, int32, int32>,
-                               Matrix<value_type, int32, int64>,
-                               Matrix<value_type, int64,
-                                      int64>>(matrix, [exec](auto matrix) {
-                        using Mtx = typename decltype(matrix)::element_type;
-                        return solver::Gmres<value_type>::build()
-                            .with_criteria(
-                                stop::Iteration::build().with_max_iters(
-                                    matrix->get_size()[0]),
-                                stop::ResidualNorm<value_type>::build()
-                                    .with_reduction_factor(
-                                        std::numeric_limits<
-                                            absolute_value_type>::epsilon() *
-                                        absolute_value_type{10}))
-                            .with_krylov_dim(
-                                std::min(size_type(100), matrix->get_size()[0]))
-                            .with_preconditioner(
-                                experimental::distributed::preconditioner::
-                                    Schwarz<value_type,
-                                            typename Mtx::local_index_type,
-                                            typename Mtx::global_index_type>::
-                                        build()
+                    if constexpr (std::is_same_v<absolute_value_type,
+                                                 gko::half>) {
+                        GKO_NOT_SUPPORTED(matrix);
+                    } else {
+                        return run<Matrix<value_type, int32, int32>,
+                                   Matrix<value_type, int32, int64>,
+                                   Matrix<value_type, int64,
+                                          int64>>(matrix, [exec](auto matrix) {
+                            using Mtx = typename decltype(matrix)::element_type;
+                            return solver::Gmres<value_type>::build()
+                                .with_criteria(
+                                    stop::Iteration::build().with_max_iters(
+                                        matrix->get_size()[0]),
+                                    stop::ResidualNorm<value_type>::build()
+                                        .with_reduction_factor(
+                                            std::numeric_limits<
+                                                absolute_value_type>::
+                                                epsilon() *
+                                            absolute_value_type{10}))
+                                .with_krylov_dim(std::min(
+                                    size_type(100), matrix->get_size()[0]))
+                                .with_preconditioner(
+                                    experimental::distributed::preconditioner::
+                                        Schwarz<value_type,
+                                                typename Mtx::local_index_type,
+                                                typename Mtx::
+                                                    global_index_type>::build()
                                             .with_local_solver(
                                                 preconditioner::Jacobi<
                                                     value_type>::build()
                                                     .with_max_block_size(1u)))
-                            .on(exec)
-                            ->generate(matrix);
-                    });
+                                .on(exec)
+                                ->generate(matrix);
+                        });
+                    }
                 }
 #endif
                 if (dynamic_cast<const DpcppExecutor*>(exec.get())) {
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index ad31a6b19e8..4adc02763f0 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -382,7 +382,11 @@ make_temporary_conversion(const LinOp* matrix)
 template <typename ValueType, typename Function, typename... Args>
 void precision_dispatch(Function fn, Args*... linops)
 {
-    fn(distributed::make_temporary_conversion<ValueType>(linops).get()...);
+    if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(nullptr);
+    } else {
+        fn(distributed::make_temporary_conversion<ValueType>(linops).get()...);
+    }
 }
 
 
@@ -398,23 +402,29 @@ void precision_dispatch(Function fn, Args*... linops)
 template <typename ValueType, typename Function>
 void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out)
 {
-    auto complex_to_real = !(
-        is_complex<ValueType>() ||
-        dynamic_cast<const ConvertibleTo<experimental::distributed::Vector<>>*>(
-            in));
-    if (complex_to_real) {
-        auto dense_in =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(in);
-        auto dense_out =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(out);
-        using Vector = experimental::distributed::Vector<ValueType>;
-        // These dynamic_casts are only needed to make the code compile
-        // If ValueType is complex, this branch will never be taken
-        // If ValueType is real, the cast is a no-op
-        fn(dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
-           dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+    if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(nullptr);
     } else {
-        distributed::precision_dispatch<ValueType>(fn, in, out);
+        auto complex_to_real = !(
+            is_complex<ValueType>() ||
+            dynamic_cast<
+                const ConvertibleTo<experimental::distributed::Vector<>>*>(in));
+        if (complex_to_real) {
+            auto dense_in =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    in);
+            auto dense_out =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    out);
+            using Vector = experimental::distributed::Vector<ValueType>;
+            // These dynamic_casts are only needed to make the code compile
+            // If ValueType is complex, this branch will never be taken
+            // If ValueType is real, the cast is a no-op
+            fn(dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
+               dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+        } else {
+            distributed::precision_dispatch<ValueType>(fn, in, out);
+        }
     }
 }
 
@@ -426,27 +436,33 @@ template <typename ValueType, typename Function>
 void precision_dispatch_real_complex(Function fn, const LinOp* alpha,
                                      const LinOp* in, LinOp* out)
 {
-    auto complex_to_real = !(
-        is_complex<ValueType>() ||
-        dynamic_cast<const ConvertibleTo<experimental::distributed::Vector<>>*>(
-            in));
-    if (complex_to_real) {
-        auto dense_in =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(in);
-        auto dense_out =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(out);
-        auto dense_alpha = gko::make_temporary_conversion<ValueType>(alpha);
-        using Vector = experimental::distributed::Vector<ValueType>;
-        // These dynamic_casts are only needed to make the code compile
-        // If ValueType is complex, this branch will never be taken
-        // If ValueType is real, the cast is a no-op
-        fn(dense_alpha.get(),
-           dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
-           dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+    if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(nullptr);
     } else {
-        fn(gko::make_temporary_conversion<ValueType>(alpha).get(),
-           distributed::make_temporary_conversion<ValueType>(in).get(),
-           distributed::make_temporary_conversion<ValueType>(out).get());
+        auto complex_to_real = !(
+            is_complex<ValueType>() ||
+            dynamic_cast<
+                const ConvertibleTo<experimental::distributed::Vector<>>*>(in));
+        if (complex_to_real) {
+            auto dense_in =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    in);
+            auto dense_out =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    out);
+            auto dense_alpha = gko::make_temporary_conversion<ValueType>(alpha);
+            using Vector = experimental::distributed::Vector<ValueType>;
+            // These dynamic_casts are only needed to make the code compile
+            // If ValueType is complex, this branch will never be taken
+            // If ValueType is real, the cast is a no-op
+            fn(dense_alpha.get(),
+               dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
+               dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+        } else {
+            fn(gko::make_temporary_conversion<ValueType>(alpha).get(),
+               distributed::make_temporary_conversion<ValueType>(in).get(),
+               distributed::make_temporary_conversion<ValueType>(out).get());
+        }
     }
 }
 
@@ -459,30 +475,36 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha,
                                      const LinOp* in, const LinOp* beta,
                                      LinOp* out)
 {
-    auto complex_to_real = !(
-        is_complex<ValueType>() ||
-        dynamic_cast<const ConvertibleTo<experimental::distributed::Vector<>>*>(
-            in));
-    if (complex_to_real) {
-        auto dense_in =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(in);
-        auto dense_out =
-            distributed::make_temporary_conversion<to_complex<ValueType>>(out);
-        auto dense_alpha = gko::make_temporary_conversion<ValueType>(alpha);
-        auto dense_beta = gko::make_temporary_conversion<ValueType>(beta);
-        using Vector = experimental::distributed::Vector<ValueType>;
-        // These dynamic_casts are only needed to make the code compile
-        // If ValueType is complex, this branch will never be taken
-        // If ValueType is real, the cast is a no-op
-        fn(dense_alpha.get(),
-           dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
-           dense_beta.get(),
-           dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+    if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(nullptr);
     } else {
-        fn(gko::make_temporary_conversion<ValueType>(alpha).get(),
-           distributed::make_temporary_conversion<ValueType>(in).get(),
-           gko::make_temporary_conversion<ValueType>(beta).get(),
-           distributed::make_temporary_conversion<ValueType>(out).get());
+        auto complex_to_real = !(
+            is_complex<ValueType>() ||
+            dynamic_cast<
+                const ConvertibleTo<experimental::distributed::Vector<>>*>(in));
+        if (complex_to_real) {
+            auto dense_in =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    in);
+            auto dense_out =
+                distributed::make_temporary_conversion<to_complex<ValueType>>(
+                    out);
+            auto dense_alpha = gko::make_temporary_conversion<ValueType>(alpha);
+            auto dense_beta = gko::make_temporary_conversion<ValueType>(beta);
+            using Vector = experimental::distributed::Vector<ValueType>;
+            // These dynamic_casts are only needed to make the code compile
+            // If ValueType is complex, this branch will never be taken
+            // If ValueType is real, the cast is a no-op
+            fn(dense_alpha.get(),
+               dynamic_cast<const Vector*>(dense_in->create_real_view().get()),
+               dense_beta.get(),
+               dynamic_cast<Vector*>(dense_out->create_real_view().get()));
+        } else {
+            fn(gko::make_temporary_conversion<ValueType>(alpha).get(),
+               distributed::make_temporary_conversion<ValueType>(in).get(),
+               gko::make_temporary_conversion<ValueType>(beta).get(),
+               distributed::make_temporary_conversion<ValueType>(out).get());
+        }
     }
 }
 
@@ -547,6 +569,7 @@ void precision_dispatch_real_complex_distributed(Function fn,
     if (dynamic_cast<const experimental::distributed::DistributedBase*>(in)) {
         experimental::distributed::precision_dispatch_real_complex<ValueType>(
             fn, alpha, in, beta, out);
+
     } else {
         gko::precision_dispatch_real_complex<ValueType>(fn, alpha, in, beta,
                                                         out);

From 91837493ecc4a1035b65f13e3cf6120d3acc1b3d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Nov 2024 10:35:07 +0100
Subject: [PATCH 395/448] revert the distribution value_type in
 dpcpp/test/preconditioner/jacobi_kernels to make them work with single

---
 .../test/preconditioner/jacobi_kernels.dp.cpp | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
index 36179402262..cdf3a0d0298 100644
--- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
@@ -23,6 +23,8 @@
 namespace {
 
 
+// Some distributions are instantiated with value_type so that the tests
+// still work with GINKGO_DPCPP_SINGLE_MODE.
 class Jacobi : public ::testing::Test {
 protected:
     using index_type = int32_t;
@@ -62,7 +64,7 @@ class Jacobi : public ::testing::Test {
         if (condition_numbers.size() == 0) {
             mtx = gko::test::generate_random_matrix<Mtx>(
                 dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz),
-                std::normal_distribution<>(0.0, 1.0), engine, ref);
+                std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
         } else {
             std::vector<mtx_data> blocks;
             for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) {
@@ -70,7 +72,8 @@ class Jacobi : public ::testing::Test {
                     begin(block_pointers)[i + 1] - begin(block_pointers)[i];
                 const auto cond = begin(condition_numbers)[i];
                 blocks.push_back(mtx_data::cond(
-                    size, cond, std::normal_distribution<>(-1, 1), engine));
+                    size, cond, std::normal_distribution<value_type>(-1, 1),
+                    engine));
             }
             mtx = Mtx::create(ref);
             mtx->read(mtx_data::diag(begin(blocks), end(blocks)));
@@ -106,11 +109,11 @@ class Jacobi : public ::testing::Test {
         }
         b = gko::test::generate_random_matrix<Vec>(
             dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
-            std::normal_distribution<>(0.0, 1.0), engine, ref);
+            std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
         d_b = gko::clone(dpcpp, b);
         x = gko::test::generate_random_matrix<Vec>(
             dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs),
-            std::normal_distribution<>(0.0, 1.0), engine, ref);
+            std::normal_distribution<value_type>(0.0, 1.0), engine, ref);
         d_x = gko::clone(dpcpp, x);
     }
 
@@ -408,7 +411,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef)
     smtx->copy_from(dense_smtx);
     auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<>(0.0, 1.0), engine, ref));
+        std::normal_distribution<value_type>(0.0, 1.0), engine, ref));
     auto sx = Vec::create(ref, sb->get_size());
 
     auto d_smtx = gko::share(Mtx::create(dpcpp));
@@ -452,7 +455,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef)
     auto dense_data =
         gko::test::generate_random_matrix_data<value_type, index_type>(
             dim, dim, std::uniform_int_distribution<>(1, dim),
-            std::normal_distribution<>(1.0, 2.0), engine);
+            std::normal_distribution<value_type>(1.0, 2.0), engine);
     gko::utils::make_diag_dominant(dense_data);
     auto dense_smtx = gko::share(Vec::create(ref));
     dense_smtx->read(dense_data);
@@ -460,12 +463,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef)
     smtx->copy_from(dense_smtx);
     auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
-        4));
+        std::normal_distribution<value_type>(0.0, 1.0), engine, ref,
+        gko::dim<2>(dim, 3), 4));
     auto sx = gko::share(gko::test::generate_random_matrix<Vec>(
         dim, 3, std::uniform_int_distribution<>(1, 1),
-        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
-        4));
+        std::normal_distribution<value_type>(0.0, 1.0), engine, ref,
+        gko::dim<2>(dim, 3), 4));
 
     auto d_smtx = gko::share(gko::clone(dpcpp, smtx));
     auto d_sb = gko::share(gko::clone(dpcpp, sb));

From e066ac5bcff081a8bb1db112be32eb954a99a5bb Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sun, 24 Nov 2024 22:36:57 +0100
Subject: [PATCH 396/448] fix distributed mixed-precision pgm

---
 core/multigrid/pgm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index d4e4ffde4de..d11ebf32399 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -441,7 +441,7 @@ void Pgm<ValueType, IndexType>::generate()
                 setup_fine_op(obj);
             } else {
                 // handle other ValueTypes.
-                run<ConvertibleTo, fst_mtx_type, snd_mtx_type>(obj,
+                run<ConvertibleTo, fst_mtx_type, snd_mtx_type>(system_matrix_,
                                                                convert_fine_op);
             }
 

From 62a12d8fe6735ced1a3a5ffbeea6e95621e9282d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 28 Nov 2024 19:05:12 +0100
Subject: [PATCH 397/448] fix type_size_impl for thrust::complex

---
 common/cuda_hip/base/math.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 51a7fedf0c4..8c0da63c181 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -83,6 +83,12 @@ struct truncate_type_impl<thrust::complex<T>> {
 };
 
 
+template <typename T>
+struct type_size_impl<thrust::complex<T>> {
+    static constexpr auto value = sizeof(T) * byte_size;
+};
+
+
 template <typename T>
 struct is_complex_impl<thrust::complex<T>> : public std::true_type {};
 

From d529ab60ebf6e9985da6c927b1d1f2f7a041639b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 18:48:03 +0100
Subject: [PATCH 398/448] do not support half for nvhpc23.3 due to signal 11

---
 CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fea0c3efd40..56e8cbae610 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,8 +34,9 @@ option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP t
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
 # We do not support MSVC. SYCL will come later
-if(MSVC OR GINKGO_BUILD_SYCL)
-    message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
+# NVHPC 23.3 runs into "terminated by signal 11" in reference/test/isal_kernel and core/config/preconditioner, so unfortunately we do not support half with this version.
+if(MSVC OR GINKGO_BUILD_SYCL OR (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC" AND CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 23.3.0))
+    message(STATUS "HALF is not supported in MSVC, NVHPC 23.3, and later support in SYCL")
     set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
 endif()
 option(GINKGO_SKIP_DEPENDENCY_UPDATE

From ef8dea31a4c3861c3e502333647aa943534ba57f Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Nov 2024 16:55:52 +0100
Subject: [PATCH 399/448] split config to different file to overcome nvhpc
 limit

---
 core/CMakeLists.txt                        |   3 +
 core/config/preconditioner_config.cpp      | 280 ---------------------
 core/config/preconditioner_ic_config.cpp   | 111 ++++++++
 core/config/preconditioner_ilu_config.cpp  | 147 +++++++++++
 core/config/preconditioner_isai_config.cpp |  91 +++++++
 5 files changed, 352 insertions(+), 280 deletions(-)
 create mode 100644 core/config/preconditioner_ic_config.cpp
 create mode 100644 core/config/preconditioner_ilu_config.cpp
 create mode 100644 core/config/preconditioner_isai_config.cpp

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 598167c0d7c..7901edf5341 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -4,6 +4,9 @@ set(config_source
     config/factorization_config.cpp
     config/multigrid_config.cpp
     config/preconditioner_config.cpp
+    config/preconditioner_ic_config.cpp
+    config/preconditioner_ilu_config.cpp
+    config/preconditioner_isai_config.cpp
     config/registry.cpp
     config/solver_config.cpp
 )
diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index a5669902d00..840094b51c9 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -6,14 +6,8 @@
 #include <ginkgo/core/config/config.hpp>
 #include <ginkgo/core/config/registry.hpp>
 #include <ginkgo/core/preconditioner/gauss_seidel.hpp>
-#include <ginkgo/core/preconditioner/ic.hpp>
-#include <ginkgo/core/preconditioner/ilu.hpp>
-#include <ginkgo/core/preconditioner/isai.hpp>
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 #include <ginkgo/core/preconditioner/sor.hpp>
-#include <ginkgo/core/solver/gmres.hpp>
-#include <ginkgo/core/solver/ir.hpp>
-#include <ginkgo/core/solver/triangular.hpp>
 
 #include "core/config/config_helper.hpp"
 #include "core/config/dispatch.hpp"
@@ -25,280 +19,6 @@ namespace gko {
 namespace config {
 
 
-// For Ic and Ilu, we use additional ValueType to help Solver type decision
-template <typename Solver>
-class IcSolverHelper {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator {
-    public:
-        static
-            typename gko::preconditioner::Ic<Solver, IndexType>::parameters_type
-            parse(const pnode& config, const registry& context,
-                  const type_descriptor& td_for_child)
-        {
-            return gko::preconditioner::Ic<Solver, IndexType>::parse(
-                config, context, td_for_child);
-        }
-    };
-};
-
-
-template <typename LSolver, typename USolver, bool ReverseApply>
-class IluSolverHelper {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator {
-    public:
-        static typename preconditioner::Ilu<LSolver, USolver, ReverseApply,
-                                            IndexType>::parameters_type
-        parse(const pnode& config, const registry& context,
-              const type_descriptor& td_for_child)
-        {
-            return preconditioner::Ilu<LSolver, USolver, ReverseApply,
-                                       IndexType>::parse(config, context,
-                                                         td_for_child);
-        }
-    };
-};
-
-
-template <preconditioner::isai_type IsaiType>
-class IsaiHelper {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator {
-    public:
-        static typename preconditioner::Isai<IsaiType, ValueType,
-                                             IndexType>::parameters_type
-        parse(const pnode& config, const registry& context,
-              const type_descriptor& td_for_child)
-        {
-            return preconditioner::Isai<IsaiType, ValueType, IndexType>::parse(
-                config, context, td_for_child);
-        }
-    };
-};
-
-// Do not use the partial specialization for SolverBase<V> and SolverBase<V, I>
-// because the default template arguments are allowed for a template template
-// argument (detail: CWG 150 after c++17
-// https://en.cppreference.com/w/cpp/language/template_parameters#Template_template_arguments)
-template <template <typename V> class SolverBase>
-class IcHelper1 {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator
-        : public IcSolverHelper<SolverBase<ValueType>>::template Configurator<
-              ValueType, IndexType> {};
-};
-
-
-template <template <typename V, typename I> class SolverBase>
-class IcHelper2 {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator
-        : public IcSolverHelper<SolverBase<ValueType, IndexType>>::
-              template Configurator<ValueType, IndexType> {};
-};
-
-
-template <>
-deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ic>(
-    const pnode& config, const registry& context, const type_descriptor& td)
-{
-    auto updated = update_type(config, td);
-    std::string str("solver::LowerTrs");
-    if (auto& obj = config.get("l_solver_type")) {
-        str = obj.get_string();
-    }
-    if (str == "solver::LowerTrs") {
-        return dispatch<gko::LinOpFactory,
-                        IcHelper2<solver::LowerTrs>::Configurator>(
-            config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
-            make_type_selector(updated.get_index_typestr(), index_type_list()));
-    } else if (str == "solver::Ir") {
-        return dispatch<gko::LinOpFactory, IcHelper1<solver::Ir>::Configurator>(
-            config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
-            make_type_selector(updated.get_index_typestr(), index_type_list()));
-    } else if (str == "preconditioner::LowerIsai") {
-        return dispatch<gko::LinOpFactory,
-                        IcHelper2<preconditioner::LowerIsai>::Configurator>(
-            config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
-            make_type_selector(updated.get_index_typestr(), index_type_list()));
-    } else if (str == "solver::Gmres") {
-        return dispatch<gko::LinOpFactory,
-                        IcHelper1<solver::Gmres>::Configurator>(
-            config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
-            make_type_selector(updated.get_index_typestr(), index_type_list()));
-    } else {
-        GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
-    }
-}
-
-
-template <template <typename V> class LSolverBase,
-          template <typename V> class USolverBase, bool ReverseApply>
-class IluHelper1 {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator
-        : public IluSolverHelper<
-              LSolverBase<ValueType>, USolverBase<ValueType>,
-              ReverseApply>::template Configurator<ValueType, IndexType> {};
-};
-
-
-template <template <typename V, typename I> class LSolverBase,
-          template <typename V, typename I> class USolverBase,
-          bool ReverseApply>
-class IluHelper2 {
-public:
-    template <typename ValueType, typename IndexType>
-    class Configurator
-        : public IluSolverHelper<
-              LSolverBase<ValueType, IndexType>,
-              USolverBase<ValueType, IndexType>,
-              ReverseApply>::template Configurator<ValueType, IndexType> {};
-};
-
-
-template <>
-deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
-    const pnode& config, const registry& context, const type_descriptor& td)
-{
-    auto updated = update_type(config, td);
-    auto dispatch_solver = [&](auto reverse_apply)
-        -> deferred_factory_parameter<gko::LinOpFactory> {
-        using ReverseApply = decltype(reverse_apply);
-        // always use symmetric solver for USolverType
-        if (config.get("u_solver_type")) {
-            GKO_INVALID_STATE(
-                "preconditioner::Ilu only allows l_solver_type. The "
-                "u_solver_type automatically uses the transposed type of "
-                "l_solver_type.");
-        }
-        std::string str("solver::LowerTrs");
-        if (auto& obj = config.get("l_solver_type")) {
-            str = obj.get_string();
-        }
-        if (str == "solver::LowerTrs") {
-            return dispatch<
-                gko::LinOpFactory,
-                IluHelper2<solver::LowerTrs, solver::UpperTrs,
-                           ReverseApply::value>::template Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "solver::Ir") {
-            return dispatch<
-                gko::LinOpFactory,
-                IluHelper1<solver::Ir, solver::Ir,
-                           ReverseApply::value>::template Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "preconditioner::LowerIsai") {
-            return dispatch<
-                gko::LinOpFactory,
-                IluHelper2<preconditioner::LowerIsai, preconditioner::UpperIsai,
-                           ReverseApply::value>::template Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "solver::Gmres") {
-            return dispatch<
-                gko::LinOpFactory,
-                IluHelper1<solver::Gmres, solver::Gmres,
-                           ReverseApply::value>::template Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else {
-            GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
-        }
-    };
-    bool reverse_apply = false;
-    if (auto& obj = config.get("reverse_apply")) {
-        reverse_apply = obj.get_boolean();
-    }
-    if (reverse_apply) {
-        return dispatch_solver(std::true_type{});
-    } else {
-        return dispatch_solver(std::false_type{});
-    }
-}
-
-
-template <>
-deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
-    const pnode& config, const registry& context, const type_descriptor& td)
-{
-    auto updated = update_type(config, td);
-    if (auto& obj = config.get("isai_type")) {
-        auto str = obj.get_string();
-        if (str == "lower") {
-            return dispatch<
-                gko::LinOpFactory,
-                IsaiHelper<preconditioner::isai_type::lower>::Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "upper") {
-            return dispatch<
-                gko::LinOpFactory,
-                IsaiHelper<preconditioner::isai_type::upper>::Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "general") {
-            return dispatch<
-                gko::LinOpFactory,
-                IsaiHelper<preconditioner::isai_type::general>::Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else if (str == "spd") {
-            return dispatch<
-                gko::LinOpFactory,
-                IsaiHelper<preconditioner::isai_type::spd>::Configurator>(
-                config, context, updated,
-                make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
-                make_type_selector(updated.get_index_typestr(),
-                                   index_type_list()));
-        } else {
-            GKO_INVALID_CONFIG_VALUE("isai_type", str);
-        }
-    } else {
-        GKO_MISSING_CONFIG_ENTRY("isai_type");
-    }
-}
-
-
 GKO_PARSE_VALUE_AND_INDEX_TYPE(GaussSeidel, gko::preconditioner::GaussSeidel);
 GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Jacobi, gko::preconditioner::Jacobi);
 GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor);
diff --git a/core/config/preconditioner_ic_config.cpp b/core/config/preconditioner_ic_config.cpp
new file mode 100644
index 00000000000..e029b228479
--- /dev/null
+++ b/core/config/preconditioner_ic_config.cpp
@@ -0,0 +1,111 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/preconditioner/ic.hpp>
+#include <ginkgo/core/preconditioner/ilu.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
+#include <ginkgo/core/solver/ir.hpp>
+#include <ginkgo/core/solver/triangular.hpp>
+
+#include "core/config/config_helper.hpp"
+#include "core/config/dispatch.hpp"
+#include "core/config/parse_macro.hpp"
+#include "core/config/type_descriptor_helper.hpp"
+
+
+namespace gko {
+namespace config {
+
+
+// For Ic and Ilu, an additional ValueType helps decide the Solver type
+template <typename Solver>
+class IcSolverHelper {
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator {
+    public:
+        static
+            typename gko::preconditioner::Ic<Solver, IndexType>::parameters_type
+            parse(const pnode& config, const registry& context,
+                  const type_descriptor& td_for_child)
+        {
+            return gko::preconditioner::Ic<Solver, IndexType>::parse(
+                config, context, td_for_child);
+        }
+    };
+};
+
+
+// Do not use the partial specialization for SolverBase<V> and SolverBase<V, I>
+// because the default template arguments are allowed for a template template
+// argument (detail: CWG 150 after c++17
+// https://en.cppreference.com/w/cpp/language/template_parameters#Template_template_arguments)
+template <template <typename V> class SolverBase>
+class IcHelper1 {
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator
+        : public IcSolverHelper<SolverBase<ValueType>>::template Configurator<
+              ValueType, IndexType> {};
+};
+
+
+template <template <typename V, typename I> class SolverBase>
+class IcHelper2 {
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator
+        : public IcSolverHelper<SolverBase<ValueType, IndexType>>::
+              template Configurator<ValueType, IndexType> {};
+};
+
+
+template <>
+deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ic>(
+    const pnode& config, const registry& context, const type_descriptor& td)
+{
+    auto updated = update_type(config, td);
+    std::string str("solver::LowerTrs");
+    if (auto& obj = config.get("l_solver_type")) {
+        str = obj.get_string();
+    }
+    if (str == "solver::LowerTrs") {
+        return dispatch<gko::LinOpFactory,
+                        IcHelper2<solver::LowerTrs>::Configurator>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
+            make_type_selector(updated.get_index_typestr(), index_type_list()));
+    } else if (str == "solver::Ir") {
+        return dispatch<gko::LinOpFactory, IcHelper1<solver::Ir>::Configurator>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
+            make_type_selector(updated.get_index_typestr(), index_type_list()));
+    } else if (str == "preconditioner::LowerIsai") {
+        return dispatch<gko::LinOpFactory,
+                        IcHelper2<preconditioner::LowerIsai>::Configurator>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
+            make_type_selector(updated.get_index_typestr(), index_type_list()));
+    } else if (str == "solver::Gmres") {
+        return dispatch<gko::LinOpFactory,
+                        IcHelper1<solver::Gmres>::Configurator>(
+            config, context, updated,
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_with_half()),
+            make_type_selector(updated.get_index_typestr(), index_type_list()));
+    } else {
+        GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
+    }
+}
+
+
+}  // namespace config
+}  // namespace gko
diff --git a/core/config/preconditioner_ilu_config.cpp b/core/config/preconditioner_ilu_config.cpp
new file mode 100644
index 00000000000..9ed8494ab10
--- /dev/null
+++ b/core/config/preconditioner_ilu_config.cpp
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/preconditioner/ic.hpp>
+#include <ginkgo/core/preconditioner/ilu.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+#include <ginkgo/core/solver/gmres.hpp>
+#include <ginkgo/core/solver/ir.hpp>
+#include <ginkgo/core/solver/triangular.hpp>
+
+#include "core/config/config_helper.hpp"
+#include "core/config/dispatch.hpp"
+#include "core/config/parse_macro.hpp"
+#include "core/config/type_descriptor_helper.hpp"
+
+
+namespace gko {
+namespace config {
+
+
+// For Ic and Ilu, the extra ValueType only serves to select the solver type(s)
+template <typename LSolver, typename USolver, bool ReverseApply>
+class IluSolverHelper {
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator {
+    public:
+        static typename preconditioner::Ilu<LSolver, USolver, ReverseApply,
+                                            IndexType>::parameters_type
+        parse(const pnode& config, const registry& context,
+              const type_descriptor& td_for_child)
+        {
+            return preconditioner::Ilu<LSolver, USolver, ReverseApply,
+                                       IndexType>::parse(config, context,
+                                                         td_for_child);
+        }
+    };
+};
+
+
+template <template <typename V> class LSolverBase,
+          template <typename V> class USolverBase, bool ReverseApply>
+class IluHelper1 {  // for solver templates parameterized by ValueType only
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator
+        : public IluSolverHelper<
+              LSolverBase<ValueType>, USolverBase<ValueType>,
+              ReverseApply>::template Configurator<ValueType, IndexType> {};
+};
+
+
+template <template <typename V, typename I> class LSolverBase,
+          template <typename V, typename I> class USolverBase,
+          bool ReverseApply>
+class IluHelper2 {  // for solver templates taking ValueType and IndexType
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator
+        : public IluSolverHelper<
+              LSolverBase<ValueType, IndexType>,
+              USolverBase<ValueType, IndexType>,
+              ReverseApply>::template Configurator<ValueType, IndexType> {};
+};
+
+
+template <>
+deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
+    const pnode& config, const registry& context, const type_descriptor& td)
+{
+    auto updated = update_type(config, td);
+    auto dispatch_solver = [&](auto reverse_apply)
+        -> deferred_factory_parameter<gko::LinOpFactory> {
+        using ReverseApply = decltype(reverse_apply);
+        // USolverType always follows l_solver_type; u_solver_type is rejected
+        if (config.get("u_solver_type")) {
+            GKO_INVALID_STATE(
+                "preconditioner::Ilu only allows l_solver_type. The "
+                "u_solver_type automatically uses the transposed type of "
+                "l_solver_type.");
+        }
+        std::string str("solver::LowerTrs");  // default l_solver_type
+        if (auto& obj = config.get("l_solver_type")) {
+            str = obj.get_string();
+        }
+        if (str == "solver::LowerTrs") {
+            return dispatch<
+                gko::LinOpFactory,
+                IluHelper2<solver::LowerTrs, solver::UpperTrs,
+                           ReverseApply::value>::template Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "solver::Ir") {
+            return dispatch<
+                gko::LinOpFactory,
+                IluHelper1<solver::Ir, solver::Ir,
+                           ReverseApply::value>::template Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "preconditioner::LowerIsai") {
+            return dispatch<
+                gko::LinOpFactory,
+                IluHelper2<preconditioner::LowerIsai, preconditioner::UpperIsai,
+                           ReverseApply::value>::template Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "solver::Gmres") {
+            return dispatch<
+                gko::LinOpFactory,
+                IluHelper1<solver::Gmres, solver::Gmres,
+                           ReverseApply::value>::template Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else {
+            GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
+        }
+    };
+    bool reverse_apply = false;  // default when the key is absent
+    if (auto& obj = config.get("reverse_apply")) {
+        reverse_apply = obj.get_boolean();
+    }
+    if (reverse_apply) {  // lift the runtime flag to a compile-time type
+        return dispatch_solver(std::true_type{});
+    } else {
+        return dispatch_solver(std::false_type{});
+    }
+}
+
+
+}  // namespace config
+}  // namespace gko
diff --git a/core/config/preconditioner_isai_config.cpp b/core/config/preconditioner_isai_config.cpp
new file mode 100644
index 00000000000..828721ed74e
--- /dev/null
+++ b/core/config/preconditioner_isai_config.cpp
@@ -0,0 +1,91 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/config/config.hpp>
+#include <ginkgo/core/config/registry.hpp>
+#include <ginkgo/core/preconditioner/isai.hpp>
+
+#include "core/config/config_helper.hpp"
+#include "core/config/dispatch.hpp"
+#include "core/config/parse_macro.hpp"
+#include "core/config/type_descriptor_helper.hpp"
+
+
+namespace gko {
+namespace config {
+
+
+template <preconditioner::isai_type IsaiType>
+class IsaiHelper {  // fixes IsaiType so dispatch picks ValueType/IndexType
+public:
+    template <typename ValueType, typename IndexType>
+    class Configurator {
+    public:
+        static typename preconditioner::Isai<IsaiType, ValueType,
+                                             IndexType>::parameters_type
+        parse(const pnode& config, const registry& context,
+              const type_descriptor& td_for_child)
+        {
+            return preconditioner::Isai<IsaiType, ValueType, IndexType>::parse(
+                config, context, td_for_child);
+        }
+    };
+};
+
+
+template <>
+deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
+    const pnode& config, const registry& context, const type_descriptor& td)
+{
+    auto updated = update_type(config, td);
+    if (auto& obj = config.get("isai_type")) {
+        auto str = obj.get_string();
+        if (str == "lower") {
+            return dispatch<
+                gko::LinOpFactory,
+                IsaiHelper<preconditioner::isai_type::lower>::Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "upper") {
+            return dispatch<
+                gko::LinOpFactory,
+                IsaiHelper<preconditioner::isai_type::upper>::Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "general") {
+            return dispatch<
+                gko::LinOpFactory,
+                IsaiHelper<preconditioner::isai_type::general>::Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else if (str == "spd") {
+            return dispatch<
+                gko::LinOpFactory,
+                IsaiHelper<preconditioner::isai_type::spd>::Configurator>(
+                config, context, updated,
+                make_type_selector(updated.get_value_typestr(),
+                                   value_type_list_with_half()),
+                make_type_selector(updated.get_index_typestr(),
+                                   index_type_list()));
+        } else {
+            GKO_INVALID_CONFIG_VALUE("isai_type", str);
+        }
+    } else {
+        GKO_MISSING_CONFIG_ENTRY("isai_type");  // no default: key is mandatory
+    }
+}
+
+
+}  // namespace config
+}  // namespace gko

From 4a80809f0ad64ea710e1e6721afc3faa472d027f Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Nov 2024 18:52:50 +0100
Subject: [PATCH 400/448] make test target_name aligned with binary name, split
 isai due to nvhpc limitation

---
 CMakeLists.txt                                |  5 ++--
 cmake/create_test.cmake                       | 23 +++++++++++--------
 reference/test/preconditioner/CMakeLists.txt  |  6 +++++
 .../test/preconditioner/isai_kernels.cpp      | 20 ++++++++++++++++
 4 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56e8cbae610..fea0c3efd40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,9 +34,8 @@ option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP t
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
 # We do not support MSVC. SYCL will come later
-# NVHPC 23.3 faces "termminated by signal 11" in reference/test/isal_kernel and core/config/preconditioner, so we don't support this version for half unfortunately.
-if(MSVC OR GINKGO_BUILD_SYCL OR (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC" AND CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 23.3.0))
-    message(STATUS "HALF is not supported in MSVC, NVHPC 23.3, and later support in SYCL")
+if(MSVC OR GINKGO_BUILD_SYCL)
+    message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
     set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
 endif()
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index c540d6e2cf7..239613ef804 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -5,9 +5,14 @@ set(gko_test_option_args "NO_RESOURCES;NO_GTEST_MAIN")
 
 ## Replaces / by _ to create valid target names from relative paths
 function(ginkgo_build_test_name test_name target_name)
+    cmake_parse_arguments(PARSE_ARGV 2 build_test_name "" "${gko_test_single_args}" "")
     file(RELATIVE_PATH REL_BINARY_DIR
          ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
+    set(test_binary_name ${test_name})
+    if (build_test_name_EXECUTABLE_NAME)
+        set(test_binary_name ${build_test_name_EXECUTABLE_NAME})
+    endif()
+    string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_binary_name}")
     set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE)
 endfunction()
 
@@ -127,7 +132,7 @@ endfunction()
 
 ## Normal test
 function(ginkgo_create_test test_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     add_executable(${test_target_name} ${test_name}.cpp)
     target_link_libraries(${test_target_name})
     ginkgo_set_test_target_properties(${test_target_name} "_cpu" ${ARGN})
@@ -136,7 +141,7 @@ endfunction(ginkgo_create_test)
 
 ## Test compiled with dpcpp
 function(ginkgo_create_dpcpp_test test_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     add_executable(${test_target_name} ${test_name}.dp.cpp)
     target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS})
     gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp)
@@ -151,7 +156,7 @@ endfunction(ginkgo_create_dpcpp_test)
 
 ## Test compiled with CUDA
 function(ginkgo_create_cuda_test test_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     ginkgo_create_cuda_test_internal(${test_name} ${test_name}.cu ${test_target_name} ${ARGN})
 endfunction(ginkgo_create_cuda_test)
 
@@ -177,7 +182,7 @@ endfunction(ginkgo_create_cuda_test_internal)
 
 ## Test compiled with HIP
 function(ginkgo_create_hip_test test_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     ginkgo_create_hip_test_internal(${test_name} ${test_name}.hip.cpp ${test_target_name} ${ARGN})
 endfunction(ginkgo_create_hip_test)
 
@@ -196,12 +201,12 @@ endfunction(ginkgo_create_hip_test_internal)
 
 ## Test compiled with OpenMP
 function(ginkgo_create_omp_test test_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     ginkgo_create_omp_test_internal(${test_name} ${test_name}.cpp ${test_target_name} "" ${ARGN})
 endfunction()
 
 function(ginkgo_create_omp_test_internal test_name filename test_target_name)
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     add_executable(${test_target_name} ${test_name}.cpp)
     target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp)
     target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
@@ -241,7 +246,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec)
     else ()
         set(test_resource_type sycl)
     endif ()
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     string(TOUPPER ${exec} exec_upper)
 
     # set up actual test
@@ -267,7 +272,7 @@ endfunction(ginkgo_create_common_test_internal)
 ## Common test compiled with the device compiler, one target for each enabled backend
 function(ginkgo_create_common_device_test test_name)
     cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}")
-    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name ${ARGN})
     if(GINKGO_BUILD_SYCL)
         ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
         target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS})
diff --git a/reference/test/preconditioner/CMakeLists.txt b/reference/test/preconditioner/CMakeLists.txt
index f558aa87495..603b5033cf9 100644
--- a/reference/test/preconditioner/CMakeLists.txt
+++ b/reference/test/preconditioner/CMakeLists.txt
@@ -3,6 +3,12 @@ ginkgo_create_test(gauss_seidel)
 ginkgo_create_test(ilu)
 ginkgo_create_test(ic)
 ginkgo_create_test(isai_kernels)
+# NVHPC compilation limitation: build the half isai_kernels tests as a separate binary
+if(CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC" AND GINKGO_ENABLE_HALF)
+    ginkgo_create_test(isai_kernels EXECUTABLE_NAME isai_kernels_half)
+    ginkgo_build_test_name(isai_kernels isai_half_target EXECUTABLE_NAME isai_kernels_half)
+    target_compile_definitions(${isai_half_target} PRIVATE "NVHPC_HALF")
+endif()
 ginkgo_create_test(jacobi)
 ginkgo_create_test(jacobi_kernels)
 ginkgo_create_test(sor_kernels)
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
index f55d7e12b87..0164f5d8e58 100644
--- a/reference/test/preconditioner/isai_kernels.cpp
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -322,10 +322,30 @@ class Isai : public ::testing::Test {
     std::shared_ptr<Csr> spd_sparse_inv;
 };
 
+#ifdef __NVCOMPILER
+
+
+// NVHPC compilation limitation: this file is built into two test binaries.
+#ifdef NVHPC_HALF
+using HalfIndexTypes = gko::test::cartesian_type_product_t<
+    ::testing::Types<gko::half, std::complex<gko::half>>,
+    gko::test::IndexTypes>;
+TYPED_TEST_SUITE(Isai, HalfIndexTypes, PairTypenameNameGenerator);
+#else
+TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+#endif
+
+
+#else
+
+
 TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypesWithHalf,
                  PairTypenameNameGenerator);
 
 
+#endif
+
+
 TYPED_TEST(Isai, KernelGenerateA)
 {
     using Csr = typename TestFixture::Csr;

From 7d65761bc5d8d362eec95c8608e88d94d7c11a1d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 01:04:27 +0100
Subject: [PATCH 401/448] batch with half

---
 .../base/batch_multi_vector_kernels.cpp       | 13 ++---
 common/cuda_hip/matrix/batch_csr_kernels.cpp  |  8 +--
 .../cuda_hip/matrix/batch_dense_kernels.cpp   | 12 +++--
 common/cuda_hip/matrix/batch_ell_kernels.cpp  |  8 +--
 core/base/batch_instantiation.hpp             |  2 +-
 core/base/batch_multi_vector.cpp              | 27 ++++++++--
 core/device_hooks/common_kernels.inc.cpp      | 53 +++++++++++--------
 core/log/batch_logger.cpp                     |  4 +-
 core/matrix/batch_csr.cpp                     | 29 ++++++++--
 core/matrix/batch_dense.cpp                   | 28 ++++++++--
 core/matrix/batch_ell.cpp                     | 29 ++++++++--
 core/matrix/batch_identity.cpp                |  3 +-
 core/preconditioner/batch_jacobi.cpp          |  2 +-
 core/solver/batch_bicgstab.cpp                |  2 +-
 core/solver/batch_cg.cpp                      |  2 +-
 core/solver/batch_dispatch.hpp                | 42 +++++++++++++--
 cuda/preconditioner/batch_jacobi_kernels.cu   |  4 +-
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  | 13 ++---
 dpcpp/base/batch_multi_vector_kernels.hpp     | 41 --------------
 dpcpp/matrix/batch_csr_kernels.dp.cpp         |  8 +--
 dpcpp/matrix/batch_dense_kernels.dp.cpp       | 12 +++--
 dpcpp/matrix/batch_ell_kernels.dp.cpp         |  8 +--
 dpcpp/preconditioner/batch_block_jacobi.hpp   |  7 ++-
 .../batch_jacobi_kernels.dp.cpp               |  4 +-
 .../batch_jacobi_kernels.hip.cpp              |  4 +-
 .../ginkgo/core/base/batch_multi_vector.hpp   | 36 ++++++++++---
 include/ginkgo/core/base/types.hpp            | 11 ++++
 include/ginkgo/core/log/logger.hpp            | 12 +++++
 include/ginkgo/core/matrix/batch_csr.hpp      | 35 ++++++++++--
 include/ginkgo/core/matrix/batch_dense.hpp    | 33 ++++++++++--
 include/ginkgo/core/matrix/batch_ell.hpp      | 35 ++++++++++--
 omp/base/batch_multi_vector_kernels.cpp       | 13 ++---
 omp/matrix/batch_csr_kernels.cpp              |  8 +--
 omp/matrix/batch_dense_kernels.cpp            | 12 +++--
 omp/matrix/batch_ell_kernels.cpp              |  8 +--
 omp/preconditioner/batch_jacobi_kernels.cpp   |  4 +-
 reference/base/batch_multi_vector_kernels.cpp | 13 ++---
 reference/matrix/batch_csr_kernels.cpp        |  8 +--
 reference/matrix/batch_dense_kernels.cpp      | 12 +++--
 reference/matrix/batch_ell_kernels.cpp        |  8 +--
 .../preconditioner/batch_jacobi_kernels.cpp   |  4 +-
 41 files changed, 426 insertions(+), 191 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 8154dc440df..8ff88ddc73b 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -55,7 +55,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -81,7 +81,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -101,7 +101,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -121,7 +121,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -139,7 +139,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, res_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -156,7 +156,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
             x_ub, result_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.cpp
index d48cdbaf32a..0db100363b8 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -46,7 +46,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -72,7 +72,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -91,7 +91,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.cpp
index ee4d87abaa3..e0f1fc5e8dc 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -45,7 +45,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         mat_ub, b_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -90,7 +90,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -108,7 +109,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, in_out_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -126,7 +128,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.cpp
index 38d34707d45..dddb53e34ff 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -46,7 +46,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -72,7 +72,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -91,7 +91,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
index dbcccefb469..652d4cd7ff7 100644
--- a/core/base/batch_instantiation.hpp
+++ b/core/base/batch_instantiation.hpp
@@ -45,7 +45,7 @@ namespace batch {
 #define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(...) \
     GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX,                         \
              GKO_BATCH_INSTANTIATE_PRECONDITIONER,                 \
-             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF, __VA_ARGS__)
 
 
 }  // namespace batch
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index f4485377f25..1eb3cd8f60d 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -281,7 +281,7 @@ void MultiVector<ValueType>::compute_norm2(
 
 template <typename ValueType>
 void MultiVector<ValueType>::convert_to(
-    MultiVector<next_precision<ValueType>>* result) const
+    MultiVector<next_precision_with_half<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -290,14 +290,35 @@ void MultiVector<ValueType>::convert_to(
 
 template <typename ValueType>
 void MultiVector<ValueType>::move_to(
-    MultiVector<next_precision<ValueType>>* result)
+    MultiVector<next_precision_with_half<ValueType>>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType>
+void MultiVector<ValueType>::convert_to(
+    MultiVector<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result) const
+{
+    result->values_ = this->values_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::move_to(
+    MultiVector<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR);
 
 
 }  // namespace batch
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 7215a17aec5..3f6cc9ab1bc 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -362,12 +362,15 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 namespace batch_multi_vector {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
@@ -376,10 +379,13 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 namespace batch_csr {
 
 
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_csr
@@ -388,11 +394,12 @@ GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 namespace batch_dense {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_STUB_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_dense
@@ -401,10 +408,13 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 namespace batch_ell {
 
 
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_ell
@@ -941,9 +951,10 @@ namespace batch_jacobi {
 GKO_STUB_INDEX_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_CUMULATIVE_BLOCK_STORAGE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_FIND_ROW_BLOCK_MAP);
-GKO_STUB_VALUE_AND_INT32_TYPE(
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
 }  // namespace batch_jacobi
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index f274019016f..86c6ea647f2 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -65,7 +65,7 @@ log_data<ValueType>::log_data(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_LOG_DATA(_type) struct log_data<_type>
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_LOG_DATA);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_LOG_DATA);
 
 #undef GKO_DECLARE_LOG_DATA
 
@@ -92,7 +92,7 @@ void BatchConvergence<ValueType>::on_batch_solver_completed(
 
 
 #define GKO_DECLARE_BATCH_CONVERGENCE(_type) class BatchConvergence<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CONVERGENCE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CONVERGENCE);
 
 
 }  // namespace log
diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp
index 1b1dc22a6c4..141c5b86d02 100644
--- a/core/matrix/batch_csr.cpp
+++ b/core/matrix/batch_csr.cpp
@@ -246,7 +246,7 @@ void Csr<ValueType, IndexType>::add_scaled_identity(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision<ValueType>, IndexType>* result) const
+    Csr<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -257,14 +257,37 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision<ValueType>, IndexType>* result)
+    Csr<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::convert_to(
+    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->row_ptrs_ = this->row_ptrs_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Csr<ValueType, IndexType>::move_to(
+    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 #define GKO_DECLARE_BATCH_CSR_MATRIX(ValueType) class Csr<ValueType, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index 6390a4c7ad0..0c1838abb56 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -245,7 +245,7 @@ void Dense<ValueType>::add_scaled_identity(
 
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision<ValueType>>* result) const
+    Dense<next_precision_with_half<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -253,14 +253,36 @@ void Dense<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Dense<ValueType>::move_to(Dense<next_precision<ValueType>>* result)
+void Dense<ValueType>::move_to(
+    Dense<next_precision_with_half<ValueType>>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType>
+void Dense<ValueType>::convert_to(
+    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result) const
+{
+    result->values_ = this->values_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::move_to(
+    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+        result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 #define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class Dense<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 3722c41de60..3b829d3ba4c 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -266,7 +266,7 @@ void Ell<ValueType, IndexType>::add_scaled_identity(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision<ValueType>, IndexType>* result) const
+    Ell<next_precision_with_half<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -277,14 +277,37 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision<ValueType>, IndexType>* result)
+    Ell<next_precision_with_half<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
+#if GINKGO_ENABLE_HALF
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::convert_to(
+    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->num_elems_per_row_ = this->num_elems_per_row_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::move_to(
+    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+        IndexType>* result)
+{
+    this->convert_to(result);
+}
+#endif
+
+
 #define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell<ValueType, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_ELL_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp
index 2220120d00b..6ee2d55f6fe 100644
--- a/core/matrix/batch_identity.cpp
+++ b/core/matrix/batch_identity.cpp
@@ -113,7 +113,8 @@ void Identity<ValueType>::apply_impl(const MultiVector<ValueType>* alpha,
 
 
 #define GKO_DECLARE_BATCH_IDENTITY_MATRIX(ValueType) class Identity<ValueType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_IDENTITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_IDENTITY_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp
index e4382de38ec..53809a82a5a 100644
--- a/core/preconditioner/batch_jacobi.cpp
+++ b/core/preconditioner/batch_jacobi.cpp
@@ -175,7 +175,7 @@ void Jacobi<ValueType, IndexType>::generate_precond(
 
 
 #define GKO_DECLARE_BATCH_JACOBI(_type) class Jacobi<_type, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_JACOBI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_JACOBI);
 
 
 }  // namespace preconditioner
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index 73fc0a2c852..fa467c98976 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -68,7 +68,7 @@ void Bicgstab<ValueType>::solver_apply(
 
 
 #define GKO_DECLARE_BATCH_BICGSTAB(_type) class Bicgstab<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_BICGSTAB);
 
 
 }  // namespace solver
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index 13a5afffcaa..c7c4da5085a 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -69,7 +69,7 @@ void Cg<ValueType>::solver_apply(
 
 
 #define GKO_DECLARE_BATCH_CG(_type) class Cg<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CG);
 
 
 }  // namespace solver
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index d76bc72d489..570b717d7d6 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -86,6 +86,23 @@ using DeviceValueType = gko::kernels::hip::hip_type<ValueType>;
 #include "dpcpp/stop/batch_criteria.hpp"
 
 
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+template <typename T>
+inline std::decay_t<T> as_device_type(T val)
+{
+    return val;
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
 namespace gko {
 namespace batch {
 namespace solver {
@@ -115,6 +132,23 @@ using DeviceValueType = ValueType;
 #include "reference/stop/batch_criteria.hpp"
 
 
+namespace gko {
+namespace kernels {
+namespace host {
+
+
+template <typename T>
+inline std::decay_t<T> as_device_type(T val)
+{
+    return val;
+}
+
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
 namespace gko {
 namespace batch {
 namespace solver {
@@ -205,7 +239,7 @@ enum class log_type { simple_convergence_completion };
     GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX_BATCH, GKO_BATCH_INSTANTIATE_LOGGER, \
              GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER,                      \
              GKO_BATCH_INSTANTIATE_STOP,                                       \
-             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF, __VA_ARGS__)
 
 
 /**
@@ -226,6 +260,7 @@ class batch_solver_dispatch {
     using value_type = ValueType;
     using device_value_type = DeviceValueType<ValueType>;
     using real_type = remove_complex<value_type>;
+    using device_real_type = DeviceValueType<real_type>;
 
     batch_solver_dispatch(
         const KernelCaller& kernel_caller, const SettingsType& settings,
@@ -316,8 +351,9 @@ class batch_solver_dispatch {
     {
         if (logger_type_ ==
             log::detail::log_type::simple_convergence_completion) {
-            device::batch_log::SimpleFinalLogger<real_type> logger(
-                log_data.res_norms.get_data(), log_data.iter_counts.get_data());
+            device::batch_log::SimpleFinalLogger<device_real_type> logger(
+                device::as_device_type(log_data.res_norms.get_data()),
+                log_data.iter_counts.get_data());
             dispatch_on_preconditioner(logger, amat, b_item, x_item);
         } else {
             GKO_NOT_IMPLEMENTED;
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 2ac5717308a..30bbc8fd2e7 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -99,7 +99,7 @@ void extract_common_blocks_pattern(
         blocks_pattern);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -156,7 +156,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 1d38a165956..6f1f3467e4a 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -102,7 +102,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -161,7 +161,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -230,7 +230,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -275,7 +275,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -334,7 +334,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -372,7 +372,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index 74abaeda86f..96ada23f42c 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -65,25 +65,6 @@ __dpct_inline__ void add_scaled_kernel(
 }
 
 
-template <typename ValueType>
-__dpct_inline__ void single_rhs_compute_conj_dot(
-    const int num_rows, const ValueType* const __restrict__ x,
-    const ValueType* const __restrict__ y, ValueType& result,
-    sycl::nd_item<3> item_ct1)
-{
-    const auto group = item_ct1.get_group();
-    const auto group_size = item_ct1.get_local_range().size();
-    const auto tid = item_ct1.get_local_linear_id();
-
-    ValueType val = zero<ValueType>();
-
-    for (int r = tid; r < num_rows; r += group_size) {
-        val += conj(x[r]) * y[r];
-    }
-    result = sycl::reduce_over_group(group, val, sycl::plus<>());
-}
-
-
 template <int tile_size = config::warp_size, typename ValueType>
 __dpct_inline__ void single_rhs_compute_conj_dot_sg(
     const int num_rows, const ValueType* const __restrict__ x,
@@ -174,28 +155,6 @@ __dpct_inline__ void single_rhs_compute_norm2_sg(
 }
 
 
-template <typename ValueType>
-__dpct_inline__ void single_rhs_compute_norm2(
-    const int num_rows, const ValueType* const __restrict__ x,
-    gko::remove_complex<ValueType>& result, sycl::nd_item<3> item_ct1)
-{
-    const auto group = item_ct1.get_group();
-    const auto group_size = item_ct1.get_local_range().size();
-    const auto tid = item_ct1.get_local_linear_id();
-
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = tid; r < num_rows; r += group_size) {
-        val += squared_norm(x[r]);
-    }
-
-    val = sycl::reduce_over_group(group, val, sycl::plus<>());
-
-    result = sqrt(val);
-}
-
-
 template <typename ValueType>
 __dpct_inline__ void compute_norm2_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& x,
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 1759a959299..ae5122ec7f9 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -73,7 +73,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -127,7 +127,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -173,7 +173,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -215,7 +215,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index 43974589abb..6c0e4b4eb44 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -76,7 +76,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -129,7 +129,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -173,7 +173,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -215,7 +216,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -256,7 +258,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index d9b819b101e..b4e2627a494 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -73,7 +73,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -127,7 +127,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -170,7 +170,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -212,7 +212,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index a7431f919a5..04c21f97991 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -129,8 +129,11 @@ class BlockJacobi final {
                 sum += block_val * r[dense_block_col + idx_start];
             }
 
-            // reduction
-            sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());
+            // reduction (it does not support half)
+            // sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());
+            for (int i = sg_size / 2; i > 0; i /= 2) {
+                sum += sg.shuffle_down(sum, i);
+            }
 
             if (sg_tid == 0) {
                 z[row_idx] = sum;
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
index 7721359716c..3a63466ef5d 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
@@ -104,7 +104,7 @@ void extract_common_blocks_pattern(
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -173,7 +173,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks, exec);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index fdd57a95127..2424a035cf4 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -101,7 +101,7 @@ void extract_common_blocks_pattern(
         blocks_pattern);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -159,7 +159,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index d04e9562fce..bd641f057a1 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -52,16 +52,22 @@ template <typename ValueType = default_precision>
 class MultiVector
     : public EnablePolymorphicObject<MultiVector<ValueType>>,
       public EnablePolymorphicAssignment<MultiVector<ValueType>>,
-      public ConvertibleTo<MultiVector<next_precision<ValueType>>> {
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<MultiVector<
+          next_precision_with_half<next_precision_with_half<ValueType>>>>,
+#endif
+      public ConvertibleTo<MultiVector<next_precision_with_half<ValueType>>> {
     friend class EnablePolymorphicObject<MultiVector>;
     friend class MultiVector<to_complex<ValueType>>;
-    friend class MultiVector<next_precision<ValueType>>;
+    friend class MultiVector<previous_precision_with_half<ValueType>>;
 
 public:
     using EnablePolymorphicAssignment<MultiVector>::convert_to;
     using EnablePolymorphicAssignment<MultiVector>::move_to;
-    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::convert_to;
-    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::move_to;
+    using ConvertibleTo<
+        MultiVector<next_precision_with_half<ValueType>>>::convert_to;
+    using ConvertibleTo<
+        MultiVector<next_precision_with_half<ValueType>>>::move_to;
 
     using value_type = ValueType;
     using index_type = int32;
@@ -78,10 +84,28 @@ class MultiVector
     static std::unique_ptr<MultiVector> create_with_config_of(
         ptr_param<const MultiVector> other);
 
+    void convert_to(MultiVector<next_precision_with_half<ValueType>>* result)
+        const override;
+
+    void move_to(
+        MultiVector<next_precision_with_half<ValueType>>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class MultiVector<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
+    using ConvertibleTo<MultiVector<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::convert_to;
+    using ConvertibleTo<MultiVector<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::move_to;
+
     void convert_to(
-        MultiVector<next_precision<ValueType>>* result) const override;
+        MultiVector<
+            next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) const override;
 
-    void move_to(MultiVector<next_precision<ValueType>>* result) override;
+    void move_to(MultiVector<next_precision_with_half<
+                     next_precision_with_half<ValueType>>>* result) override;
+#endif
 
     /**
      * Creates a mutable view (of matrix::Dense type) of one item of the Batch
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 5e1fb2a14e3..4f1166de223 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -490,6 +490,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template GKO_INDIRECT(_macro(double, __VA_ARGS__))
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_WITH_HALF( \
+    _macro, ...)                                                         \
+    GKO_INDIRECT(GKO_ADAPT_HF(template _macro(half, __VA_ARGS__)));      \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, __VA_ARGS__)
+
 
 /**
  * Instantiates a template for each non-complex value type compiled by Ginkgo.
@@ -517,6 +522,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))
 #endif
 
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF(_macro, ...) \
+    GKO_INDIRECT(GKO_ADAPT_HF(template _macro(half, __VA_ARGS__)));      \
+    GKO_INDIRECT(                                                        \
+        GKO_ADAPT_HF(template _macro(std::complex<half>, __VA_ARGS__))); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, __VA_ARGS__)
+
 
 /**
  * Instantiates a template for each value and scalar type compiled by Ginkgo.
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index dd9d30249e9..b05b15fcc0c 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -18,6 +18,7 @@
 
 namespace gko {
 
+class half;
 
 /* Eliminate circular dependencies the hard way */
 template <typename ValueType>
@@ -579,6 +580,17 @@ public:                                                              \
         const array<int>& iters, const array<float>& residual_norms) const
     {}
 
+    /**
+     * Batch solver's event that records the iteration count and the residual
+     * norm.
+     *
+     * @param iters  the array of iteration counts.
+     * @param residual_norms  the array storing the residual norms.
+     */
+    virtual void on_batch_solver_completed(
+        const array<int>& iters, const array<gko::half>& residual_norms) const
+    {}
+
 public:
 #undef GKO_LOGGER_REGISTER_EVENT
 
diff --git a/include/ginkgo/core/matrix/batch_csr.hpp b/include/ginkgo/core/matrix/batch_csr.hpp
index e431454063d..49eb5e4d7cd 100644
--- a/include/ginkgo/core/matrix/batch_csr.hpp
+++ b/include/ginkgo/core/matrix/batch_csr.hpp
@@ -46,10 +46,16 @@ namespace matrix {
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Csr final
     : public EnableBatchLinOp<Csr<ValueType, IndexType>>,
-      public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>> {
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>,
+#endif
+      public ConvertibleTo<
+          Csr<next_precision_with_half<ValueType>, IndexType>> {
     friend class EnablePolymorphicObject<Csr, BatchLinOp>;
     friend class Csr<to_complex<ValueType>, IndexType>;
-    friend class Csr<next_precision<ValueType>, IndexType>;
+    friend class Csr<previous_precision_with_half<ValueType>, IndexType>;
     static_assert(std::is_same<IndexType, int32>::value,
                   "IndexType must be a 32 bit integer");
 
@@ -63,10 +69,31 @@ class Csr final
     using absolute_type = remove_complex<Csr>;
     using complex_type = to_complex<Csr>;
 
+    void convert_to(Csr<next_precision_with_half<ValueType>, IndexType>* result)
+        const override;
+
+    void move_to(
+        Csr<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Csr<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::convert_to;
+    using ConvertibleTo<
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::move_to;
+
     void convert_to(
-        Csr<next_precision<ValueType>, IndexType>* result) const override;
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) const override;
 
-    void move_to(Csr<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) override;
+#endif
 
     /**
      * Creates a mutable view (of matrix::Csr type) of one item of the
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index 5ea7c3ee128..c1340e482f4 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -45,11 +45,16 @@ namespace matrix {
  * @ingroup BatchLinOp
  */
 template <typename ValueType = default_precision>
-class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
-                    public ConvertibleTo<Dense<next_precision<ValueType>>> {
+class Dense final
+    : public EnableBatchLinOp<Dense<ValueType>>,
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Dense<next_precision_with_half<next_precision_with_half<ValueType>>>>,
+#endif
+      public ConvertibleTo<Dense<next_precision_with_half<ValueType>>> {
     friend class EnablePolymorphicObject<Dense, BatchLinOp>;
     friend class Dense<to_complex<ValueType>>;
-    friend class Dense<next_precision<ValueType>>;
+    friend class Dense<previous_precision_with_half<ValueType>>;
 
 public:
     using EnableBatchLinOp<Dense>::convert_to;
@@ -62,9 +67,27 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
     using absolute_type = remove_complex<Dense>;
     using complex_type = to_complex<Dense>;
 
-    void convert_to(Dense<next_precision<ValueType>>* result) const override;
+    void convert_to(
+        Dense<next_precision_with_half<ValueType>>* result) const override;
 
-    void move_to(Dense<next_precision<ValueType>>* result) override;
+    void move_to(Dense<next_precision_with_half<ValueType>>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Dense<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
+    using ConvertibleTo<Dense<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::convert_to;
+    using ConvertibleTo<Dense<next_precision_with_half<
+        next_precision_with_half<ValueType>>>>::move_to;
+
+    void convert_to(
+        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) const override;
+
+    void move_to(
+        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
+            result) override;
+#endif
 
     /**
      * Creates a mutable view (of gko::matrix::Dense type) of one item of the
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index b760cee795a..872b8ce2db9 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -51,10 +51,16 @@ namespace matrix {
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Ell final
     : public EnableBatchLinOp<Ell<ValueType, IndexType>>,
-      public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>> {
+#if GINKGO_ENABLE_HALF
+      public ConvertibleTo<
+          Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+              IndexType>>,
+#endif
+      public ConvertibleTo<
+          Ell<next_precision_with_half<ValueType>, IndexType>> {
     friend class EnablePolymorphicObject<Ell, BatchLinOp>;
     friend class Ell<to_complex<ValueType>, IndexType>;
-    friend class Ell<next_precision<ValueType>, IndexType>;
+    friend class Ell<previous_precision_with_half<ValueType>, IndexType>;
     static_assert(std::is_same<IndexType, int32>::value,
                   "IndexType must be a 32 bit integer");
 
@@ -68,10 +74,31 @@ class Ell final
     using absolute_type = remove_complex<Ell>;
     using complex_type = to_complex<Ell>;
 
+    void convert_to(Ell<next_precision_with_half<ValueType>, IndexType>* result)
+        const override;
+
+    void move_to(
+        Ell<next_precision_with_half<ValueType>, IndexType>* result) override;
+
+#if GINKGO_ENABLE_HALF
+    friend class Ell<
+        previous_precision_with_half<previous_precision_with_half<ValueType>>,
+        IndexType>;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::convert_to;
+    using ConvertibleTo<
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>>::move_to;
+
     void convert_to(
-        Ell<next_precision<ValueType>, IndexType>* result) const override;
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) const override;
 
-    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
+    void move_to(
+        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
+            IndexType>* result) override;
+#endif
 
     /**
      * Creates a mutable view (of matrix::Ell type) of one item of the
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 5b57921ab8f..bbae1b0b85d 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -37,7 +37,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -81,7 +81,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -103,7 +103,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -122,7 +122,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -141,7 +141,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp
index d4ea6cbd642..b55253e9d4e 100644
--- a/omp/matrix/batch_csr_kernels.cpp
+++ b/omp/matrix/batch_csr_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,7 +98,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -122,7 +122,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index cd4a7f05b4a..ea7da295bb4 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,7 +98,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -121,7 +122,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -144,7 +146,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 8b1239565a1..74b8d94cfc8 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,7 +98,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -122,7 +122,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp
index 58fb2602075..99036fd628f 100644
--- a/omp/preconditioner/batch_jacobi_kernels.cpp
+++ b/omp/preconditioner/batch_jacobi_kernels.cpp
@@ -74,7 +74,7 @@ void extract_common_blocks_pattern(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -102,7 +102,7 @@ void compute_block_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index d7fbf3ce214..4f48a0b6f94 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -35,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -56,7 +56,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -77,7 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -98,7 +98,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -116,7 +116,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -134,7 +134,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index d3304ab9795..c277d4f0738 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -117,7 +117,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 599af30ecfb..9c92fb54056 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,7 +94,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -116,7 +117,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -138,7 +140,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index 1a4855f389f..bc0eb61e30d 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -117,7 +117,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp
index f994c8c448b..3f6d75cca29 100644
--- a/reference/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/preconditioner/batch_jacobi_kernels.cpp
@@ -70,7 +70,7 @@ void extract_common_blocks_pattern(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_block_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 

From 04a5b9b7d31fbb467f7a39aa6662bea1a9474f98 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 29 Oct 2024 01:06:15 +0100
Subject: [PATCH 402/448] batch test with half

---
 core/test/base/batch_multi_vector.cpp         |  3 ++-
 core/test/matrix/batch_csr.cpp                |  2 +-
 core/test/matrix/batch_dense.cpp              |  2 +-
 core/test/matrix/batch_ell.cpp                |  2 +-
 core/test/matrix/batch_identity.cpp           |  3 ++-
 core/test/solver/batch_bicgstab.cpp           |  3 ++-
 core/test/solver/batch_cg.cpp                 |  2 +-
 core/test/utils/batch_helpers.hpp             |  2 +-
 .../test/base/batch_multi_vector_kernels.cpp  | 11 ++++----
 reference/test/matrix/batch_csr_kernels.cpp   |  2 +-
 reference/test/matrix/batch_dense_kernels.cpp |  2 +-
 reference/test/matrix/batch_ell_kernels.cpp   |  2 +-
 .../test/solver/batch_bicgstab_kernels.cpp    | 27 ++++++++++++-------
 reference/test/solver/batch_cg_kernels.cpp    | 22 ++++++++++++---
 14 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp
index 3798f30ce65..7a9606bc710 100644
--- a/core/test/base/batch_multi_vector.cpp
+++ b/core/test/base/batch_multi_vector.cpp
@@ -64,7 +64,8 @@ class MultiVector : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
 };
 
-TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(MultiVector, CanBeEmpty)
diff --git a/core/test/matrix/batch_csr.cpp b/core/test/matrix/batch_csr.cpp
index 57cae53d646..3a1871ba583 100644
--- a/core/test/matrix/batch_csr.cpp
+++ b/core/test/matrix/batch_csr.cpp
@@ -114,7 +114,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<CsrMtx> sp_csr_mtx;
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Csr, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp
index 334df5c0e93..23542114746 100644
--- a/core/test/matrix/batch_dense.cpp
+++ b/core/test/matrix/batch_dense.cpp
@@ -68,7 +68,7 @@ class Dense : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
 };
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index 11f6381a43d..ae047ecfa90 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -92,7 +92,7 @@ class Ell : public ::testing::Test {
     std::unique_ptr<EllMtx> sp_ell_mtx;
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Ell, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_identity.cpp b/core/test/matrix/batch_identity.cpp
index dd7a3675110..765f9f30938 100644
--- a/core/test/matrix/batch_identity.cpp
+++ b/core/test/matrix/batch_identity.cpp
@@ -49,7 +49,8 @@ class Identity : public ::testing::Test {
     std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
 };
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, KnowsItsSizeAndValues)
diff --git a/core/test/solver/batch_bicgstab.cpp b/core/test/solver/batch_bicgstab.cpp
index cd9446d07b2..0b50f7f6e92 100644
--- a/core/test/solver/batch_bicgstab.cpp
+++ b/core/test/solver/batch_bicgstab.cpp
@@ -50,7 +50,8 @@ class BatchBicgstab : public ::testing::Test {
     std::unique_ptr<gko::batch::BatchLinOp> solver;
 };
 
-TYPED_TEST_SUITE(BatchBicgstab, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(BatchBicgstab, FactoryKnowsItsExecutor)
diff --git a/core/test/solver/batch_cg.cpp b/core/test/solver/batch_cg.cpp
index 1e97c765f8a..b517c931adf 100644
--- a/core/test/solver/batch_cg.cpp
+++ b/core/test/solver/batch_cg.cpp
@@ -50,7 +50,7 @@ class BatchCg : public ::testing::Test {
     std::unique_ptr<gko::batch::BatchLinOp> solver;
 };
 
-TYPED_TEST_SUITE(BatchCg, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchCg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(BatchCg, FactoryKnowsItsExecutor)
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index 15c4d7560d9..790034b724c 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -137,7 +137,7 @@ std::unique_ptr<MatrixType> generate_diag_dominant_batch_matrix(
                     static_cast<size_type>(num_cols)},
         {}};
     auto engine = std::default_random_engine(42);
-    auto rand_diag_dist = std::normal_distribution<real_type>(20.0, 1.0);
+    auto rand_diag_dist = std::normal_distribution<>(20.0, 1.0);
     for (int row = 0; row < num_rows; ++row) {
         std::uniform_int_distribution<index_type> rand_nnz_dist{1, row + 1};
         const auto k = rand_nnz_dist(engine);
diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
index 694ae491ef4..a860c3c4b24 100644
--- a/reference/test/base/batch_multi_vector_kernels.cpp
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -96,7 +96,8 @@ class MultiVector : public ::testing::Test {
     std::default_random_engine rand_engine;
 };
 
-TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(MultiVector, ScalesData)
@@ -342,7 +343,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto tmp = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -366,7 +367,7 @@ TYPED_TEST(MultiVector, MovesToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto tmp = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -390,7 +391,7 @@ TYPED_TEST(MultiVector, ConvertsEmptyToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto empty = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -405,7 +406,7 @@ TYPED_TEST(MultiVector, MovesEmptyToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_with_half<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto empty = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
diff --git a/reference/test/matrix/batch_csr_kernels.cpp b/reference/test/matrix/batch_csr_kernels.cpp
index 920bb67696b..85e461b933e 100644
--- a/reference/test/matrix/batch_csr_kernels.cpp
+++ b/reference/test/matrix/batch_csr_kernels.cpp
@@ -78,7 +78,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Csr, AppliesToBatchMultiVector)
diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp
index 50c1909959f..23f747c24cb 100644
--- a/reference/test/matrix/batch_dense_kernels.cpp
+++ b/reference/test/matrix/batch_dense_kernels.cpp
@@ -77,7 +77,7 @@ class Dense : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, AppliesToBatchMultiVector)
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
index a2c9ef4e83c..5e2b377eda0 100644
--- a/reference/test/matrix/batch_ell_kernels.cpp
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -79,7 +79,7 @@ class Ell : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
 
 
 TYPED_TEST(Ell, AppliesToBatchMultiVector)
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index c7b36ba875c..468b38a561b 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -75,7 +75,7 @@ class BatchBicgstab : public ::testing::Test {
         solve_lambda;
 };
 
-TYPED_TEST_SUITE(BatchBicgstab, gko::test::RealValueTypes,
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::RealValueTypesWithHalf,
                  TypenameNameGenerator);
 
 
@@ -111,8 +111,13 @@ TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsResidual)
         ASSERT_LE(
             res_log_array[i] / this->linear_system.host_rhs_norm->at(i, 0, 0),
             this->solver_settings.residual_tol);
-        ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i],
-                    10 * this->eps);
+        if (!std::is_same<real_type, gko::half>::value) {
+            // This condition is not guaranteed to hold in half precision, so
+            // the check is skipped for gko::half.
+            ASSERT_NEAR(res_log_array[i],
+                        res.host_res_norm->get_const_values()[i],
+                        10 * this->eps);
+        }
     }
 }
 
@@ -131,7 +136,7 @@ TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsIterations)
 
     auto iter_array = res.log_data->iter_counts.get_const_data();
     for (size_t i = 0; i < this->num_batch_items; i++) {
-        ASSERT_EQ(iter_array[i], ref_iters);
+        ASSERT_LE(iter_array[i], ref_iters);
     }
 }
 
@@ -142,7 +147,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
-    const real_type tol = 1e-5;
+    const real_type tol = 1e-4;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -167,7 +172,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseSystem)
     for (size_t i = 0; i < num_batch_items; i++) {
         ASSERT_LE(res.host_res_norm->get_const_values()[i] /
                       linear_system.host_rhs_norm->get_const_values()[i],
-                  tol);
+                  tol * 10);
     }
 }
 
@@ -179,7 +184,7 @@ TYPED_TEST(BatchBicgstab, ApplyLogsResAndIters)
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
     using Logger = gko::batch::log::BatchConvergence<value_type>;
-    const real_type tol = 1e-5;
+    const real_type tol = 1e-4;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -222,7 +227,7 @@ TYPED_TEST(BatchBicgstab, CanSolveEllSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::EllMtx;
-    const real_type tol = 1e-5;
+    const real_type tol = 1e-4;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -258,7 +263,7 @@ TYPED_TEST(BatchBicgstab, CanSolveCsrSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::CsrMtx;
-    const real_type tol = 1e-5;
+    const real_type tol = 1e-4;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -294,6 +299,10 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseHpdSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
+    // Need to design a better random system. With a different random value
+    // distribution, the solver cannot solve the HPD matrix even in single
+    // precision.
+    SKIP_IF_HALF(value_type);
     const real_type tol = 1e-5;
     const int max_iters = 1000;
     auto solver_factory =
diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
index 86efa158fb5..2619614278e 100644
--- a/reference/test/solver/batch_cg_kernels.cpp
+++ b/reference/test/solver/batch_cg_kernels.cpp
@@ -75,7 +75,8 @@ class BatchCg : public ::testing::Test {
         solve_lambda;
 };
 
-TYPED_TEST_SUITE(BatchCg, gko::test::RealValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchCg, gko::test::RealValueTypesWithHalf,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(BatchCg, SolvesStencilSystem)
@@ -87,7 +88,7 @@ TYPED_TEST(BatchCg, SolvesStencilSystem)
     for (size_t i = 0; i < this->num_batch_items; i++) {
         ASSERT_LE(res.host_res_norm->get_const_values()[i] /
                       this->linear_system.host_rhs_norm->get_const_values()[i],
-                  this->solver_settings.residual_tol);
+                  5 * this->solver_settings.residual_tol);
     }
     GKO_ASSERT_BATCH_MTX_NEAR(res.x, this->linear_system.exact_sol,
                               this->eps * 10);
@@ -108,8 +109,13 @@ TYPED_TEST(BatchCg, StencilSystemLoggerLogsResidual)
         ASSERT_LE(
             res_log_array[i] / this->linear_system.host_rhs_norm->at(i, 0, 0),
             this->solver_settings.residual_tol);
-        ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i],
-                    10 * this->eps);
+        if (!std::is_same<real_type, gko::half>::value) {
+            // This condition is not guaranteed to hold in half precision, so
+            // the check is skipped for gko::half.
+            ASSERT_NEAR(res_log_array[i],
+                        res.host_res_norm->get_const_values()[i],
+                        10 * this->eps);
+        }
     }
 }
 
@@ -140,6 +146,10 @@ TYPED_TEST(BatchCg, ApplyLogsResAndIters)
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
     using Logger = gko::batch::log::BatchConvergence<value_type>;
+    // Need to design a better random system. With a different random value
+    // distribution, the solver cannot solve the HPD matrix even in single
+    // precision.
+    SKIP_IF_HALF(value_type);
     const real_type tol = 1e-6;
     const int max_iters = 1000;
     auto solver_factory =
@@ -181,6 +191,10 @@ TYPED_TEST(BatchCg, CanSolveHpdSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
+    // TODO: design a better random system. Depending on the random value
+    // distribution, the solver cannot solve the HPD system even in single
+    // precision, so this test is skipped for half.
+    SKIP_IF_HALF(value_type);
     const real_type tol = 1e-6;
     const int max_iters = 1000;
     auto solver_factory =

From 24feb210f429ec0a91c4af29fa037a7825f15997 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 18:21:17 +0100
Subject: [PATCH 403/448] cuda/hip batch changes

---
 .../cuda_hip/solver/batch_bicgstab_launch.hpp |  4 +--
 common/cuda_hip/solver/batch_cg_launch.hpp    | 26 +++++++++---------
 cuda/solver/batch_bicgstab_kernels.cu         | 22 +++++++--------
 cuda/solver/batch_bicgstab_launch.cuh         | 27 ++++++++++---------
 cuda/solver/batch_cg_kernels.cu               | 14 +++++-----
 cuda/solver/batch_cg_launch.cuh               | 13 ++++-----
 hip/solver/batch_bicgstab_kernels.hip.cpp     | 22 +++++++--------
 hip/solver/batch_cg_kernels.hip.cpp           | 14 +++++-----
 8 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
index 3886c33bcd5..df7eaaa2f1b 100644
--- a/common/cuda_hip/solver/batch_bicgstab_launch.hpp
+++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp
@@ -38,11 +38,11 @@ void launch_apply_kernel(
 
 #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
                                           mat_t, log_t, pre_t, stop_t)     \
-    void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
+    void launch_apply_kernel<_vtype, _n_shared, _prec_shared,              \
                              stop_t<device_type<_vtype>>>(                 \
         std::shared_ptr<const DefaultExecutor> exec,                       \
         const gko::kernels::batch_bicgstab::storage_config& sconf,         \
-        const settings<remove_complex<device_type<_vtype>>>& settings,     \
+        const settings<remove_complex<_vtype>>& settings,                  \
         log_t<gko::remove_complex<device_type<_vtype>>>& logger,           \
         pre_t<device_type<_vtype>>& prec,                                  \
         const mat_t<const device_type<_vtype>>& mat,                       \
diff --git a/common/cuda_hip/solver/batch_cg_launch.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp
index 4306dc2bfab..9fe05f62558 100644
--- a/common/cuda_hip/solver/batch_cg_launch.hpp
+++ b/common/cuda_hip/solver/batch_cg_launch.hpp
@@ -36,19 +36,19 @@ void launch_apply_kernel(
     device_type<ValueType>* const __restrict__ workspace_data,
     const int& block_size, const size_t& shared_size);
 
-#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t,   \
-                                    log_t, pre_t, stop_t)                     \
-    void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared,    \
-                             stop_t<device_type<_vtype>>>(                    \
-        std::shared_ptr<const DefaultExecutor> exec,                          \
-        const gko::kernels::batch_cg::storage_config& sconf,                  \
-        const settings<remove_complex<_vtype>>& settings,                     \
-        log_t<device_type<gko::remove_complex<device_type<_vtype>>>>& logger, \
-        pre_t<device_type<_vtype>>& prec,                                     \
-        const mat_t<const device_type<_vtype>>& mat,                          \
-        const device_type<_vtype>* const __restrict__ b_values,               \
-        device_type<_vtype>* const __restrict__ x_values,                     \
-        device_type<_vtype>* const __restrict__ workspace_data,               \
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
+                                    log_t, pre_t, stop_t)                   \
+    void launch_apply_kernel<_vtype, _n_shared, _prec_shared,               \
+                             stop_t<device_type<_vtype>>>(                  \
+        std::shared_ptr<const DefaultExecutor> exec,                        \
+        const gko::kernels::batch_cg::storage_config& sconf,                \
+        const settings<remove_complex<_vtype>>& settings,                   \
+        log_t<gko::remove_complex<device_type<_vtype>>>& logger,            \
+        pre_t<device_type<_vtype>>& prec,                                   \
+        const mat_t<const device_type<_vtype>>& mat,                        \
+        const device_type<_vtype>* const __restrict__ b_values,             \
+        device_type<_vtype>* const __restrict__ x_values,                   \
+        device_type<_vtype>* const __restrict__ workspace_data,             \
         const int& block_size, const size_t& shared_size)
 
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE \
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 74d312c95ef..52398093ac2 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -72,58 +72,58 @@ public:
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared>
         if (sconf.prec_shared) {
-            launch_apply_kernel<cuda_value_type, 9, true, StopType>(
+            launch_apply_kernel<ValueType, 9, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<cuda_value_type, 0, false, StopType>(
+                launch_apply_kernel<ValueType, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<cuda_value_type, 1, false, StopType>(
+                launch_apply_kernel<ValueType, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<cuda_value_type, 2, false, StopType>(
+                launch_apply_kernel<ValueType, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<cuda_value_type, 3, false, StopType>(
+                launch_apply_kernel<ValueType, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<cuda_value_type, 4, false, StopType>(
+                launch_apply_kernel<ValueType, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<cuda_value_type, 5, false, StopType>(
+                launch_apply_kernel<ValueType, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 6:
-                launch_apply_kernel<cuda_value_type, 6, false, StopType>(
+                launch_apply_kernel<ValueType, 6, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 7:
-                launch_apply_kernel<cuda_value_type, 7, false, StopType>(
+                launch_apply_kernel<ValueType, 7, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 8:
-                launch_apply_kernel<cuda_value_type, 8, false, StopType>(
+                launch_apply_kernel<ValueType, 8, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 9:
-                launch_apply_kernel<cuda_value_type, 9, false, StopType>(
+                launch_apply_kernel<ValueType, 9, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh
index b4e8753ccca..81c71aa91e7 100644
--- a/cuda/solver/batch_bicgstab_launch.cuh
+++ b/cuda/solver/batch_bicgstab_launch.cuh
@@ -31,13 +31,13 @@ template <typename StopType, typename PrecType, typename LogType,
 int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows);
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(               \
-    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
-    int get_num_threads_per_block<                                          \
-        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
-        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
-        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec,     \
-                           const int num_rows)
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK( \
+    _vtype, mat_t, log_t, pre_t, stop_t)                      \
+    int get_num_threads_per_block<                            \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,  \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>,        \
+        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(   \
+        std::shared_ptr<const DefaultExecutor> exec, const int num_rows)
 
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK \
     GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK)
@@ -47,12 +47,13 @@ template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
 
-#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(           \
-    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
-    int get_max_dynamic_shared_memory<                                      \
-        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
-        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
-        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
+#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY( \
+    _vtype, mat_t, log_t, pre_t, stop_t)                          \
+    int get_max_dynamic_shared_memory<                            \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,      \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>,            \
+        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(       \
+        std::shared_ptr<const DefaultExecutor> exec)
 
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY \
     GKO_BATCH_INSTANTIATE(                                           \
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index e1aec94852b..d3d93a0af6d 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -73,38 +73,38 @@ public:
         // Template parameters launch_apply_kernel<ValueType, n_shared,
         // prec_shared, StopType>
         if (sconf.prec_shared) {
-            launch_apply_kernel<cuda_value_type, 5, true, StopType>(
+            launch_apply_kernel<ValueType, 5, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<cuda_value_type, 0, false, StopType>(
+                launch_apply_kernel<ValueType, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<cuda_value_type, 1, false, StopType>(
+                launch_apply_kernel<ValueType, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<cuda_value_type, 2, false, StopType>(
+                launch_apply_kernel<ValueType, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<cuda_value_type, 3, false, StopType>(
+                launch_apply_kernel<ValueType, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<cuda_value_type, 4, false, StopType>(
+                launch_apply_kernel<ValueType, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<cuda_value_type, 5, false, StopType>(
+                launch_apply_kernel<ValueType, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index 94d948cf202..7747cea0252 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -47,12 +47,13 @@ template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
 int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);
 
-#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(                 \
-    _vtype, mat_t, log_t, pre_t, stop_t)                                    \
-    int get_max_dynamic_shared_memory<                                      \
-        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>,                \
-        log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
-        cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(  \
+    _vtype, mat_t, log_t, pre_t, stop_t)                     \
+    int get_max_dynamic_shared_memory<                       \
+        stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
+        log_t<gko::remove_complex<cuda_type<_vtype>>>,       \
+        mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>(  \
+        std::shared_ptr<const DefaultExecutor> exec)
 
 #define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY \
     GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY)
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 66d6130cfd0..2aede809427 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -96,58 +96,58 @@ class kernel_caller {
         // Template parameters launch_apply_kernel<StopType, n_shared,
         // prec_shared)
         if (sconf.prec_shared) {
-            launch_apply_kernel<hip_value_type, 9, true, StopType>(
+            launch_apply_kernel<ValueType, 9, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<hip_value_type, 0, false, StopType>(
+                launch_apply_kernel<ValueType, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<hip_value_type, 1, false, StopType>(
+                launch_apply_kernel<ValueType, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<hip_value_type, 2, false, StopType>(
+                launch_apply_kernel<ValueType, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<hip_value_type, 3, false, StopType>(
+                launch_apply_kernel<ValueType, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<hip_value_type, 4, false, StopType>(
+                launch_apply_kernel<ValueType, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<hip_value_type, 5, false, StopType>(
+                launch_apply_kernel<ValueType, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 6:
-                launch_apply_kernel<hip_value_type, 6, false, StopType>(
+                launch_apply_kernel<ValueType, 6, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 7:
-                launch_apply_kernel<hip_value_type, 7, false, StopType>(
+                launch_apply_kernel<ValueType, 7, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 8:
-                launch_apply_kernel<hip_value_type, 8, false, StopType>(
+                launch_apply_kernel<ValueType, 8, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 9:
-                launch_apply_kernel<hip_value_type, 9, false, StopType>(
+                launch_apply_kernel<ValueType, 9, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index f36974aae06..b6d3580585e 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -98,38 +98,38 @@ class kernel_caller {
         // Template parameters launch_apply_kernel<ValueType, n_shared,
         // prec_shared, StopType>
         if (sconf.prec_shared) {
-            launch_apply_kernel<hip_value_type, 5, true, StopType>(
+            launch_apply_kernel<ValueType, 5, true, StopType>(
                 exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
                 workspace_data, block_size, shared_size);
         } else {
             switch (sconf.n_shared) {
             case 0:
-                launch_apply_kernel<hip_value_type, 0, false, StopType>(
+                launch_apply_kernel<ValueType, 0, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 1:
-                launch_apply_kernel<hip_value_type, 1, false, StopType>(
+                launch_apply_kernel<ValueType, 1, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 2:
-                launch_apply_kernel<hip_value_type, 2, false, StopType>(
+                launch_apply_kernel<ValueType, 2, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 3:
-                launch_apply_kernel<hip_value_type, 3, false, StopType>(
+                launch_apply_kernel<ValueType, 3, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 4:
-                launch_apply_kernel<hip_value_type, 4, false, StopType>(
+                launch_apply_kernel<ValueType, 4, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;
             case 5:
-                launch_apply_kernel<hip_value_type, 5, false, StopType>(
+                launch_apply_kernel<ValueType, 5, false, StopType>(
                     exec_, sconf, settings_, logger, prec, mat, b.values,
                     x.values, workspace_data, block_size, shared_size);
                 break;

From 126439da4cc18f4aa9d1a457a316229c06f499f5 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 10:51:45 +0200
Subject: [PATCH 404/448] add device_type in sycl

---
 accessor/sycl_helper.hpp | 192 +++++++++++++++++++++++++++++++++++++++
 dpcpp/base/types.hpp     | 125 +++++++++++++++++++++++++
 2 files changed, 317 insertions(+)
 create mode 100644 accessor/sycl_helper.hpp
 create mode 100644 dpcpp/base/types.hpp

diff --git a/accessor/sycl_helper.hpp b/accessor/sycl_helper.hpp
new file mode 100644
index 00000000000..793587c30d3
--- /dev/null
+++ b/accessor/sycl_helper.hpp
@@ -0,0 +1,192 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_ACCESSOR_SYCL_HELPER_HPP_
+#define GKO_ACCESSOR_SYCL_HELPER_HPP_
+
+
+#include <complex>
+#include <type_traits>
+
+#include "block_col_major.hpp"
+#include "reduced_row_major.hpp"
+#include "row_major.hpp"
+#include "scaled_reduced_row_major.hpp"
+#include "utils.hpp"
+
+
+namespace sycl {
+inline namespace _V1 {
+
+
+class half;
+
+
+}
+}  // namespace sycl
+
+
+namespace gko {
+
+
+class half;
+
+
+namespace acc {
+namespace detail {
+
+
+template <typename T>
+struct sycl_type {
+    using type = T;
+};
+
+template <>
+struct sycl_type<gko::half> {
+    using type = sycl::half;
+};
+
+// Unpack cv and reference / pointer qualifiers
+template <typename T>
+struct sycl_type<const T> {
+    using type = const typename sycl_type<T>::type;
+};
+
+template <typename T>
+struct sycl_type<volatile T> {
+    using type = volatile typename sycl_type<T>::type;
+};
+
+template <typename T>
+struct sycl_type<T*> {
+    using type = typename sycl_type<T>::type*;
+};
+
+template <typename T>
+struct sycl_type<T&> {
+    using type = typename sycl_type<T>::type&;
+};
+
+template <typename T>
+struct sycl_type<T&&> {
+    using type = typename sycl_type<T>::type&&;
+};
+
+
+// Transform the underlying type of std::complex
+template <typename T>
+struct sycl_type<std::complex<T>> {
+    using type = std::complex<typename sycl_type<T>::type>;
+};
+
+
+}  // namespace detail
+
+
+/**
+ * This is an alias for SYCL's equivalent of `T`.
+ *
+ * @tparam T  a type
+ */
+template <typename T>
+using sycl_type_t = typename detail::sycl_type<T>::type;
+
+
+/**
+ * Reinterprets the passed in value as a SYCL type.
+ *
+ * @param val  the value to reinterpret
+ *
+ * @return `val` reinterpreted to SYCL type
+ */
+template <typename T>
+std::enable_if_t<std::is_pointer<T>::value || std::is_reference<T>::value,
+                 sycl_type_t<T>>
+as_sycl_type(T val)
+{
+    return reinterpret_cast<sycl_type_t<T>>(val);
+}
+
+
+/**
+ * @copydoc as_sycl_type()
+ */
+template <typename T>
+std::enable_if_t<!std::is_pointer<T>::value && !std::is_reference<T>::value,
+                 sycl_type_t<T>>
+as_sycl_type(T val)
+{
+    return *reinterpret_cast<sycl_type_t<T>*>(&val);
+}
+
+
+/**
+ * Changes the types and reinterprets the passed in range pointers as SYCL
+ * types.
+ *
+ * @param r  the range whose pointers need to be reinterpreted
+ *
+ * @return `r` with appropriate types and reinterpreted to SYCL pointers
+ */
+template <std::size_t dim, typename Type1, typename Type2>
+GKO_ACC_INLINE auto as_sycl_range(
+    const range<reduced_row_major<dim, Type1, Type2>>& r)
+{
+    return range<
+        reduced_row_major<dim, sycl_type_t<Type1>, sycl_type_t<Type2>>>(
+        r.get_accessor().get_size(),
+        as_sycl_type(r.get_accessor().get_stored_data()),
+        r.get_accessor().get_stride());
+}
+
+/**
+ * @copydoc as_sycl_range()
+ */
+template <std::size_t dim, typename Type1, typename Type2, std::uint64_t mask>
+GKO_ACC_INLINE auto as_sycl_range(
+    const range<scaled_reduced_row_major<dim, Type1, Type2, mask>>& r)
+{
+    return range<scaled_reduced_row_major<dim, sycl_type_t<Type1>,
+                                          sycl_type_t<Type2>, mask>>(
+        r.get_accessor().get_size(),
+        as_sycl_type(r.get_accessor().get_stored_data()),
+        r.get_accessor().get_storage_stride(),
+        as_sycl_type(r.get_accessor().get_scalar()),
+        r.get_accessor().get_scalar_stride());
+}
+
+/**
+ * @copydoc as_sycl_range()
+ */
+template <typename T, size_type dim>
+GKO_ACC_INLINE auto as_sycl_range(const range<block_col_major<T, dim>>& r)
+{
+    return range<block_col_major<sycl_type_t<T>, dim>>(
+        r.get_accessor().lengths, as_sycl_type(r.get_accessor().data),
+        r.get_accessor().stride);
+}
+
+/**
+ * @copydoc as_sycl_range()
+ */
+template <typename T, size_type dim>
+GKO_ACC_INLINE auto as_sycl_range(const range<row_major<T, dim>>& r)
+{
+    return range<row_major<sycl_type_t<T>, dim>>(
+        r.get_accessor().lengths, as_sycl_type(r.get_accessor().data),
+        r.get_accessor().stride);
+}
+
+template <typename AccType>
+GKO_ACC_INLINE auto as_device_range(AccType&& acc)
+{
+    return as_sycl_range(std::forward<AccType>(acc));
+}
+
+
+}  // namespace acc
+}  // namespace gko
+
+
+#endif  // GKO_ACCESSOR_SYCL_HELPER_HPP_
diff --git a/dpcpp/base/types.hpp b/dpcpp/base/types.hpp
new file mode 100644
index 00000000000..64c446c356e
--- /dev/null
+++ b/dpcpp/base/types.hpp
@@ -0,0 +1,125 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_DPCPP_BASE_TYPES_HPP_
+#define GKO_DPCPP_BASE_TYPES_HPP_
+
+
+#include <type_traits>
+
+#include <sycl/half_type.hpp>
+
+#include <ginkgo/core/base/half.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace detail {
+
+
+template <typename T>
+struct sycl_type_impl {
+    using type = T;
+};
+
+template <typename T>
+struct sycl_type_impl<T*> {
+    using type = typename sycl_type_impl<T>::type*;
+};
+
+template <typename T>
+struct sycl_type_impl<T&> {
+    using type = typename sycl_type_impl<T>::type&;
+};
+
+template <typename T>
+struct sycl_type_impl<const T> {
+    using type = const typename sycl_type_impl<T>::type;
+};
+
+template <typename T>
+struct sycl_type_impl<volatile T> {
+    using type = volatile typename sycl_type_impl<T>::type;
+};
+
+template <>
+struct sycl_type_impl<half> {
+    using type = sycl::half;
+};
+
+template <typename T>
+struct sycl_type_impl<std::complex<T>> {
+    using type = std::complex<typename sycl_type_impl<T>::type>;
+};
+
+}  // namespace detail
+
+
+/**
+ * This is an alias for SYCL's equivalent of `T`.
+ *
+ * @tparam T  a type
+ */
+template <typename T>
+using sycl_type = typename detail::sycl_type_impl<T>::type;
+
+/**
+ * This is an alias for SYCL's equivalent of `T` (same as sycl_type<T>).
+ *
+ * @tparam T  a type
+ */
+template <typename T>
+using device_type = sycl_type<T>;
+
+
+/**
+ * Reinterprets the passed in value as a SYCL type.
+ *
+ * @param val  the value to reinterpret
+ *
+ * @return `val` reinterpreted to SYCL type
+ */
+template <typename T>
+inline std::enable_if_t<
+    std::is_pointer<T>::value || std::is_reference<T>::value, sycl_type<T>>
+as_sycl_type(T val)
+{
+    return reinterpret_cast<sycl_type<T>>(val);
+}
+
+
+/**
+ * @copydoc as_sycl_type()
+ */
+template <typename T>
+inline std::enable_if_t<
+    !std::is_pointer<T>::value && !std::is_reference<T>::value, sycl_type<T>>
+as_sycl_type(T val)
+{
+    return *reinterpret_cast<sycl_type<T>*>(&val);
+}
+
+
+/**
+ * Reinterprets the passed in value as its SYCL type by forwarding to
+ * as_sycl_type().
+ *
+ * @param val  the value to reinterpret
+ *
+ * @return `val` reinterpreted to SYCL type
+ */
+template <typename T>
+inline device_type<T> as_device_type(T val)
+{
+    return as_sycl_type(val);
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_DPCPP_BASE_TYPES_HPP_

From 0819ce2287029c5071dc2afb5a8cf156d3627f5d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 10:51:58 +0200
Subject: [PATCH 405/448] add device_type in kernel_launch

---
 common/unified/base/kernel_launch.hpp     | 12 ++----------
 dpcpp/base/kernel_launch_reduction.dp.hpp | 13 ++++++++-----
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index d4810e1aa95..248c4671623 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -76,16 +76,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-
-
-template <typename T>
-using device_type = T;
-
-template <typename T>
-device_type<T> as_device_type(T value)
-{
-    return value;
-}
+#include "dpcpp/base/types.hpp"
 
 
 template <typename T>
@@ -97,6 +88,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
     return value;
 }
 
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index 83436966ecb..f45a92269a5 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -239,7 +239,8 @@ void run_kernel_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
         [&](std::uint32_t cfg) { return cfg == desired_cfg; },
         syn::value_list<bool>(), syn::value_list<int>(),
         syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
-        finalize, identity, result, size, tmp, map_to_device(args)...);
+        finalize, as_device_type(identity), as_device_type(result), size, tmp,
+        map_to_device(args)...);
 }
 
 
@@ -261,7 +262,8 @@ void run_kernel_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
         [&](std::uint32_t cfg) { return cfg == desired_cfg; },
         syn::value_list<bool>(), syn::value_list<int>(),
         syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
-        finalize, identity, result, size, tmp, map_to_device(args)...);
+        finalize, as_device_type(identity), as_device_type(result), size, tmp,
+        map_to_device(args)...);
 }
 
 
@@ -658,8 +660,8 @@ void run_kernel_row_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
         [&](std::uint32_t cfg) { return cfg == desired_cfg; },
         syn::value_list<bool>(), syn::value_list<int>(),
         syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
-        finalize, identity, result, result_stride, size, tmp,
-        map_to_device(args)...);
+        finalize, as_device_type(identity), as_device_type(result),
+        result_stride, size, tmp, map_to_device(args)...);
 }
 
 
@@ -681,7 +683,8 @@ void run_kernel_col_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
         [&](std::uint32_t cfg) { return cfg == desired_cfg; },
         syn::value_list<bool>(), syn::value_list<int>(),
         syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
-        finalize, identity, result, size, tmp, map_to_device(args)...);
+        finalize, as_device_type(identity), as_device_type(result), size, tmp,
+        map_to_device(args)...);
 }
 
 

From 8bc3c97aea7b6bf744bf3f81ea57f3bb1054d9a6 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 13:30:26 +0200
Subject: [PATCH 406/448] reduction sycl type

---
 dpcpp/components/reduction.dp.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index aed8166d601..933f6db7817 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -21,6 +21,7 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/components/uninitialized_array.hpp"
@@ -189,8 +190,9 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                       const ValueType* source, ValueType* result)
 {
     queue->submit([&](sycl::handler& cgh) {
-        sycl::local_accessor<
-            uninitialized_array<ValueType, DeviceConfig::block_size>, 0>
+        sycl::local_accessor<uninitialized_array<device_type<ValueType>,
+                                                 DeviceConfig::block_size>,
+                             0>
             block_sum_acc_ct1(cgh);
 
         cgh.parallel_for(
@@ -198,8 +200,8 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             [=](sycl::nd_item<3> item_ct1)
                 [[sycl::reqd_sub_group_size(DeviceConfig::subgroup_size)]] {
                     reduce_add_array<DeviceConfig>(
-                        size, source, result, item_ct1,
-                        *block_sum_acc_ct1.get_pointer());
+                        size, as_device_type(source), as_device_type(result),
+                        item_ct1, *block_sum_acc_ct1.get_pointer());
                 });
     });
 }

From 45411ef5e3220ceea5f7fa299e239bc17873bd24 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 13:30:35 +0200
Subject: [PATCH 407/448] component sycl type

---
 dpcpp/base/device_matrix_data_kernels.dp.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index a5f58831a27..f676e09321a 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -8,7 +8,9 @@
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/onedpl.hpp"
+#include "dpcpp/base/types.hpp"
 
 
 namespace gko {
@@ -22,12 +24,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
                   array<IndexType>& col_idxs)
 {
-    using nonzero_type = matrix_data_entry<ValueType, IndexType>;
+    using device_value_type = device_type<ValueType>;
     auto size = values.get_size();
     auto policy = onedpl_policy(exec);
-    auto nnz = std::count_if(
-        policy, values.get_const_data(), values.get_const_data() + size,
-        [](ValueType val) { return is_nonzero<ValueType>(val); });
+    auto nnz =
+        std::count_if(policy, as_device_type(values.get_const_data()),
+                      as_device_type(values.get_const_data()) + size,
+                      [](device_value_type val) { return is_nonzero(val); });
     if (nnz < size) {
         // allocate new storage
         array<ValueType> new_values{exec, static_cast<size_type>(nnz)};
@@ -36,10 +39,10 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
         // copy nonzeros
         auto input_it = oneapi::dpl::make_zip_iterator(
             row_idxs.get_const_data(), col_idxs.get_const_data(),
-            values.get_const_data());
-        auto output_it = oneapi::dpl::make_zip_iterator(new_row_idxs.get_data(),
-                                                        new_col_idxs.get_data(),
-                                                        new_values.get_data());
+            as_device_type(values.get_const_data()));
+        auto output_it = oneapi::dpl::make_zip_iterator(
+            new_row_idxs.get_data(), new_col_idxs.get_data(),
+            as_device_type(new_values.get_data()));
         std::copy_if(policy, input_it, input_it + size, output_it,
                      [](auto tuple) { return is_nonzero(std::get<2>(tuple)); });
         // swap out storage

From 38d6e1ea86d4b93d1cc0c875604b32d00666ce6c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 13:30:56 +0200
Subject: [PATCH 408/448] matrix sycl type

---
 dpcpp/matrix/coo_kernels.dp.cpp          |  35 ++--
 dpcpp/matrix/csr_kernels.dp.cpp          | 214 +++++++++++++----------
 dpcpp/matrix/dense_kernels.dp.cpp        |  50 +++---
 dpcpp/matrix/diagonal_kernels.dp.cpp     |   5 +-
 dpcpp/matrix/ell_kernels.dp.cpp          |  15 +-
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp |  13 +-
 6 files changed, 189 insertions(+), 143 deletions(-)

diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index 7e8a9acfac3..c8f79968577 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -294,20 +294,22 @@ void spmv2(std::shared_ptr<const DpcppExecutor> exec,
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
             int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             abstract_spmv(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_lines, a->get_const_values(),
+                          num_lines, as_device_type(a->get_const_values()),
                           a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          b->get_const_values(), b->get_stride(),
-                          c->get_values(), c->get_stride());
+                          as_device_type(b->get_const_values()),
+                          b->get_stride(), as_device_type(c->get_values()),
+                          c->get_stride());
         } else {
             int num_elems =
                 ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
                                 ceildiv(b_ncols, config::warp_size));
             abstract_spmm(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_elems, a->get_const_values(),
+                          num_elems, as_device_type(a->get_const_values()),
                           a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          b_ncols, b->get_const_values(), b->get_stride(),
-                          c->get_values(), c->get_stride());
+                          b_ncols, as_device_type(b->get_const_values()),
+                          b->get_stride(), as_device_type(c->get_values()),
+                          c->get_stride());
         }
     }
 }
@@ -333,21 +335,24 @@ void advanced_spmv2(std::shared_ptr<const DpcppExecutor> exec,
             int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
             abstract_spmv(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_lines, alpha->get_const_values(),
-                          a->get_const_values(), a->get_const_col_idxs(),
-                          a->get_const_row_idxs(), b->get_const_values(),
-                          b->get_stride(), c->get_values(), c->get_stride());
+                          num_lines, as_device_type(alpha->get_const_values()),
+                          as_device_type(a->get_const_values()),
+                          a->get_const_col_idxs(), a->get_const_row_idxs(),
+                          as_device_type(b->get_const_values()),
+                          b->get_stride(), as_device_type(c->get_values()),
+                          c->get_stride());
         } else {
             int num_elems =
                 ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
                                 ceildiv(b_ncols, config::warp_size));
             abstract_spmm(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_elems, alpha->get_const_values(),
-                          a->get_const_values(), a->get_const_col_idxs(),
-                          a->get_const_row_idxs(), b_ncols,
-                          b->get_const_values(), b->get_stride(),
-                          c->get_values(), c->get_stride());
+                          num_elems, as_device_type(alpha->get_const_values()),
+                          as_device_type(a->get_const_values()),
+                          a->get_const_col_idxs(), a->get_const_row_idxs(),
+                          b_ncols, as_device_type(b->get_const_values()),
+                          b->get_stride(), as_device_type(c->get_values()),
+                          c->get_stride());
         }
     }
 }
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index efcb9b7f470..9085dd9140e 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -18,6 +18,7 @@
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 
+#include "accessor/sycl_helper.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/base/utils.hpp"
@@ -32,6 +33,7 @@
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -1242,29 +1244,35 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
             if (grid_num > 0) {
                 csr::kernel::abstract_merge_path_spmv<items_per_thread>(
                     grid, block, 0, exec->get_queue(),
-                    static_cast<IndexType>(a->get_size()[0]), a_vals,
-                    a->get_const_col_idxs(), a->get_const_row_ptrs(),
-                    a->get_const_srow(), b_vals, c_vals, row_out.get_data(),
-                    val_out.get_data());
+                    static_cast<IndexType>(a->get_size()[0]),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    a->get_const_row_ptrs(), a->get_const_srow(),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals),
+                    row_out.get_data(), as_device_type(val_out.get_data()));
             }
             csr::kernel::abstract_reduce(
                 1, spmv_block_size, 0, exec->get_queue(), grid_num,
-                val_out.get_data(), row_out.get_data(), c_vals);
+                as_device_type(val_out.get_data()), row_out.get_data(),
+                acc::as_device_range(c_vals));
 
         } else if (alpha != nullptr && beta != nullptr) {
             if (grid_num > 0) {
                 csr::kernel::abstract_merge_path_spmv<items_per_thread>(
                     grid, block, 0, exec->get_queue(),
                     static_cast<IndexType>(a->get_size()[0]),
-                    alpha->get_const_values(), a_vals, a->get_const_col_idxs(),
-                    a->get_const_row_ptrs(), a->get_const_srow(), b_vals,
-                    beta->get_const_values(), c_vals, row_out.get_data(),
-                    val_out.get_data());
+                    as_device_type(alpha->get_const_values()),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    a->get_const_row_ptrs(), a->get_const_srow(),
+                    acc::as_device_range(b_vals),
+                    as_device_type(beta->get_const_values()),
+                    acc::as_device_range(c_vals), row_out.get_data(),
+                    as_device_type(val_out.get_data()));
             }
-            csr::kernel::abstract_reduce(1, spmv_block_size, 0,
-                                         exec->get_queue(), grid_num,
-                                         val_out.get_data(), row_out.get_data(),
-                                         alpha->get_const_values(), c_vals);
+            csr::kernel::abstract_reduce(
+                1, spmv_block_size, 0, exec->get_queue(), grid_num,
+                as_device_type(val_out.get_data()), row_out.get_data(),
+                as_device_type(alpha->get_const_values()),
+                acc::as_device_range(c_vals));
         } else {
             GKO_KERNEL_NOT_FOUND;
         }
@@ -1318,17 +1326,20 @@ void classical_spmv(syn::value_list<int, subgroup_size>,
     if (alpha == nullptr && beta == nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subgroup_size>(
-                grid, block, 0, exec->get_queue(), a->get_size()[0], a_vals,
-                a->get_const_col_idxs(), a->get_const_row_ptrs(), b_vals,
-                c_vals);
+                grid, block, 0, exec->get_queue(), a->get_size()[0],
+                acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                a->get_const_row_ptrs(), acc::as_device_range(b_vals),
+                acc::as_device_range(c_vals));
         }
     } else if (alpha != nullptr && beta != nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subgroup_size>(
                 grid, block, 0, exec->get_queue(), a->get_size()[0],
-                alpha->get_const_values(), a_vals, a->get_const_col_idxs(),
-                a->get_const_row_ptrs(), b_vals, beta->get_const_values(),
-                c_vals);
+                as_device_type(alpha->get_const_values()),
+                acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                a->get_const_row_ptrs(), acc::as_device_range(b_vals),
+                as_device_type(beta->get_const_values()),
+                acc::as_device_range(c_vals));
         }
     } else {
         GKO_KERNEL_NOT_FOUND;
@@ -1369,17 +1380,19 @@ void load_balance_spmv(std::shared_ptr<const DpcppExecutor> exec,
                 csr::kernel::abstract_spmv(
                     csr_grid, csr_block, 0, exec->get_queue(), nwarps,
                     static_cast<IndexType>(a->get_size()[0]),
-                    alpha->get_const_values(), a_vals, a->get_const_col_idxs(),
-                    a->get_const_row_ptrs(), a->get_const_srow(), b_vals,
-                    c_vals);
+                    as_device_type(alpha->get_const_values()),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    a->get_const_row_ptrs(), a->get_const_srow(),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
                 csr::kernel::abstract_spmv(
                     csr_grid, csr_block, 0, exec->get_queue(), nwarps,
-                    static_cast<IndexType>(a->get_size()[0]), a_vals,
-                    a->get_const_col_idxs(), a->get_const_row_ptrs(),
-                    a->get_const_srow(), b_vals, c_vals);
+                    static_cast<IndexType>(a->get_size()[0]),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    a->get_const_row_ptrs(), a->get_const_srow(),
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         }
     }
@@ -1720,9 +1733,10 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     kernel::compute_submatrix_idxs_and_vals(
         grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols, num_nnz,
         row_offset, col_offset, source->get_const_row_ptrs(),
-        source->get_const_col_idxs(), source->get_const_values(),
+        source->get_const_col_idxs(),
+        as_device_type(source->get_const_values()),
         result->get_const_row_ptrs(), result->get_col_idxs(),
-        result->get_values());
+        as_device_type(result->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -1874,8 +1888,8 @@ auto spgemm_multiway_merge(size_type row,
                            const typename HeapElement::index_type* b_cols,
                            const typename HeapElement::value_type* b_vals,
                            HeapElement* heap, InitCallback init_cb,
-                           StepCallback step_cb, ColCallback col_cb)
-    -> decltype(init_cb(0))
+                           StepCallback step_cb,
+                           ColCallback col_cb) -> decltype(init_cb(0))
 {
     auto a_begin = a_row_ptrs[row];
     auto a_end = a_row_ptrs[row + 1];
@@ -1937,19 +1951,20 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
     auto num_rows = a->get_size()[0];
     const auto a_row_ptrs = a->get_const_row_ptrs();
     const auto a_cols = a->get_const_col_idxs();
-    const auto a_vals = a->get_const_values();
+    const auto a_vals = as_device_type(a->get_const_values());
     const auto b_row_ptrs = b->get_const_row_ptrs();
     const auto b_cols = b->get_const_col_idxs();
-    const auto b_vals = b->get_const_values();
+    const auto b_vals = as_device_type(b->get_const_values());
     auto c_row_ptrs = c->get_row_ptrs();
     auto queue = exec->get_queue();
 
-    array<val_heap_element<ValueType, IndexType>> heap_array(
+    using device_value_type = device_type<ValueType>;
+    array<val_heap_element<device_value_type, IndexType>> heap_array(
         exec, a->get_num_stored_elements());
 
     auto heap = heap_array.get_data();
     auto col_heap =
-        reinterpret_cast<col_heap_element<ValueType, IndexType>*>(heap);
+        reinterpret_cast<col_heap_element<device_value_type, IndexType>*>(heap);
 
     // first sweep: count nnz for each row
     queue->submit([&](sycl::handler& cgh) {
@@ -1958,7 +1973,7 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
             c_row_ptrs[a_row] = spgemm_multiway_merge(
                 a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals,
                 col_heap, [](size_type) { return IndexType{}; },
-                [](ValueType, IndexType, IndexType&) {},
+                [](device_value_type, IndexType, IndexType&) {},
                 [](IndexType, IndexType& nnz) { nnz++; });
         });
     });
@@ -1974,7 +1989,7 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
     c_col_idxs_array.resize_and_reset(new_nnz);
     c_vals_array.resize_and_reset(new_nnz);
     auto c_col_idxs = c_col_idxs_array.get_data();
-    auto c_vals = c_vals_array.get_data();
+    auto c_vals = as_device_type(c_vals_array.get_data());
 
     queue->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
@@ -1983,16 +1998,18 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
                 a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals,
                 heap,
                 [&](size_type row) {
-                    return std::make_pair(zero<ValueType>(), c_row_ptrs[row]);
+                    return std::make_pair(zero<device_value_type>(),
+                                          c_row_ptrs[row]);
                 },
-                [](ValueType val, IndexType,
-                   std::pair<ValueType, IndexType>& state) {
+                [](device_value_type val, IndexType,
+                   std::pair<device_value_type, IndexType>& state) {
                     state.first += val;
                 },
-                [&](IndexType col, std::pair<ValueType, IndexType>& state) {
+                [&](IndexType col,
+                    std::pair<device_value_type, IndexType>& state) {
                     c_col_idxs[state.second] = col;
                     c_vals[state.second] = state.first;
-                    state.first = zero<ValueType>();
+                    state.first = zero<device_value_type>();
                     state.second++;
                 });
         });
@@ -2015,27 +2032,27 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
     auto num_rows = a->get_size()[0];
     const auto a_row_ptrs = a->get_const_row_ptrs();
     const auto a_cols = a->get_const_col_idxs();
-    const auto a_vals = a->get_const_values();
+    const auto a_vals = as_device_type(a->get_const_values());
     const auto b_row_ptrs = b->get_const_row_ptrs();
     const auto b_cols = b->get_const_col_idxs();
-    const auto b_vals = b->get_const_values();
+    const auto b_vals = as_device_type(b->get_const_values());
     const auto d_row_ptrs = d->get_const_row_ptrs();
     const auto d_cols = d->get_const_col_idxs();
-    const auto d_vals = d->get_const_values();
+    const auto d_vals = as_device_type(d->get_const_values());
     auto c_row_ptrs = c->get_row_ptrs();
-    const auto alpha_vals = alpha->get_const_values();
-    const auto beta_vals = beta->get_const_values();
+    const auto alpha_vals = as_device_type(alpha->get_const_values());
+    const auto beta_vals = as_device_type(beta->get_const_values());
     constexpr auto sentinel = std::numeric_limits<IndexType>::max();
     auto queue = exec->get_queue();
 
     // first sweep: count nnz for each row
-
-    array<val_heap_element<ValueType, IndexType>> heap_array(
+    using device_value_type = device_type<ValueType>;
+    array<val_heap_element<device_value_type, IndexType>> heap_array(
         exec, a->get_num_stored_elements());
 
     auto heap = heap_array.get_data();
     auto col_heap =
-        reinterpret_cast<col_heap_element<ValueType, IndexType>*>(heap);
+        reinterpret_cast<col_heap_element<device_value_type, IndexType>*>(heap);
 
     // first sweep: count nnz for each row
     queue->submit([&](sycl::handler& cgh) {
@@ -2047,7 +2064,7 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
             c_row_ptrs[a_row] = spgemm_multiway_merge(
                 a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals,
                 col_heap, [](size_type row) { return IndexType{}; },
-                [](ValueType, IndexType, IndexType&) {},
+                [](device_value_type, IndexType, IndexType&) {},
                 [&](IndexType col, IndexType& nnz) {
                     // skip smaller elements from d
                     while (d_col <= col) {
@@ -2074,7 +2091,7 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
     c_vals_array.resize_and_reset(new_nnz);
 
     auto c_col_idxs = c_col_idxs_array.get_data();
-    auto c_vals = c_vals_array.get_data();
+    auto c_vals = as_device_type(c_vals_array.get_data());
 
     queue->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
@@ -2082,24 +2099,26 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
             auto d_nz = d_row_ptrs[a_row];
             const auto d_end = d_row_ptrs[a_row + 1];
             auto d_col = checked_load(d_cols, d_nz, d_end, sentinel);
-            auto d_val = checked_load(d_vals, d_nz, d_end, zero<ValueType>());
-            const auto valpha = alpha_vals[0];
-            const auto vbeta = beta_vals[0];
+            auto d_val =
+                checked_load(d_vals, d_nz, d_end, zero<device_value_type>());
+            const auto valpha = as_device_type(alpha_vals[0]);
+            const auto vbeta = as_device_type(beta_vals[0]);
             auto c_nz =
                 spgemm_multiway_merge(
                     a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols,
                     b_vals, heap,
                     [&](size_type row) {
-                        return std::make_pair(zero<ValueType>(),
+                        return std::make_pair(zero<device_value_type>(),
                                               c_row_ptrs[row]);
                     },
-                    [](ValueType val, IndexType,
-                       std::pair<ValueType, IndexType>& state) {
+                    [](device_value_type val, IndexType,
+                       std::pair<device_value_type, IndexType>& state) {
                         state.first += val;
                     },
-                    [&](IndexType col, std::pair<ValueType, IndexType>& state) {
+                    [&](IndexType col,
+                        std::pair<device_value_type, IndexType>& state) {
                         // handle smaller elements from d
-                        ValueType part_d_val{};
+                        device_value_type part_d_val{};
                         while (d_col <= col) {
                             if (d_col == col) {
                                 part_d_val = d_val;
@@ -2111,12 +2130,12 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
                             d_nz++;
                             d_col = checked_load(d_cols, d_nz, d_end, sentinel);
                             d_val = checked_load(d_vals, d_nz, d_end,
-                                                 zero<ValueType>());
+                                                 zero<device_value_type>());
                         }
                         c_col_idxs[state.second] = col;
                         c_vals[state.second] =
                             vbeta * part_d_val + valpha * state.first;
-                        state.first = zero<ValueType>();
+                        state.first = zero<device_value_type>();
                         state.second++;
                     })
                     .second;
@@ -2127,7 +2146,8 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
                 c_nz++;
                 d_nz++;
                 d_col = checked_load(d_cols, d_nz, d_end, sentinel);
-                d_val = checked_load(d_vals, d_nz, d_end, zero<ValueType>());
+                d_val = checked_load(d_vals, d_nz, d_end,
+                                     zero<device_value_type>());
             }
         });
     });
@@ -2184,13 +2204,14 @@ void spgeam(std::shared_ptr<const DpcppExecutor> exec,
     c_col_idxs_array.resize_and_reset(new_nnz);
     c_vals_array.resize_and_reset(new_nnz);
     auto c_cols = c_col_idxs_array.get_data();
-    auto c_vals = c_vals_array.get_data();
+    auto c_vals = as_device_type(c_vals_array.get_data());
 
-    const auto a_vals = a->get_const_values();
-    const auto b_vals = b->get_const_values();
-    const auto alpha_vals = alpha->get_const_values();
-    const auto beta_vals = beta->get_const_values();
+    const auto a_vals = as_device_type(a->get_const_values());
+    const auto b_vals = as_device_type(b->get_const_values());
+    const auto alpha_vals = as_device_type(alpha->get_const_values());
+    const auto beta_vals = as_device_type(beta->get_const_values());
 
+    using device_value_type = device_type<ValueType>;
     // count number of non-zeros per row
     queue->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
@@ -2207,8 +2228,10 @@ void spgeam(std::shared_ptr<const DpcppExecutor> exec,
                 const auto b_col = checked_load(b_cols, b_idx, b_end, sentinel);
                 const bool use_a = a_col <= b_col;
                 const bool use_b = b_col <= a_col;
-                const auto a_val = use_a ? a_vals[a_idx] : zero<ValueType>();
-                const auto b_val = use_b ? b_vals[b_idx] : zero<ValueType>();
+                const auto a_val =
+                    use_a ? a_vals[a_idx] : zero<device_value_type>();
+                const auto b_val =
+                    use_b ? b_vals[b_idx] : zero<device_value_type>();
                 c_cols[c_nz] = std::min(a_col, b_col);
                 c_vals[c_nz] = alpha * a_val + beta * b_val;
                 c_nz++;
@@ -2233,12 +2256,12 @@ void fill_in_dense(std::shared_ptr<const DpcppExecutor> exec,
     const auto stride = result->get_stride();
     const auto row_ptrs = source->get_const_row_ptrs();
     const auto col_idxs = source->get_const_col_idxs();
-    const auto vals = source->get_const_values();
+    const auto vals = as_device_type(source->get_const_values());
 
     auto grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_dense(grid_dim, default_block_size, 0, exec->get_queue(),
                           num_rows, row_ptrs, col_idxs, vals, stride,
-                          result->get_values());
+                          as_device_type(result->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2265,13 +2288,13 @@ void generic_transpose(std::shared_ptr<const DpcppExecutor> exec,
     auto queue = exec->get_queue();
     const auto row_ptrs = orig->get_const_row_ptrs();
     const auto cols = orig->get_const_col_idxs();
-    const auto vals = orig->get_const_values();
+    const auto vals = as_device_type(orig->get_const_values());
 
     array<IndexType> counts{exec, num_cols + 1};
     auto tmp_counts = counts.get_data();
     auto out_row_ptrs = trans->get_row_ptrs();
     auto out_cols = trans->get_col_idxs();
-    auto out_vals = trans->get_values();
+    auto out_vals = as_device_type(trans->get_values());
     components::fill_array(exec, tmp_counts, num_cols, IndexType{});
 
     queue->submit([&](sycl::handler& cgh) {
@@ -2348,8 +2371,8 @@ void inv_symm_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_symm_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), permuted->get_row_ptrs(),
-        permuted->get_col_idxs(), permuted->get_values());
+        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
+        permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2374,9 +2397,9 @@ void inv_nonsymm_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_nonsymm_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         row_perm, col_perm, orig->get_const_row_ptrs(),
-        orig->get_const_col_idxs(), orig->get_const_values(),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
         permuted->get_row_ptrs(), permuted->get_col_idxs(),
-        permuted->get_values());
+        as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2401,8 +2424,9 @@ void row_permute(std::shared_ptr<const DpcppExecutor> exec,
     row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(), row_permuted->get_values());
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(),
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2427,8 +2451,9 @@ void inv_row_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(), row_permuted->get_values());
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(),
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2453,8 +2478,8 @@ void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_symm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), permuted->get_row_ptrs(),
-        permuted->get_col_idxs(), permuted->get_values());
+        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
+        permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2482,9 +2507,9 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_nonsymm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         row_scale, row_perm, col_scale, col_perm, orig->get_const_row_ptrs(),
-        orig->get_const_col_idxs(), orig->get_const_values(),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
         permuted->get_row_ptrs(), permuted->get_col_idxs(),
-        permuted->get_values());
+        as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2509,8 +2534,9 @@ void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(), row_permuted->get_values());
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(),
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2535,8 +2561,9 @@ void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        orig->get_const_values(), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(), row_permuted->get_values());
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(),
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2550,7 +2577,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     const auto num_rows = to_sort->get_size()[0];
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     auto cols = to_sort->get_col_idxs();
-    auto vals = to_sort->get_values();
+    auto vals = as_device_type(to_sort->get_values());
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
             const auto row = static_cast<size_type>(idx[0]);
@@ -2643,10 +2670,10 @@ void extract_diagonal(std::shared_ptr<const DpcppExecutor> exec,
     const auto num_blocks =
         ceildiv(config::warp_size * diag_size, default_block_size);
 
-    const auto orig_values = orig->get_const_values();
+    const auto orig_values = as_device_type(orig->get_const_values());
     const auto orig_row_ptrs = orig->get_const_row_ptrs();
     const auto orig_col_idxs = orig->get_const_col_idxs();
-    auto diag_values = diag->get_values();
+    auto diag_values = as_device_type(diag->get_values());
 
     kernel::extract_diagonal(num_blocks, default_block_size, 0,
                              exec->get_queue(), diag_size, nnz, orig_values,
@@ -2696,9 +2723,10 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
     const auto nblocks = ceildiv(nthreads, default_block_size);
     kernel::add_scaled_identity(
         nblocks, default_block_size, 0, exec->get_queue(),
-        alpha->get_const_values(), beta->get_const_values(),
+        as_device_type(alpha->get_const_values()),
+        as_device_type(beta->get_const_values()),
         static_cast<IndexType>(nrows), mtx->get_const_row_ptrs(),
-        mtx->get_const_col_idxs(), mtx->get_values());
+        mtx->get_const_col_idxs(), as_device_type(mtx->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index c6eb163bc7d..4e44edaef7e 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -22,6 +22,7 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
@@ -103,9 +104,9 @@ void transpose(sycl::queue* queue, const matrix::Dense<ValueType>* orig,
             uninitialized_array<ValueType, sg_size*(sg_size + 1)>, 0>
             space_acc_ct1(cgh);
         // Can not pass the member to device function directly
-        auto in = orig->get_const_values();
+        auto in = as_device_type(orig->get_const_values());
         auto in_stride = orig->get_stride();
-        auto out = trans->get_values();
+        auto out = as_device_type(trans->get_values());
         auto out_stride = trans->get_stride();
         cgh.parallel_for(
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
@@ -223,9 +224,11 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                 oneapi::mkl::blas::row_major::gemm(
                     *exec->get_queue(), transpose::nontrans,
                     transpose::nontrans, c->get_size()[0], c->get_size()[1],
-                    a->get_size()[1], one<ValueType>(), a->get_const_values(),
-                    a->get_stride(), b->get_const_values(), b->get_stride(),
-                    zero<ValueType>(), c->get_values(), c->get_stride());
+                    a->get_size()[1], one<ValueType>(),
+                    as_device_type(a->get_const_values()), a->get_stride(),
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    zero<ValueType>(), as_device_type(c->get_values()),
+                    c->get_stride());
             } else {
                 dense::fill(exec, c, zero<ValueType>());
             }
@@ -254,10 +257,10 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
                     transpose::nontrans, c->get_size()[0], c->get_size()[1],
                     a->get_size()[1],
                     exec->copy_val_to_host(alpha->get_const_values()),
-                    a->get_const_values(), a->get_stride(),
-                    b->get_const_values(), b->get_stride(),
+                    as_device_type(a->get_const_values()), a->get_stride(),
+                    as_device_type(b->get_const_values()), b->get_stride(),
                     exec->copy_val_to_host(beta->get_const_values()),
-                    c->get_values(), c->get_stride());
+                    as_device_type(c->get_values()), c->get_stride());
             } else {
                 dense::scale(exec, beta, c);
             }
@@ -278,12 +281,12 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
     const auto stride = source->get_stride();
 
     auto rows = result->get_row_idxs();
     auto cols = result->get_col_idxs();
-    auto vals = result->get_values();
+    auto vals = as_device_type(result->get_values());
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(num_rows, [=](sycl::item<1> item) {
@@ -313,12 +316,12 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
     const auto stride = source->get_stride();
 
     const auto row_ptrs = result->get_const_row_ptrs();
     auto cols = result->get_col_idxs();
-    auto vals = result->get_values();
+    auto vals = as_device_type(result->get_values());
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(num_rows, [=](sycl::item<1> item) {
@@ -348,11 +351,11 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
     const auto max_nnz_per_row = result->get_num_stored_elements_per_row();
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
     const auto in_stride = source->get_stride();
 
     auto cols = result->get_col_idxs();
-    auto vals = result->get_values();
+    auto vals = as_device_type(result->get_values());
     const auto stride = result->get_stride();
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
@@ -408,14 +411,14 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
     const auto ell_lim = result->get_ell_num_stored_elements_per_row();
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
     const auto in_stride = source->get_stride();
     const auto ell_stride = result->get_ell_stride();
     auto ell_cols = result->get_ell_col_idxs();
-    auto ell_vals = result->get_ell_values();
+    auto ell_vals = as_device_type(result->get_ell_values());
     auto coo_rows = result->get_coo_row_idxs();
     auto coo_cols = result->get_coo_col_idxs();
-    auto coo_vals = result->get_coo_values();
+    auto coo_vals = as_device_type(result->get_coo_values());
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(num_rows, [=](sycl::item<1> item) {
@@ -463,11 +466,11 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
     const auto stride = source->get_stride();
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
 
     const auto slice_sets = result->get_const_slice_sets();
     const auto slice_size = result->get_slice_size();
-    auto vals = result->get_values();
+    auto vals = as_device_type(result->get_values());
     auto col_idxs = result->get_col_idxs();
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
@@ -505,7 +508,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
-    const auto in_vals = source->get_const_values();
+    const auto in_vals = as_device_type(source->get_const_values());
     const auto stride = source->get_stride();
 
     const auto row_ptrs = result->get_const_row_ptrs();
@@ -571,9 +574,10 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto sg_size = DCFG_1D::decode<1>(cfg);
     dim3 grid(ceildiv(size[1], sg_size), ceildiv(size[0], sg_size));
     dim3 block(sg_size, sg_size);
-    kernel::conj_transpose_call(cfg, grid, block, 0, queue, size[0], size[1],
-                                orig->get_const_values(), orig->get_stride(),
-                                trans->get_values(), trans->get_stride());
+    kernel::conj_transpose_call(
+        cfg, grid, block, 0, queue, size[0], size[1],
+        as_device_type(orig->get_const_values()), orig->get_stride(),
+        as_device_type(trans->get_values()), trans->get_stride());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index 272a6dbd581..b377179183c 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -12,6 +12,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
@@ -70,9 +71,9 @@ void apply_to_csr(std::shared_ptr<const DpcppExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* c, bool inverse)
 {
     const auto num_rows = b->get_size()[0];
-    const auto diag_values = a->get_const_values();
+    const auto diag_values = as_device_type(a->get_const_values());
     c->copy_from(b);
-    auto csr_values = c->get_values();
+    auto csr_values = as_device_type(c->get_values());
     const auto csr_row_ptrs = c->get_const_row_ptrs();
 
     const auto grid_dim =
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index b33ed28b12d..e11123f84ce 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -15,6 +15,7 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 #include "accessor/reduced_row_major.hpp"
+#include "accessor/sycl_helper.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -23,6 +24,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/format_conversion.dp.hpp"
@@ -323,17 +325,20 @@ void abstract_spmv(syn::value_list<int, info>,
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<num_thread_per_worker, atomic>(
             grid_size, block_size, 0, exec->get_queue(), nrows,
-            num_worker_per_row, a_vals, a->get_const_col_idxs(), stride,
-            num_stored_elements_per_row, b_vals, c->get_values(),
+            num_worker_per_row, acc::as_device_range(a_vals),
+            a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+            acc::as_device_range(b_vals), as_device_type(c->get_values()),
             c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         const auto alpha_val = gko::acc::range<a_accessor>(
             std::array<acc::size_type, 1>{1}, alpha->get_const_values());
         kernel::spmv<num_thread_per_worker, atomic>(
             grid_size, block_size, 0, exec->get_queue(), nrows,
-            num_worker_per_row, alpha_val, a_vals, a->get_const_col_idxs(),
-            stride, num_stored_elements_per_row, b_vals,
-            beta->get_const_values(), c->get_values(), c->get_stride());
+            num_worker_per_row, acc::as_device_range(alpha_val),
+            acc::as_device_range(a_vals), a->get_const_col_idxs(), stride,
+            num_stored_elements_per_row, acc::as_device_range(b_vals),
+            as_device_type(beta->get_const_values()),
+            as_device_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 0e076794ac8..d81e2a721b3 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -206,14 +206,17 @@ void classical_spmv(syn::value_list<int, subgroup_size>,
     if (alpha == nullptr && beta == nullptr) {
         kernel::abstract_classical_spmv<subgroup_size>(
             grid, block, 0, exec->get_queue(), a->get_size()[0],
-            a->get_const_value(), a->get_const_col_idxs(),
-            a->get_const_row_ptrs(), b_vals, c_vals);
+            as_device_type(a->get_const_value()), a->get_const_col_idxs(),
+            a->get_const_row_ptrs(), acc::as_device_range(b_vals),
+            acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subgroup_size>(
             grid, block, 0, exec->get_queue(), a->get_size()[0],
-            alpha->get_const_values(), a->get_const_value(),
-            a->get_const_col_idxs(), a->get_const_row_ptrs(), b_vals,
-            beta->get_const_values(), c_vals);
+            as_device_type(alpha->get_const_values()),
+            as_device_type(a->get_const_value()), a->get_const_col_idxs(),
+            a->get_const_row_ptrs(), acc::as_device_range(b_vals),
+            as_device_type(beta->get_const_values()),
+            acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }

From 1341726df3f9c968dcbccde28130509a4a89f7d1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 13:31:14 +0200
Subject: [PATCH 409/448] factorization sycl type

---
 dpcpp/factorization/factorization_kernels.dp.cpp   |  2 +-
 dpcpp/factorization/par_ic_kernels.dp.cpp          | 14 +++++++-------
 dpcpp/factorization/par_ict_kernels.dp.cpp         | 13 +++++++------
 .../par_ilut_approx_filter_kernel.dp.cpp           |  4 ++--
 dpcpp/factorization/par_ilut_filter_kernel.dp.cpp  |  2 +-
 dpcpp/factorization/par_ilut_select_kernel.dp.cpp  |  2 +-
 dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp  |  8 ++++----
 dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp   |  9 +++++----
 8 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 24736f9e00c..99ca5b1c985 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -428,7 +428,7 @@ void add_diagonal_elements(std::shared_ptr<const DpcppExecutor> exec,
     array<bool> needs_change_device{exec, 1};
     needs_change_device = needs_change_host;
 
-    auto dpcpp_old_values = mtx->get_const_values();
+    auto dpcpp_old_values = as_device_type(mtx->get_const_values());
     auto dpcpp_old_col_idxs = mtx->get_const_col_idxs();
     auto dpcpp_old_row_ptrs = mtx->get_row_ptrs();
     auto dpcpp_row_ptrs_add = row_ptrs_addition.get_data();
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 91819dd98d0..7a978a19c55 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -125,7 +125,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     auto num_rows = l->get_size()[0];
     auto num_blocks = ceildiv(num_rows, default_block_size);
     auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_vals = l->get_values();
+    auto l_vals = as_device_type(l->get_values());
     kernel::ic_init(num_blocks, default_block_size, 0, exec->get_queue(),
                     l_row_ptrs, l_vals, num_rows);
 }
@@ -143,12 +143,12 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     auto nnz = l->get_num_stored_elements();
     auto num_blocks = ceildiv(nnz, default_block_size);
     for (size_type i = 0; i < iterations; ++i) {
-        kernel::ic_sweep(num_blocks, default_block_size, 0, exec->get_queue(),
-                         a_lower->get_const_row_idxs(),
-                         a_lower->get_const_col_idxs(),
-                         a_lower->get_const_values(), l->get_const_row_ptrs(),
-                         l->get_const_col_idxs(), l->get_values(),
-                         static_cast<IndexType>(l->get_num_stored_elements()));
+        kernel::ic_sweep(
+            num_blocks, default_block_size, 0, exec->get_queue(),
+            a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(),
+            a_lower->get_const_values(), l->get_const_row_ptrs(),
+            l->get_const_col_idxs(), as_device_type(l->get_values()),
+            static_cast<IndexType>(l->get_num_stored_elements()));
     }
 }
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index 6a704641252..65bfe4c1636 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -402,13 +402,13 @@ void add_candidates(syn::value_list<int, subgroup_size>,
     matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
     auto llh_row_ptrs = llh->get_const_row_ptrs();
     auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
+    auto llh_vals = as_device_type(llh->get_const_values());
     auto a_row_ptrs = a->get_const_row_ptrs();
     auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
+    auto a_vals = as_device_type(a->get_const_values());
     auto l_row_ptrs = l->get_const_row_ptrs();
     auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
+    auto l_vals = as_device_type(l->get_const_values());
     auto l_new_row_ptrs = l_new->get_row_ptrs();
     // count non-zeros per row
     kernel::ict_tri_spgeam_nnz<subgroup_size>(
@@ -450,9 +450,10 @@ void compute_factor(syn::value_list<int, subgroup_size>,
     auto num_blocks = ceildiv(total_nnz, block_size);
     kernel::ict_sweep<subgroup_size>(
         num_blocks, default_block_size, 0, exec->get_queue(),
-        a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(),
-        l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
-        l->get_const_col_idxs(), l->get_values(),
+        a->get_const_row_ptrs(), a->get_const_col_idxs(),
+        as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+        l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+        as_device_type(l->get_values()),
         static_cast<IndexType>(l->get_num_stored_elements()));
 }
 
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index 776ffba3fb1..c808f7e0ae8 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -58,7 +58,7 @@ void threshold_filter_approx(syn::value_list<int, subgroup_size>,
                              matrix::Csr<ValueType, IndexType>* m_out,
                              matrix::Coo<ValueType, IndexType>* m_out_coo)
 {
-    auto values = m->get_const_values();
+    auto values = as_device_type(m->get_const_values());
     IndexType size = m->get_num_stored_elements();
     using AbsType = remove_complex<ValueType>;
     constexpr auto bucket_count = kernel::searchtree_width;
@@ -102,7 +102,7 @@ void threshold_filter_approx(syn::value_list<int, subgroup_size>,
     // filter the elements
     auto old_row_ptrs = m->get_const_row_ptrs();
     auto old_col_idxs = m->get_const_col_idxs();
-    auto old_vals = m->get_const_values();
+    auto old_vals = as_device_type(m->get_const_values());
     // compute nnz for each row
     auto num_rows = static_cast<IndexType>(m->get_size()[0]);
     auto block_size = default_block_size / subgroup_size;
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 5ce9df8a0a9..732a8dc6135 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -57,7 +57,7 @@ void threshold_filter(syn::value_list<int, subgroup_size>,
 {
     auto old_row_ptrs = a->get_const_row_ptrs();
     auto old_col_idxs = a->get_const_col_idxs();
-    auto old_vals = a->get_const_values();
+    auto old_vals = as_device_type(a->get_const_values());
     // compute nnz for each row
     auto num_rows = static_cast<IndexType>(a->get_size()[0]);
     auto block_size = default_block_size / subgroup_size;
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index 589f8267f21..43c13fc730b 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -61,7 +61,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
                       array<remove_complex<ValueType>>& tmp2,
                       remove_complex<ValueType>& threshold)
 {
-    auto values = m->get_const_values();
+    auto values = as_device_type(m->get_const_values());
     IndexType size = m->get_num_stored_elements();
     using AbsType = remove_complex<ValueType>;
     constexpr auto bucket_count = kernel::searchtree_width;
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index 246228763bf..f9643fbe66b 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -356,16 +356,16 @@ void add_candidates(syn::value_list<int, subgroup_size>,
     matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
     auto lu_row_ptrs = lu->get_const_row_ptrs();
     auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
+    auto lu_vals = as_device_type(lu->get_const_values());
     auto a_row_ptrs = a->get_const_row_ptrs();
     auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
+    auto a_vals = as_device_type(a->get_const_values());
     auto l_row_ptrs = l->get_const_row_ptrs();
     auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
+    auto l_vals = as_device_type(l->get_const_values());
     auto u_row_ptrs = u->get_const_row_ptrs();
     auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
+    auto u_vals = as_device_type(u->get_const_values());
     auto l_new_row_ptrs = l_new->get_row_ptrs();
     auto u_new_row_ptrs = u_new->get_row_ptrs();
     // count non-zeros per row
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index 601e5dc12d3..4644bb155d2 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -176,12 +176,13 @@ void compute_l_u_factors(syn::value_list<int, subgroup_size>,
     auto num_blocks = ceildiv(total_nnz, block_size);
     kernel::sweep<subgroup_size>(
         num_blocks, default_block_size, 0, exec->get_queue(),
-        a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(),
-        l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
-        l->get_const_col_idxs(), l->get_values(),
+        a->get_const_row_ptrs(), a->get_const_col_idxs(),
+        as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+        l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+        as_device_type(l->get_values()),
         static_cast<IndexType>(l->get_num_stored_elements()),
         u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
-        u->get_values(), u_csc->get_const_row_ptrs(),
+        as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
         u_csc->get_const_col_idxs(), u_csc->get_values(),
         static_cast<IndexType>(u->get_num_stored_elements()));
 }

From 022f4fb3a3364fb4eb7b320278c65f012f5c9da8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 25 Oct 2024 13:31:23 +0200
Subject: [PATCH 410/448] solver/preconditioner/stop sycl type

---
 dpcpp/preconditioner/isai_kernels.dp.cpp      | 32 ++++----
 .../jacobi_advanced_apply_kernel.dp.cpp       |  7 +-
 .../jacobi_generate_instantiate.inc.dp.cpp    |  9 +-
 .../jacobi_simple_apply_kernel.dp.cpp         |  4 +-
 dpcpp/solver/cb_gmres_kernels.dp.cpp          | 24 +++---
 dpcpp/solver/idr_kernels.dp.cpp               | 82 +++++++++++--------
 dpcpp/stop/residual_norm_kernels.dp.cpp       |  4 +-
 7 files changed, 91 insertions(+), 71 deletions(-)

diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 8d5429b088a..f7d417380e4 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -626,16 +626,20 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
             kernel::generate_l_inverse<subwarp_size, subwarps_per_block>(
                 grid, block, 0, exec->get_queue(),
                 static_cast<IndexType>(num_rows), input->get_const_row_ptrs(),
-                input->get_const_col_idxs(), input->get_const_values(),
+                input->get_const_col_idxs(),
+                as_device_type(input->get_const_values()),
                 inverse->get_row_ptrs(), inverse->get_col_idxs(),
-                inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs);
+                as_device_type(inverse->get_values()), excess_rhs_ptrs,
+                excess_nz_ptrs);
         } else {
             kernel::generate_u_inverse<subwarp_size, subwarps_per_block>(
                 grid, block, 0, exec->get_queue(),
                 static_cast<IndexType>(num_rows), input->get_const_row_ptrs(),
-                input->get_const_col_idxs(), input->get_const_values(),
+                input->get_const_col_idxs(),
+                as_device_type(input->get_const_values()),
                 inverse->get_row_ptrs(), inverse->get_col_idxs(),
-                inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs);
+                as_device_type(inverse->get_values()), excess_rhs_ptrs,
+                excess_nz_ptrs);
         }
     }
     components::prefix_sum_nonnegative(exec, excess_rhs_ptrs, num_rows + 1);
@@ -661,9 +665,9 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
         kernel::generate_general_inverse<subwarp_size, subwarps_per_block>(
             grid, block, 0, exec->get_queue(), static_cast<IndexType>(num_rows),
             input->get_const_row_ptrs(), input->get_const_col_idxs(),
-            input->get_const_values(), inverse->get_row_ptrs(),
-            inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs,
-            excess_nz_ptrs, spd);
+            as_device_type(input->get_const_values()), inverse->get_row_ptrs(),
+            inverse->get_col_idxs(), as_device_type(inverse->get_values()),
+            excess_rhs_ptrs, excess_nz_ptrs, spd);
     }
     components::prefix_sum_nonnegative(exec, excess_rhs_ptrs, num_rows + 1);
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
@@ -691,11 +695,11 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
         kernel::generate_excess_system<subwarp_size>(
             grid, block, 0, exec->get_queue(), static_cast<IndexType>(num_rows),
             input->get_const_row_ptrs(), input->get_const_col_idxs(),
-            input->get_const_values(), inverse->get_const_row_ptrs(),
-            inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs,
-            excess_system->get_row_ptrs(), excess_system->get_col_idxs(),
-            excess_system->get_values(), excess_rhs->get_values(), e_start,
-            e_end);
+            as_device_type(input->get_const_values()),
+            inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(),
+            excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(),
+            excess_system->get_col_idxs(), excess_system->get_values(),
+            excess_rhs->get_values(), e_start, e_end);
     }
 }
 
@@ -737,8 +741,8 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
         kernel::copy_excess_solution<subwarp_size>(
             grid, block, 0, exec->get_queue(), static_cast<IndexType>(num_rows),
             inverse->get_const_row_ptrs(), excess_rhs_ptrs,
-            excess_solution->get_const_values(), inverse->get_values(), e_start,
-            e_end);
+            excess_solution->get_const_values(),
+            as_device_type(inverse->get_values()), e_start, e_end);
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 72a32c2d5cb..9c9d049668f 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -59,9 +59,10 @@ void apply(std::shared_ptr<const DpcppExecutor> exec, size_type num_blocks,
             syn::value_list<int, config::min_warps_per_block>(),
             syn::type_list<>(), exec, num_blocks,
             block_precisions.get_const_data(), block_pointers.get_const_data(),
-            blocks.get_const_data(), storage_scheme, alpha->get_const_values(),
-            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
-            x->get_stride());
+            blocks.get_const_data(), storage_scheme,
+            as_device_type(alpha->get_const_values()),
+            as_device_type(b->get_const_values()) + col, b->get_stride(),
+            as_device_type(x->get_values()) + col, x->get_stride());
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index fe0973a9f21..a43728d7e2b 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -365,14 +365,15 @@ void generate(syn::value_list<int, max_block_size>,
                                   warps_per_block>(
             grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0],
             mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            mtx->get_const_values(), accuracy, block_data, storage_scheme,
-            conditioning, block_precisions, block_ptrs, num_blocks);
+            as_device_type(mtx->get_const_values()), accuracy, block_data,
+            storage_scheme, conditioning, block_precisions, block_ptrs,
+            num_blocks);
     } else {
         kernel::generate<max_block_size, subwarp_size, warps_per_block>(
             grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0],
             mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            mtx->get_const_values(), block_data, storage_scheme, block_ptrs,
-            num_blocks);
+            as_device_type(mtx->get_const_values()), block_data, storage_scheme,
+            block_ptrs, num_blocks);
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index 3d6ebe76226..facb9ade95c 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -56,8 +56,8 @@ void simple_apply(
             syn::type_list<>(), exec, num_blocks,
             block_precisions.get_const_data(), block_pointers.get_const_data(),
             blocks.get_const_data(), storage_scheme,
-            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
-            x->get_stride());
+            as_device_type(b->get_const_values()) + col, b->get_stride(),
+            as_device_type(x->get_values()) + col, x->get_stride());
     }
 }
 
diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index e3424944309..43a50310b8f 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -939,11 +939,11 @@ void initialize(std::shared_ptr<const DpcppExecutor> exec,
 
     initialize_kernel<block_size>(
         grid_dim, block_dim, 0, exec->get_queue(), b->get_size()[0],
-        b->get_size()[1], krylov_dim, b->get_const_values(), b->get_stride(),
-        residual->get_values(), residual->get_stride(),
-        givens_sin->get_values(), givens_sin->get_stride(),
-        givens_cos->get_values(), givens_cos->get_stride(),
-        stop_status->get_data());
+        b->get_size()[1], krylov_dim, as_device_type(b->get_const_values()),
+        b->get_stride(), as_device_type(residual->get_values()),
+        residual->get_stride(), givens_sin->get_values(),
+        givens_sin->get_stride(), givens_cos->get_values(),
+        givens_cos->get_stride(), stop_status->get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
@@ -990,7 +990,8 @@ void restart(std::shared_ptr<const DpcppExecutor> exec,
         const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
         multinorminf_without_stop_kernel(
             grid_size_nrm, block_size_nrm, 0, exec->get_queue(), num_rows,
-            num_rhs, residual->get_const_values(), residual->get_stride(),
+            num_rhs, as_device_type(residual->get_const_values()),
+            residual->get_stride(),
             arnoldi_norm->get_values() + 2 * stride_arnoldi, 0);
     }
 
@@ -1009,7 +1010,7 @@ void restart(std::shared_ptr<const DpcppExecutor> exec,
         1, 1);
     restart_2_kernel<block_size>(
         grid_dim_2, block_dim, 0, exec->get_queue(), residual->get_size()[0],
-        residual->get_size()[1], residual->get_const_values(),
+        residual->get_size()[1], as_device_type(residual->get_const_values()),
         residual->get_stride(), residual_norm->get_const_values(),
         residual_norm_collection->get_values(), krylov_bases,
         next_krylov_basis->get_values(), next_krylov_basis->get_stride(),
@@ -1255,9 +1256,10 @@ void solve_upper_triangular(
     solve_upper_triangular_kernel<block_size>(
         grid_dim, block_dim, 0, exec->get_queue(), hessenberg->get_size()[1],
         num_rhs, residual_norm_collection->get_const_values(),
-        residual_norm_collection->get_stride(), hessenberg->get_const_values(),
-        hessenberg->get_stride(), y->get_values(), y->get_stride(),
-        final_iter_nums->get_const_data());
+        residual_norm_collection->get_stride(),
+        as_device_type(hessenberg->get_const_values()),
+        hessenberg->get_stride(), as_device_type(y->get_values()),
+        y->get_stride(), final_iter_nums->get_const_data());
 }
 
 
@@ -1283,7 +1285,7 @@ void calculate_qy(std::shared_ptr<const DpcppExecutor> exec,
 
     calculate_Qy_kernel<block_size>(
         grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols,
-        krylov_bases, y->get_const_values(), y->get_stride(),
+        krylov_bases, as_device_type(y->get_const_values()), y->get_stride(),
         before_preconditioner->get_values(), stride_before_preconditioner,
         final_iter_nums->get_const_data());
     // Calculate qy
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index 29cdd70cd64..83f17c07723 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -587,8 +587,8 @@ void initialize_m(std::shared_ptr<const DpcppExecutor> exec,
 
     const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size);
     initialize_m_kernel(grid_dim, default_block_size, 0, exec->get_queue(),
-                        subspace_dim, nrhs, m->get_values(), m_stride,
-                        stop_status->get_data());
+                        subspace_dim, nrhs, as_device_type(m->get_values()),
+                        m_stride, stop_status->get_data());
 }
 
 
@@ -645,8 +645,9 @@ void solve_lower_triangular(std::shared_ptr<const DpcppExecutor> exec,
     const auto grid_dim = ceildiv(nrhs, default_block_size);
     solve_lower_triangular_kernel(
         grid_dim, default_block_size, 0, exec->get_queue(), subspace_dim, nrhs,
-        m->get_const_values(), m->get_stride(), f->get_const_values(),
-        f->get_stride(), c->get_values(), c->get_stride(),
+        as_device_type(m->get_const_values()), m->get_stride(),
+        as_device_type(f->get_const_values()), f->get_stride(),
+        as_device_type(c->get_values()), c->get_stride(),
         stop_status->get_const_data());
 }
 
@@ -669,30 +670,34 @@ void update_g_and_u(std::shared_ptr<const DpcppExecutor> exec,
     const dim3 block_dim(default_dot_dim, default_dot_dim);
 
     for (size_type i = 0; i < k; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
+        const auto p_i = as_device_type(p->get_const_values()) + i * p_stride;
         if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, alpha->get_values(), nrhs,
-                                   zero<ValueType>());
+            components::fill_array(exec, as_device_type(alpha->get_values()),
+                                   nrhs, zero<ValueType>());
             multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
                             nrhs, p_i, g_k->get_values(), g_k->get_stride(),
-                            alpha->get_values(), stop_status->get_const_data());
+                            as_device_type(alpha->get_values()),
+                            stop_status->get_const_data());
         } else {
             onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_values(),
-                        g_k->get_stride(), alpha->get_values());
+                        g_k->get_stride(), as_device_type(alpha->get_values()));
         }
         update_g_k_and_u_kernel<default_block_size>(
             ceildiv(size * g_k->get_stride(), default_block_size),
             default_block_size, 0, exec->get_queue(), k, i, size, nrhs,
-            alpha->get_const_values(), m->get_const_values(), m->get_stride(),
-            g->get_const_values(), g->get_stride(), g_k->get_values(),
-            g_k->get_stride(), u->get_values(), u->get_stride(),
+            as_device_type(alpha->get_const_values()),
+            as_device_type(m->get_const_values()), m->get_stride(),
+            as_device_type(g->get_const_values()), g->get_stride(),
+            g_k->get_values(), g_k->get_stride(),
+            as_device_type(u->get_values()), u->get_stride(),
             stop_status->get_const_data());
     }
     update_g_kernel<default_block_size>(
         ceildiv(size * g_k->get_stride(), default_block_size),
         default_block_size, 0, exec->get_queue(), k, size, nrhs,
-        g_k->get_const_values(), g_k->get_stride(), g->get_values(),
-        g->get_stride(), stop_status->get_const_data());
+        g_k->get_const_values(), g_k->get_stride(),
+        as_device_type(g->get_values()), g->get_stride(),
+        stop_status->get_const_data());
 }
 
 
@@ -712,8 +717,8 @@ void update_m(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     const dim3 block_dim(default_dot_dim, default_dot_dim);
 
     for (size_type i = k; i < subspace_dim; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
-        auto m_i = m->get_values() + i * m_stride + k * nrhs;
+        const auto p_i = as_device_type(p->get_const_values()) + i * p_stride;
+        auto m_i = as_device_type(m->get_values()) + i * m_stride + k * nrhs;
         if (nrhs > 1 || is_complex<ValueType>()) {
             components::fill_array(exec, m_i, nrhs, zero<ValueType>());
             multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
@@ -742,15 +747,18 @@ void update_x_r_and_f(std::shared_ptr<const DpcppExecutor> exec,
     const auto subspace_dim = m->get_size()[0];
 
     const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size);
-    update_x_r_and_f_kernel(grid_dim, default_block_size, 0, exec->get_queue(),
-                            k, size, subspace_dim, nrhs, m->get_const_values(),
-                            m->get_stride(), g->get_const_values(),
-                            g->get_stride(), u->get_const_values(),
-                            u->get_stride(), f->get_values(), f->get_stride(),
-                            r->get_values(), r->get_stride(), x->get_values(),
-                            x->get_stride(), stop_status->get_const_data());
-    components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
-                           zero<ValueType>());
+    update_x_r_and_f_kernel(
+        grid_dim, default_block_size, 0, exec->get_queue(), k, size,
+        subspace_dim, nrhs, as_device_type(m->get_const_values()),
+        m->get_stride(), as_device_type(g->get_const_values()), g->get_stride(),
+        as_device_type(u->get_const_values()), u->get_stride(),
+        as_device_type(f->get_values()), f->get_stride(),
+        as_device_type(r->get_values()), r->get_stride(),
+        as_device_type(x->get_values()), x->get_stride(),
+        stop_status->get_const_data());
+    components::fill_array(
+        exec, as_device_type(f->get_values()) + k * f->get_stride(), nrhs,
+        zero<ValueType>());
 }
 
 
@@ -788,11 +796,12 @@ void step_1(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
 
     const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
     step_1_kernel(grid_dim, default_block_size, 0, exec->get_queue(), k,
-                  num_rows, subspace_dim, nrhs, residual->get_const_values(),
-                  residual->get_stride(), c->get_const_values(),
-                  c->get_stride(), g->get_const_values(), g->get_stride(),
-                  v->get_values(), v->get_stride(),
-                  stop_status->get_const_data());
+                  num_rows, subspace_dim, nrhs,
+                  as_device_type(residual->get_const_values()),
+                  residual->get_stride(), as_device_type(c->get_const_values()),
+                  c->get_stride(), as_device_type(g->get_const_values()),
+                  g->get_stride(), as_device_type(v->get_values()),
+                  v->get_stride(), stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
@@ -813,10 +822,12 @@ void step_2(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
 
     const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
     step_2_kernel(grid_dim, default_block_size, 0, exec->get_queue(), k,
-                  num_rows, subspace_dim, nrhs, omega->get_const_values(),
+                  num_rows, subspace_dim, nrhs,
+                  as_device_type(omega->get_const_values()),
                   preconditioned_vector->get_const_values(),
-                  preconditioned_vector->get_stride(), c->get_const_values(),
-                  c->get_stride(), u->get_values(), u->get_stride(),
+                  preconditioned_vector->get_stride(),
+                  as_device_type(c->get_const_values()), c->get_stride(),
+                  as_device_type(u->get_values()), u->get_stride(),
                   stop_status->get_const_data());
 }
 
@@ -849,8 +860,9 @@ void compute_omega(
 {
     const auto grid_dim = ceildiv(nrhs, config::warp_size);
     compute_omega_kernel(grid_dim, config::warp_size, 0, exec->get_queue(),
-                         nrhs, kappa, tht->get_const_values(),
-                         residual_norm->get_const_values(), omega->get_values(),
+                         nrhs, kappa, as_device_type(tht->get_const_values()),
+                         residual_norm->get_const_values(),
+                         as_device_type(omega->get_values()),
                          stop_status->get_const_data());
 }
 
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 23d62e83729..02887d3d176 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -46,7 +46,7 @@ void residual_norm(std::shared_ptr<const DpcppExecutor> exec,
     });
 
     auto orig_tau_val = orig_tau->get_const_values();
-    auto tau_val = tau->get_const_values();
+    auto tau_val = as_device_type(tau->get_const_values());
     auto stop_status_val = stop_status->get_data();
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
@@ -102,7 +102,7 @@ void implicit_residual_norm(
     });
 
     auto orig_tau_val = orig_tau->get_const_values();
-    auto tau_val = tau->get_const_values();
+    auto tau_val = as_device_type(tau->get_const_values());
     auto stop_status_val = stop_status->get_data();
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(

From 7541753be59d3ac0635427289ecdb3e73ad62e8b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 19 Nov 2024 11:13:32 +0100
Subject: [PATCH 411/448] dpcpp factorization: include dpcpp/base/types.hpp

---
 dpcpp/factorization/factorization_kernels.dp.cpp         | 1 +
 dpcpp/factorization/par_ic_kernels.dp.cpp                | 1 +
 dpcpp/factorization/par_ict_kernels.dp.cpp               | 1 +
 dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp | 1 +
 dpcpp/factorization/par_ilut_filter_kernel.dp.cpp        | 1 +
 dpcpp/factorization/par_ilut_select_kernel.dp.cpp        | 1 +
 dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp        | 1 +
 dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp         | 1 +
 8 files changed, 8 insertions(+)

diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 99ca5b1c985..2b2858ff789 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -14,6 +14,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/searching.dp.hpp"
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 7a978a19c55..ac143a37465 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index 65bfe4c1636..4325c7e8818 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -20,6 +20,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/merging.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index c808f7e0ae8..12b74a3c0f3 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -21,6 +21,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 732a8dc6135..02d2f479283 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -18,6 +18,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index 43c13fc730b..fd76f3246b8 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -14,6 +14,7 @@
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index f9643fbe66b..23309290e68 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -19,6 +19,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/merging.dp.hpp"
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index 4644bb155d2..4ef3a473f35 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -17,6 +17,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/merging.dp.hpp"

From d6c9e9de86fcd976f92a9eaabc151884f16d4a7a Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 19 Nov 2024 11:14:20 +0100
Subject: [PATCH 412/448] dpcpp matrix: fix as_deivice_type typo and add missing includes

---
 dpcpp/matrix/coo_kernels.dp.cpp          |  1 +
 dpcpp/matrix/csr_kernels.dp.cpp          | 44 ++++++++++++------------
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp |  2 ++
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index c8f79968577..7782313e42e 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -16,6 +16,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/format_conversion.dp.hpp"
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 9085dd9140e..38a83676e3d 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1888,8 +1888,8 @@ auto spgemm_multiway_merge(size_type row,
                            const typename HeapElement::index_type* b_cols,
                            const typename HeapElement::value_type* b_vals,
                            HeapElement* heap, InitCallback init_cb,
-                           StepCallback step_cb,
-                           ColCallback col_cb) -> decltype(init_cb(0))
+                           StepCallback step_cb, ColCallback col_cb)
+    -> decltype(init_cb(0))
 {
     auto a_begin = a_row_ptrs[row];
     auto a_end = a_row_ptrs[row + 1];
@@ -2371,7 +2371,7 @@ void inv_symm_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_symm_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), permuted->get_row_ptrs(),
+        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
         permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
@@ -2397,9 +2397,9 @@ void inv_nonsymm_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_nonsymm_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         row_perm, col_perm, orig->get_const_row_ptrs(),
-        orig->get_const_col_idxs(), as_deivice_type(orig->get_const_values()),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
         permuted->get_row_ptrs(), permuted->get_col_idxs(),
-        as_deivice_type(permuted->get_values()));
+        as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2424,9 +2424,9 @@ void row_permute(std::shared_ptr<const DpcppExecutor> exec,
     row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
         row_permuted->get_col_idxs(),
-        as_deivice_type(row_permuted->get_values()));
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2451,9 +2451,9 @@ void inv_row_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
         row_permuted->get_col_idxs(),
-        as_deivice_type(row_permuted->get_values()));
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2478,8 +2478,8 @@ void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_symm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), permuted->get_row_ptrs(),
-        permuted->get_col_idxs(), as_deivice_type(permuted->get_values()));
+        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
+        permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2507,9 +2507,9 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_nonsymm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         row_scale, row_perm, col_scale, col_perm, orig->get_const_row_ptrs(),
-        orig->get_const_col_idxs(), as_deivice_type(orig->get_const_values()),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
         permuted->get_row_ptrs(), permuted->get_col_idxs(),
-        as_deivice_type(permuted->get_values()));
+        as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2534,9 +2534,9 @@ void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
         row_permuted->get_col_idxs(),
-        as_deivice_type(row_permuted->get_values()));
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2561,9 +2561,9 @@ void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
     inv_row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_deivice_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
+        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
         row_permuted->get_col_idxs(),
-        as_deivice_type(row_permuted->get_values()));
+        as_device_type(row_permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2577,7 +2577,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     const auto num_rows = to_sort->get_size()[0];
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     auto cols = to_sort->get_col_idxs();
-    auto vals = as_deivice_type(to_sort->get_values());
+    auto vals = as_device_type(to_sort->get_values());
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
             const auto row = static_cast<size_type>(idx[0]);
@@ -2723,10 +2723,10 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
     const auto nblocks = ceildiv(nthreads, default_block_size);
     kernel::add_scaled_identity(
         nblocks, default_block_size, 0, exec->get_queue(),
-        as_deivice_type(alpha->get_const_values()),
-        as_deivice_type(beta->get_const_values()),
-        static_cast<IndexType>(nrows), mtx->get_const_row_ptrs(),
-        mtx->get_const_col_idxs(), as_deivice_type(mtx->get_values()));
+        as_device_type(alpha->get_const_values()),
+        as_device_type(beta->get_const_values()), static_cast<IndexType>(nrows),
+        mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+        as_device_type(mtx->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index d81e2a721b3..9d7f76ea7a6 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -9,10 +9,12 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 #include "accessor/reduced_row_major.hpp"
+#include "accessor/sycl_helper.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"

From 082c1c268048be0af8e60efc1b893def5e790a84 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 19 Nov 2024 11:15:01 +0100
Subject: [PATCH 413/448] preconditioner

---
 dpcpp/preconditioner/isai_kernels.dp.cpp                    | 1 +
 dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp    | 1 +
 dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp | 1 +
 dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp      | 2 +-
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index f7d417380e4..96a10a2fd44 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -15,6 +15,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/merging.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 9c9d049668f..4ae013b1141 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -7,6 +7,7 @@
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
 
 
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index a43728d7e2b..adf8452fe4b 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -15,6 +15,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/diagonal_block_manipulation.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index facb9ade95c..bf2fb49d2ae 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -7,9 +7,9 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/preconditioner/jacobi_common.hpp"
 
-
 namespace gko {
 namespace kernels {
 namespace dpcpp {

From 6fb0afee36a6c23ccb3c20809f1cbe7a90540895 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 19 Nov 2024 11:15:11 +0100
Subject: [PATCH 414/448] solver

---
 dpcpp/solver/cb_gmres_kernels.dp.cpp | 1 +
 dpcpp/solver/idr_kernels.dp.cpp      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index 43a50310b8f..226d1a41c90 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -22,6 +22,7 @@
 #include "core/solver/cb_gmres_accessor.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index 83f17c07723..319dff53771 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -19,6 +19,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"

From d6da5be7a9b785cfbaf28f5df2324b18f64c4590 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 19 Nov 2024 11:15:15 +0100
Subject: [PATCH 415/448] stop

---
 dpcpp/stop/residual_norm_kernels.dp.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 02887d3d176..3da9bfd3a75 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -12,6 +12,7 @@
 
 #include "core/base/array_access.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
 

From 5122f539a881d00d51872b842685d6375c52c7e3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Nov 2024 18:25:25 +0100
Subject: [PATCH 416/448] sycl half

---
 CMakeLists.txt                                |   6 +-
 accessor/sycl_helper.hpp                      |  12 +-
 common/unified/base/kernel_launch.hpp         |   4 +-
 .../precision_conversion_kernels.cpp          |   4 +-
 .../unified/matrix/dense_kernels.template.cpp |   6 +-
 core/solver/batch_dispatch.hpp                |  19 +-
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  |   2 +
 dpcpp/base/batch_struct.hpp                   |  13 +-
 dpcpp/base/device_matrix_data_kernels.dp.cpp  |   2 +-
 dpcpp/base/kernel_launch_reduction.dp.hpp     |   1 +
 dpcpp/base/math.hpp                           | 236 ++++++++++++++++++
 dpcpp/base/types.hpp                          |   7 +
 dpcpp/components/atomic.dp.hpp                |  31 ++-
 dpcpp/factorization/cholesky_kernels.dp.cpp   |  12 +-
 .../factorization_kernels.dp.cpp              |  27 +-
 dpcpp/factorization/ic_kernels.dp.cpp         |   2 +-
 dpcpp/factorization/ilu_kernels.dp.cpp        |   2 +-
 dpcpp/factorization/lu_kernels.dp.cpp         |   6 +-
 dpcpp/factorization/par_ic_kernels.dp.cpp     |  14 +-
 dpcpp/factorization/par_ict_kernels.dp.cpp    |   3 +-
 dpcpp/factorization/par_ilu_kernels.dp.cpp    |  11 +-
 .../par_ilut_approx_filter_kernel.dp.cpp      |   7 +-
 .../par_ilut_filter_kernel.dp.cpp             |  10 +-
 dpcpp/factorization/par_ilut_kernels.dp.cpp   |  10 +-
 .../par_ilut_select_common.dp.cpp             |   2 +-
 .../par_ilut_select_kernel.dp.cpp             |   5 +-
 .../par_ilut_spgeam_kernel.dp.cpp             |   6 +-
 .../par_ilut_sweep_kernel.dp.cpp              |   5 +-
 dpcpp/matrix/batch_csr_kernels.dp.cpp         |   6 +-
 dpcpp/matrix/batch_dense_kernels.dp.cpp       |   6 +-
 dpcpp/matrix/batch_ell_kernels.dp.cpp         |   6 +-
 dpcpp/matrix/batch_struct.hpp                 |  35 +--
 dpcpp/matrix/coo_kernels.dp.cpp               |   1 +
 dpcpp/matrix/csr_kernels.dp.cpp               |  28 ++-
 dpcpp/matrix/dense_kernels.dp.cpp             |  10 +-
 dpcpp/matrix/ell_kernels.dp.cpp               |   1 +
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp      |   7 +-
 dpcpp/preconditioner/isai_kernels.dp.cpp      |  10 +-
 ...cobi_advanced_apply_instantiate.inc.dp.cpp |  12 +-
 .../jacobi_advanced_apply_kernel.dp.cpp       |   7 +-
 .../jacobi_generate_instantiate.inc.dp.cpp    |  10 +-
 ...jacobi_simple_apply_instantiate.inc.dp.cpp |  11 +-
 .../jacobi_simple_apply_kernel.dp.cpp         |   4 +-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |  40 +--
 dpcpp/solver/batch_bicgstab_launch.hpp        |  23 +-
 .../batch_bicgstab_launch.instantiate.dp.cpp  |  21 +-
 dpcpp/solver/batch_cg_kernels.dp.cpp          |  22 +-
 dpcpp/solver/batch_cg_launch.hpp              |  35 +--
 .../solver/batch_cg_launch.instantiate.dp.cpp |  33 +--
 dpcpp/solver/idr_kernels.dp.cpp               |  67 +++--
 dpcpp/stop/residual_norm_kernels.dp.cpp       |   1 +
 51 files changed, 601 insertions(+), 260 deletions(-)
 create mode 100644 dpcpp/base/math.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fea0c3efd40..90c1c3ba4e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,9 +33,9 @@ option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be tim
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
-# We do not support MSVC. SYCL will come later
-if(MSVC OR GINKGO_BUILD_SYCL)
-    message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
+# We do not support MSVC.
+if(MSVC)
+    message(STATUS "HALF is not supported in MSVC")
     set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
 endif()
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
diff --git a/accessor/sycl_helper.hpp b/accessor/sycl_helper.hpp
index 793587c30d3..0de68a25c97 100644
--- a/accessor/sycl_helper.hpp
+++ b/accessor/sycl_helper.hpp
@@ -16,15 +16,15 @@
 #include "utils.hpp"
 
 
-namespace sycl {
-inline namespace _V1 {
+// namespace sycl {
+// inline namespace _V1 {
 
 
-class half;
+// class half;
 
 
-}
-}  // namespace sycl
+// }
+// }  // namespace sycl
 
 
 namespace gko {
@@ -181,7 +181,7 @@ GKO_ACC_INLINE auto as_sycl_range(const range<row_major<T, dim>>& r)
 template <typename AccType>
 GKO_ACC_INLINE auto as_device_range(AccType&& acc)
 {
-    return as_device_range(std::forward<AccType>(acc));
+    return as_sycl_range(std::forward<AccType>(acc));
 }
 
 
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 248c4671623..0dd2d86e67e 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -73,10 +73,12 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 #define GKO_KERNEL
 
 
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-#include "dpcpp/base/types.hpp"
 
 
 template <typename T>
diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp
index 94a8d4e4d0f..46d14a7ef17 100644
--- a/common/unified/components/precision_conversion_kernels.cpp
+++ b/common/unified/components/precision_conversion_kernels.cpp
@@ -19,7 +19,9 @@ void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
 {
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto idx, auto in, auto out) { out[idx] = in[idx]; },
+        [] GKO_KERNEL(auto idx, auto in, auto out) {
+            out[idx] = static_cast<device_type<TargetType>>(in[idx]);
+        },
         size, in, out);
 }
 
diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp
index 16630233578..9564f82e8f6 100644
--- a/common/unified/matrix/dense_kernels.template.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -33,7 +33,8 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto col, auto input, auto output) {
-            output(row, col) = input(row, col);
+            output(row, col) =
+                static_cast<device_type<OutValueType>>(input(row, col));
         },
         input->get_size(), input, output);
 }
@@ -425,7 +426,8 @@ void row_gather(std::shared_ptr<const DefaultExecutor> exec,
     run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) {
-            gathered(row, col) = orig(rows[row], col);
+            gathered(row, col) =
+                static_cast<device_type<OutputType>>(orig(rows[row], col));
         },
         row_collection->get_size(), orig, row_idxs, row_collection);
 }
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 570b717d7d6..33d3c3938e1 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -86,23 +86,6 @@ using DeviceValueType = gko::kernels::hip::hip_type<ValueType>;
 #include "dpcpp/stop/batch_criteria.hpp"
 
 
-namespace gko {
-namespace kernels {
-namespace dpcpp {
-
-
-template <typename T>
-inline std::decay_t<T> as_device_type(T val)
-{
-    return val;
-}
-
-
-}  // namespace dpcpp
-}  // namespace kernels
-}  // namespace gko
-
-
 namespace gko {
 namespace batch {
 namespace solver {
@@ -112,7 +95,7 @@ namespace device = gko::kernels::dpcpp;
 
 
 template <typename ValueType>
-using DeviceValueType = ValueType;
+using DeviceValueType = gko::kernels::dpcpp::sycl_type<ValueType>;
 
 
 }  // namespace solver
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 6f1f3467e4a..7c49af8d9e8 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -21,6 +21,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp
index 9c20a8574ef..ef32ffd593b 100644
--- a/dpcpp/base/batch_struct.hpp
+++ b/dpcpp/base/batch_struct.hpp
@@ -11,6 +11,7 @@
 
 #include "core/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
+#include "dpcpp/base/types.hpp"
 
 
 namespace gko {
@@ -32,10 +33,10 @@ namespace dpcpp {
  * Generates an immutable uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const ValueType> get_batch_struct(
-    const batch::MultiVector<ValueType>* const op)
+inline batch::multi_vector::uniform_batch<const device_type<ValueType>>
+get_batch_struct(const batch::MultiVector<ValueType>* const op)
 {
-    return {op->get_const_values(), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -46,10 +47,10 @@ inline batch::multi_vector::uniform_batch<const ValueType> get_batch_struct(
  * Generates a uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<ValueType> get_batch_struct(
-    batch::MultiVector<ValueType>* const op)
+inline batch::multi_vector::uniform_batch<device_type<ValueType>>
+get_batch_struct(batch::MultiVector<ValueType>* const op)
 {
-    return {op->get_values(), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index f676e09321a..ab9cf9ebc7a 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -97,7 +97,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index f45a92269a5..73b1e82314d 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -13,6 +13,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp
new file mode 100644
index 00000000000..0588f844d83
--- /dev/null
+++ b/dpcpp/base/math.hpp
@@ -0,0 +1,236 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_DPCPP_BASE_MATH_HPP_
+#define GKO_DPCPP_BASE_MATH_HPP_
+
+#include <climits>
+#include <cmath>
+
+#include <sycl/half_type.hpp>
+
+#include <ginkgo/core/base/math.hpp>
+
+#include "dpcpp/base/dpct.hpp"
+
+
+namespace std {
+
+
+template <>
+class complex<sycl::half> {
+public:
+    using value_type = sycl::half;
+
+    complex(const value_type& real = value_type(0.f),
+            const value_type& imag = value_type(0.f))
+        : real_(real), imag_(imag)
+    {}
+
+    template <typename T, typename U,
+              typename = std::enable_if_t<std::is_scalar<T>::value &&
+                                          std::is_scalar<U>::value>>
+    explicit complex(const T& real, const U& imag)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(imag))
+    {}
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    complex(const T& real)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(0.f))
+    {}
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    complex(const complex<T>& other)
+        : real_(static_cast<value_type>(other.real())),
+          imag_(static_cast<value_type>(other.imag()))
+    {}
+
+    value_type real() const noexcept { return real_; }
+
+    value_type imag() const noexcept { return imag_; }
+
+    operator std::complex<float>() const noexcept
+    {
+        return std::complex<float>(static_cast<float>(real_),
+                                   static_cast<float>(imag_));
+    }
+
+    template <typename V>
+    complex& operator=(const V& val)
+    {
+        real_ = val;
+        imag_ = value_type();
+        return *this;
+    }
+
+    template <typename V>
+    complex& operator=(const std::complex<V>& val)
+    {
+        real_ = val.real();
+        imag_ = val.imag();
+        return *this;
+    }
+
+    complex& operator+=(const value_type& real)
+    {
+        real_ += real;
+        return *this;
+    }
+
+    complex& operator-=(const value_type& real)
+    {
+        real_ -= real;
+        return *this;
+    }
+
+    complex& operator*=(const value_type& real)
+    {
+        real_ *= real;
+        imag_ *= real;
+        return *this;
+    }
+
+    complex& operator/=(const value_type& real)
+    {
+        real_ /= real;
+        imag_ /= real;
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator+=(const complex<T>& val)
+    {
+        real_ += val.real();
+        imag_ += val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator-=(const complex<T>& val)
+    {
+        real_ -= val.real();
+        imag_ -= val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator*=(const complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f *= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator/=(const complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f /= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+// It's for MacOS.
+// TODO: check whether mac compiler always use complex version even when real
+// half
+#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                  \
+    friend complex<sycl::half> operator _op(const complex<sycl::half> lhf, \
+                                            const complex<sycl::half> rhf) \
+    {                                                                      \
+        auto a = lhf;                                                      \
+        a _opeq rhf;                                                       \
+        return a;                                                          \
+    }
+
+    COMPLEX_HALF_OPERATOR(+, +=)
+    COMPLEX_HALF_OPERATOR(-, -=)
+    COMPLEX_HALF_OPERATOR(*, *=)
+    COMPLEX_HALF_OPERATOR(/, /=)
+
+#undef COMPLEX_HALF_OPERATOR
+
+private:
+    value_type real_;
+    value_type imag_;
+};
+
+}  // namespace std
+
+
+namespace gko {
+namespace detail {
+
+
+template <>
+struct basic_float_traits<sycl::half> {
+    using type = sycl::half;
+    static constexpr int sign_bits = 1;
+    static constexpr int significand_bits = 10;
+    static constexpr int exponent_bits = 5;
+    static constexpr bool rounds_to_nearest = true;
+};
+
+
+template <>
+struct is_complex_or_scalar_impl<sycl::half> : public std::true_type {};
+
+
+}  // namespace detail
+
+
+bool __dpct_inline__ is_nan(const sycl::half& val)
+{
+    return std::isnan(static_cast<float>(val));
+}
+
+bool __dpct_inline__ is_nan(const std::complex<sycl::half>& val)
+{
+    return is_nan(val.real()) || is_nan(val.imag());
+}
+
+
+sycl::half __dpct_inline__ abs(const sycl::half& val)
+{
+    return abs(static_cast<float>(val));
+}
+
+sycl::half __dpct_inline__ abs(const std::complex<sycl::half>& val)
+{
+    return abs(static_cast<std::complex<float>>(val));
+}
+
+sycl::half __dpct_inline__ sqrt(const sycl::half& val)
+{
+    return sqrt(static_cast<float>(val));
+}
+
+std::complex<sycl::half> __dpct_inline__
+sqrt(const std::complex<sycl::half>& val)
+{
+    return sqrt(static_cast<std::complex<float>>(val));
+}
+
+
+bool __dpct_inline__ is_finite(const sycl::half& value)
+{
+    return abs(value) < std::numeric_limits<sycl::half>::infinity();
+}
+
+bool __dpct_inline__ is_finite(const std::complex<sycl::half>& value)
+{
+    return is_finite(value.real()) && is_finite(value.imag());
+}
+
+
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_BASE_MATH_HPP_
diff --git a/dpcpp/base/types.hpp b/dpcpp/base/types.hpp
index 64c446c356e..df30d830c28 100644
--- a/dpcpp/base/types.hpp
+++ b/dpcpp/base/types.hpp
@@ -11,6 +11,7 @@
 #include <sycl/half_type.hpp>
 
 #include <ginkgo/core/base/half.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
@@ -55,6 +56,12 @@ struct sycl_type_impl<std::complex<T>> {
     using type = std::complex<typename sycl_type_impl<T>::type>;
 };
 
+template <typename ValueType, typename IndexType>
+struct sycl_type_impl<matrix_data_entry<ValueType, IndexType>> {
+    using type =
+        matrix_data_entry<typename sycl_type_impl<ValueType>::type, IndexType>;
+};
+
 }  // namespace detail
 
 
diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 1876019c4fc..2066deeef64 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -169,6 +169,8 @@ __dpct_inline__ ResultType reinterpret(ValueType val)
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 // Support 32-bit ATOMIC_ADD
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
+// Support 16-bit ATOMIC_ADD
+// GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short);
 
 
 #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
@@ -233,10 +235,12 @@ struct atomic_helper<
         }                                                                   \
     };
 
-// Support 64-bit ATOMIC_ADD
+// Support 64-bit ATOMIC_MAX
 GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned long long int);
-// Support 32-bit ATOMIC_ADD
+// Support 32-bit ATOMIC_MAX
 GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int);
+// Support 16-bit ATOMIC_MAX
+// GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned short);
 
 
 #undef GKO_BIND_ATOMIC_MAX_STRUCTURE
@@ -266,7 +270,15 @@ template <sycl::access::address_space addressSpace = atomic::global_space,
           typename T>
 __dpct_inline__ T atomic_add(T* __restrict__ addr, T val)
 {
-    return detail::atomic_helper<addressSpace, T>::atomic_add(addr, val);
+    if constexpr (std::is_same_v<T, sycl::half> ||
+                  std::is_same_v<T, std::complex<sycl::half>>) {
+        // unsupported
+        auto old = *addr;
+        *addr += val;
+        return old;
+    } else {
+        return detail::atomic_helper<addressSpace, T>::atomic_add(addr, val);
+    }
 }
 
 
@@ -274,7 +286,18 @@ template <sycl::access::address_space addressSpace = atomic::global_space,
           typename T>
 __dpct_inline__ T atomic_max(T* __restrict__ addr, T val)
 {
-    return detail::atomic_max_helper<addressSpace, T>::atomic_max(addr, val);
+    if constexpr (std::is_same_v<T, sycl::half> ||
+                  std::is_same_v<T, std::complex<sycl::half>>) {
+        // unsupported
+        auto old = *addr;
+        if (val > *addr) {
+            *addr = val;
+        }
+        return old;
+    } else {
+        return detail::atomic_max_helper<addressSpace, T>::atomic_max(addr,
+                                                                      val);
+    }
 }
 
 
diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp
index e13810deb74..cde772e756b 100644
--- a/dpcpp/factorization/cholesky_kernels.dp.cpp
+++ b/dpcpp/factorization/cholesky_kernels.dp.cpp
@@ -89,7 +89,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -138,7 +138,7 @@ void symbolic_factorize(
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -148,7 +148,7 @@ void forest_from_factor(std::shared_ptr<const DefaultExecutor> exec,
                         gko::factorization::elimination_forest<IndexType>&
                             forest) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -161,7 +161,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 IndexType* transpose_idxs,
                 matrix::Csr<ValueType, IndexType>* factors) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -173,7 +174,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 2b2858ff789..66292c00643 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -14,6 +14,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
@@ -464,7 +465,7 @@ void add_diagonal_elements(std::shared_ptr<const DpcppExecutor> exec,
 
     array<ValueType> new_values{exec, new_num_elems};
     array<IndexType> new_col_idxs{exec, new_num_elems};
-    auto dpcpp_new_values = new_values.get_data();
+    auto dpcpp_new_values = as_device_type(new_values.get_data());
     auto dpcpp_new_col_idxs = new_col_idxs.get_data();
 
     kernel::add_missing_diagonal_elements<subwarp_size>(
@@ -500,11 +501,12 @@ void initialize_row_ptrs_l_u(
         ceildiv(num_rows, static_cast<size_type>(block_size.x));
     const dim3 grid_dim{number_blocks, 1, 1};
 
-    kernel::count_nnz_per_l_u_row(grid_dim, block_size, 0, exec->get_queue(),
-                                  num_rows, system_matrix->get_const_row_ptrs(),
-                                  system_matrix->get_const_col_idxs(),
-                                  system_matrix->get_const_values(), l_row_ptrs,
-                                  u_row_ptrs);
+    kernel::count_nnz_per_l_u_row(
+        grid_dim, block_size, 0, exec->get_queue(), num_rows,
+        system_matrix->get_const_row_ptrs(),
+        system_matrix->get_const_col_idxs(),
+        as_device_type(system_matrix->get_const_values()), l_row_ptrs,
+        u_row_ptrs);
 
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
@@ -552,10 +554,11 @@ void initialize_row_ptrs_l(
         ceildiv(num_rows, static_cast<size_type>(block_size.x));
     const dim3 grid_dim{number_blocks, 1, 1};
 
-    kernel::count_nnz_per_l_row(grid_dim, block_size, 0, exec->get_queue(),
-                                num_rows, system_matrix->get_const_row_ptrs(),
-                                system_matrix->get_const_col_idxs(),
-                                system_matrix->get_const_values(), l_row_ptrs);
+    kernel::count_nnz_per_l_row(
+        grid_dim, block_size, 0, exec->get_queue(), num_rows,
+        system_matrix->get_const_row_ptrs(),
+        system_matrix->get_const_col_idxs(),
+        as_device_type(system_matrix->get_const_values()), l_row_ptrs);
 
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
@@ -578,9 +581,9 @@ void initialize_l(std::shared_ptr<const DpcppExecutor> exec,
     kernel::initialize_l(grid_dim, block_size, 0, exec->get_queue(), num_rows,
                          system_matrix->get_const_row_ptrs(),
                          system_matrix->get_const_col_idxs(),
-                         system_matrix->get_const_values(),
+                         as_device_type(system_matrix->get_const_values()),
                          csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-                         csr_l->get_values(), diag_sqrt);
+                         as_device_type(csr_l->get_values()), diag_sqrt);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
diff --git a/dpcpp/factorization/ic_kernels.dp.cpp b/dpcpp/factorization/ic_kernels.dp.cpp
index b2626e7876a..4968e1da538 100644
--- a/dpcpp/factorization/ic_kernels.dp.cpp
+++ b/dpcpp/factorization/ic_kernels.dp.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/dpcpp/factorization/ilu_kernels.dp.cpp b/dpcpp/factorization/ilu_kernels.dp.cpp
index 847547f7706..d9ea7776f3c 100644
--- a/dpcpp/factorization/ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/ilu_kernels.dp.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index bd26b1f79ca..e651944bb1f 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -32,7 +32,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 const int32* factor_lookup_storage, IndexType* diag_idxs,
                 matrix::Csr<ValueType, IndexType>* factors) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -42,7 +43,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+    GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index ac143a37465..36768774821 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
@@ -144,12 +145,13 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     auto nnz = l->get_num_stored_elements();
     auto num_blocks = ceildiv(nnz, default_block_size);
     for (size_type i = 0; i < iterations; ++i) {
-        kernel::ic_sweep(
-            num_blocks, default_block_size, 0, exec->get_queue(),
-            a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(),
-            a_lower->get_const_values(), l->get_const_row_ptrs(),
-            l->get_const_col_idxs(), as_device_type(l->get_values()),
-            static_cast<IndexType>(l->get_num_stored_elements()));
+        kernel::ic_sweep(num_blocks, default_block_size, 0, exec->get_queue(),
+                         a_lower->get_const_row_idxs(),
+                         a_lower->get_const_col_idxs(),
+                         as_device_type(a_lower->get_const_values()),
+                         l->get_const_row_ptrs(), l->get_const_col_idxs(),
+                         as_device_type(l->get_values()),
+                         static_cast<IndexType>(l->get_num_stored_elements()));
     }
 }
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index 4325c7e8818..13cc1b8974c 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -20,6 +20,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
@@ -425,7 +426,7 @@ void add_candidates(syn::value_list<int, subgroup_size>,
     l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
 
     auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
+    auto l_new_vals = as_device_type(l_new->get_values());
 
     // fill columns and values
     kernel::ict_tri_spgeam_init<subgroup_size>(
diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp
index abfd2d72238..6da7b142fe7 100644
--- a/dpcpp/factorization/par_ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp
@@ -9,6 +9,8 @@
 #include <ginkgo/core/matrix/coo.hpp>
 
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 
 
@@ -119,14 +121,15 @@ void compute_l_u_factors(std::shared_ptr<const DpcppExecutor> exec,
             grid_dim, block_size, 0, exec->get_queue(), num_elements,
             system_matrix->get_const_row_idxs(),
             system_matrix->get_const_col_idxs(),
-            system_matrix->get_const_values(), l_factor->get_const_row_ptrs(),
-            l_factor->get_const_col_idxs(), l_factor->get_values(),
+            as_device_type(system_matrix->get_const_values()),
+            l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(),
+            as_device_type(l_factor->get_values()),
             u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(),
-            u_factor->get_values());
+            as_device_type(u_factor->get_values()));
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index 12b74a3c0f3..c93e9574f81 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -21,6 +21,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -59,7 +60,7 @@ void threshold_filter_approx(syn::value_list<int, subgroup_size>,
                              matrix::Csr<ValueType, IndexType>* m_out,
                              matrix::Coo<ValueType, IndexType>* m_out_coo)
 {
-    auto values = as_device_type(m->get_const_values());
+    auto values = m->get_const_values();
     IndexType size = m->get_num_stored_elements();
     using AbsType = remove_complex<ValueType>;
     constexpr auto bucket_count = kernel::searchtree_width;
@@ -137,7 +138,7 @@ void threshold_filter_approx(syn::value_list<int, subgroup_size>,
     kernel::bucket_filter<subgroup_size>(
         num_blocks, default_block_size, 0, exec->get_queue(), old_row_ptrs,
         old_col_idxs, old_vals, oracles, num_rows, bucket, new_row_ptrs,
-        new_row_idxs, new_col_idxs, new_vals);
+        new_row_idxs, new_col_idxs, as_device_type(new_vals));
 }
 
 
@@ -166,7 +167,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         &threshold, m_out, m_out_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 02d2f479283..4eb018d8d31 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -18,6 +18,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
@@ -66,7 +67,7 @@ void threshold_filter(syn::value_list<int, subgroup_size>,
     auto new_row_ptrs = m_out->get_row_ptrs();
     kernel::threshold_filter_nnz<subgroup_size>(
         num_blocks, default_block_size, 0, exec->get_queue(), old_row_ptrs,
-        old_vals, num_rows, threshold, new_row_ptrs, lower);
+        old_vals, num_rows, as_device_type(threshold), new_row_ptrs, lower);
 
     // build row pointers
     components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
@@ -91,8 +92,9 @@ void threshold_filter(syn::value_list<int, subgroup_size>,
     }
     kernel::threshold_filter<subgroup_size>(
         num_blocks, default_block_size, 0, exec->get_queue(), old_row_ptrs,
-        old_col_idxs, old_vals, num_rows, threshold, new_row_ptrs, new_row_idxs,
-        new_col_idxs, new_vals, lower);
+        old_col_idxs, old_vals, num_rows, as_device_type(threshold),
+        new_row_ptrs, new_row_idxs, new_col_idxs, as_device_type(new_vals),
+        lower);
 }
 
 
@@ -121,7 +123,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         m_out_coo, lower);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_kernels.dp.cpp b/dpcpp/factorization/par_ilut_kernels.dp.cpp
index 5c9d4c6d769..f682641fc35 100644
--- a/dpcpp/factorization/par_ilut_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilut_kernels.dp.cpp
@@ -40,7 +40,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
                       array<remove_complex<ValueType>>&,
                       remove_complex<ValueType>& threshold) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -66,7 +66,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
                       matrix::Coo<ValueType, IndexType>* m_out_coo,
                       bool) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -82,7 +82,7 @@ void threshold_filter_approx(
     matrix::Csr<ValueType, IndexType>* m_out,
     matrix::Coo<ValueType, IndexType>* m_out_coo) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
                          matrix::Csr<ValueType, IndexType>* u_csc)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* u_new)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_select_common.dp.cpp b/dpcpp/factorization/par_ilut_select_common.dp.cpp
index acf383f84a0..a9f51233725 100644
--- a/dpcpp/factorization/par_ilut_select_common.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_common.dp.cpp
@@ -67,7 +67,7 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
                             unsigned char* oracles, IndexType* partial_counts, \
                             IndexType* total_counts)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT);
 
 
 template <typename IndexType>
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index fd76f3246b8..0856a245bc3 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -14,6 +14,7 @@
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -62,7 +63,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
                       array<remove_complex<ValueType>>& tmp2,
                       remove_complex<ValueType>& threshold)
 {
-    auto values = as_device_type(m->get_const_values());
+    auto values = m->get_const_values();
     IndexType size = m->get_num_stored_elements();
     using AbsType = remove_complex<ValueType>;
     constexpr auto bucket_count = kernel::searchtree_width;
@@ -149,7 +150,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index 23309290e68..664c74a5603 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -388,9 +388,9 @@ void add_candidates(syn::value_list<int, subgroup_size>,
     u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
 
     auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
+    auto l_new_vals = as_device_type(l_new->get_values());
     auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
+    auto u_new_vals = as_device_type(u_new->get_values());
 
     // fill columns and values
     kernel::tri_spgeam_init<subgroup_size>(
@@ -431,7 +431,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         u_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index 4ef3a473f35..d3812094916 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -17,6 +17,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
@@ -184,7 +185,7 @@ void compute_l_u_factors(syn::value_list<int, subgroup_size>,
         static_cast<IndexType>(l->get_num_stored_elements()),
         u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
         as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
-        u_csc->get_const_col_idxs(), u_csc->get_values(),
+        u_csc->get_const_col_idxs(), as_device_type(u_csc->get_values()),
         static_cast<IndexType>(u->get_num_stored_elements()));
 }
 
@@ -218,7 +219,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
         u_csc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index ae5122ec7f9..1ab3a0494f9 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -17,6 +17,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -136,8 +138,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
            const array<ValueType>* col_scale, const array<ValueType>* row_scale,
            batch::matrix::Csr<ValueType, IndexType>* input)
 {
-    const auto col_scale_vals = col_scale->get_const_data();
-    const auto row_scale_vals = row_scale->get_const_data();
+    const auto col_scale_vals = as_device_type(col_scale->get_const_data());
+    const auto row_scale_vals = as_device_type(row_scale->get_const_data());
     const auto num_rows = static_cast<int>(input->get_common_size()[0]);
     const auto num_cols = static_cast<int>(input->get_common_size()[1]);
     const auto stride = input->get_common_size()[1];
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index 6c0e4b4eb44..8f3aa8fd24e 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -21,6 +21,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -138,8 +140,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
            const array<ValueType>* col_scale, const array<ValueType>* row_scale,
            batch::matrix::Dense<ValueType>* input)
 {
-    const auto col_scale_vals = col_scale->get_const_data();
-    const auto row_scale_vals = row_scale->get_const_data();
+    const auto col_scale_vals = as_device_type(col_scale->get_const_data());
+    const auto row_scale_vals = as_device_type(row_scale->get_const_data());
     const auto num_rows = static_cast<int>(input->get_common_size()[0]);
     const auto num_cols = static_cast<int>(input->get_common_size()[1]);
     const auto stride = input->get_common_size()[1];
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index b4e2627a494..809e485311a 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -17,6 +17,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -136,8 +138,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
            const array<ValueType>* col_scale, const array<ValueType>* row_scale,
            batch::matrix::Ell<ValueType, IndexType>* input)
 {
-    const auto col_scale_vals = col_scale->get_const_data();
-    const auto row_scale_vals = row_scale->get_const_data();
+    const auto col_scale_vals = as_device_type(col_scale->get_const_data());
+    const auto row_scale_vals = as_device_type(row_scale->get_const_data());
     const auto num_rows = static_cast<int>(input->get_common_size()[0]);
     const auto num_cols = static_cast<int>(input->get_common_size()[1]);
     auto mat_ub = get_batch_struct(input);
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
index 77b9eb6b3d5..74d66dd42d2 100644
--- a/dpcpp/matrix/batch_struct.hpp
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -11,6 +11,7 @@
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/types.hpp"
 
 
 namespace gko {
@@ -32,10 +33,11 @@ namespace dpcpp {
  * Generates an immutable uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<const ValueType, const IndexType>
+inline batch::matrix::csr::uniform_batch<const device_type<ValueType>,
+                                         const IndexType>
 get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {op->get_const_values(),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_const_row_ptrs(),
             op->get_num_batch_items(),
@@ -49,10 +51,10 @@ get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<ValueType, IndexType> get_batch_struct(
-    batch::matrix::Csr<ValueType, IndexType>* const op)
+inline batch::matrix::csr::uniform_batch<device_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {op->get_values(),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_row_ptrs(),
             op->get_num_batch_items(),
@@ -66,10 +68,10 @@ inline batch::matrix::csr::uniform_batch<ValueType, IndexType> get_batch_struct(
  * Generates an immutable uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<const ValueType> get_batch_struct(
-    const batch::matrix::Dense<ValueType>* const op)
+inline batch::matrix::dense::uniform_batch<const device_type<ValueType>>
+get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
 {
-    return {op->get_const_values(), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -80,10 +82,10 @@ inline batch::matrix::dense::uniform_batch<const ValueType> get_batch_struct(
  * Generates a uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
-    batch::matrix::Dense<ValueType>* const op)
+inline batch::matrix::dense::uniform_batch<device_type<ValueType>>
+get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 {
-    return {op->get_values(), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -94,10 +96,11 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<const ValueType, const IndexType>
+inline batch::matrix::ell::uniform_batch<const device_type<ValueType>,
+                                         const IndexType>
 get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {op->get_const_values(),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
@@ -111,10 +114,10 @@ get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<ValueType, IndexType> get_batch_struct(
-    batch::matrix::Ell<ValueType, IndexType>* const op)
+inline batch::matrix::ell::uniform_batch<device_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {op->get_values(),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index 7782313e42e..f22606aea80 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -16,6 +16,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 38a83676e3d..3cf9661902a 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -32,6 +32,7 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
@@ -2477,9 +2478,10 @@ void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     inv_symm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
-        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
-        permuted->get_col_idxs(), as_device_type(permuted->get_values()));
+        as_device_type(scale), perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
+        permuted->get_row_ptrs(), permuted->get_col_idxs(),
+        as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2506,10 +2508,10 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     inv_nonsymm_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
-        row_scale, row_perm, col_scale, col_perm, orig->get_const_row_ptrs(),
-        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
-        permuted->get_row_ptrs(), permuted->get_col_idxs(),
-        as_device_type(permuted->get_values()));
+        as_device_type(row_scale), row_perm, as_device_type(col_scale),
+        col_perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        as_device_type(orig->get_const_values()), permuted->get_row_ptrs(),
+        permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
@@ -2533,9 +2535,9 @@ void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
-        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(),
+        as_device_type(scale), perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
+        row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
         as_device_type(row_permuted->get_values()));
 }
 
@@ -2560,9 +2562,9 @@ void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     inv_row_scale_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
-        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
-        as_device_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
-        row_permuted->get_col_idxs(),
+        as_device_type(scale), perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), as_device_type(orig->get_const_values()),
+        row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
         as_device_type(row_permuted->get_values()));
 }
 
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 4e44edaef7e..d75d0defcbe 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -21,6 +21,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -101,7 +102,8 @@ void transpose(sycl::queue* queue, const matrix::Dense<ValueType>* orig,
 
     queue->submit([&](sycl::handler& cgh) {
         sycl::local_accessor<
-            uninitialized_array<ValueType, sg_size*(sg_size + 1)>, 0>
+            uninitialized_array<device_type<ValueType>, sg_size*(sg_size + 1)>,
+            0>
             space_acc_ct1(cgh);
         // Can not pass the member to device function directly
         auto in = as_device_type(orig->get_const_values());
@@ -372,7 +374,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
             }
             for (; col_idx < max_nnz_per_row; col_idx++) {
                 cols[col_idx * stride + row] = invalid_index<IndexType>();
-                vals[col_idx * stride + row] = zero<ValueType>();
+                vals[col_idx * stride + row] = zero<device_type<ValueType>>();
             }
         });
     });
@@ -436,7 +438,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
                 }
             }
             for (; ell_count < ell_lim; ell_count++) {
-                ell_vals[ell_idx] = zero<ValueType>();
+                ell_vals[ell_idx] = zero<device_type<ValueType>>();
                 ell_cols[ell_idx] = invalid_index<IndexType>();
                 ell_idx += ell_stride;
             }
@@ -491,7 +493,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
             }
             for (; out_idx < slice_end; out_idx += slice_size) {
                 col_idxs[out_idx] = invalid_index<IndexType>();
-                vals[out_idx] = zero<ValueType>();
+                vals[out_idx] = zero<device_type<ValueType>>();
             }
         });
     });
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index e11123f84ce..4a877b788c7 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -24,6 +24,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 9d7f76ea7a6..2686529bd2f 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -14,6 +14,7 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -214,9 +215,9 @@ void classical_spmv(syn::value_list<int, subgroup_size>,
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subgroup_size>(
             grid, block, 0, exec->get_queue(), a->get_size()[0],
-            as_device_type(alpha->get_const_values()), a->get_const_value(),
-            a->get_const_col_idxs(), a->get_const_row_ptrs(),
-            acc::as_device_range(b_vals),
+            as_device_type(alpha->get_const_values()),
+            as_device_type(a->get_const_value()), a->get_const_col_idxs(),
+            a->get_const_row_ptrs(), acc::as_device_range(b_vals),
             as_device_type(beta->get_const_values()),
             acc::as_device_range(c_vals));
     } else {
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 96a10a2fd44..f700536f379 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -15,6 +15,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/merging.dp.hpp"
@@ -699,8 +700,9 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
             as_device_type(input->get_const_values()),
             inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(),
             excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(),
-            excess_system->get_col_idxs(), excess_system->get_values(),
-            excess_rhs->get_values(), e_start, e_end);
+            excess_system->get_col_idxs(),
+            as_device_type(excess_system->get_values()),
+            as_device_type(excess_rhs->get_values()), e_start, e_end);
     }
 }
 
@@ -719,7 +721,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     if (grid > 0) {
         kernel::scale_excess_solution<subwarp_size>(
             grid, block, 0, exec->get_queue(), excess_block_ptrs,
-            excess_solution->get_values(), e_start, e_end);
+            as_device_type(excess_solution->get_values()), e_start, e_end);
     }
 }
 
@@ -742,7 +744,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
         kernel::copy_excess_solution<subwarp_size>(
             grid, block, 0, exec->get_queue(), static_cast<IndexType>(num_rows),
             inverse->get_const_row_ptrs(), excess_rhs_ptrs,
-            excess_solution->get_const_values(),
+            as_device_type(excess_solution->get_const_values()),
             as_device_type(inverse->get_values()), e_start, e_end);
     }
 }
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index 4b9077d5ec5..23f614b3336 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -176,13 +176,15 @@ void advanced_apply(
     if (block_precisions) {
         kernel::advanced_adaptive_apply<max_block_size, subwarp_size,
                                         warps_per_block>(
-            grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme,
-            block_precisions, block_pointers, num_blocks, alpha, b, b_stride, x,
-            x_stride);
+            grid_size, block_size, 0, exec->get_queue(), as_device_type(blocks),
+            storage_scheme, block_precisions, block_pointers, num_blocks,
+            as_device_type(alpha), as_device_type(b), b_stride,
+            as_device_type(x), x_stride);
     } else {
         kernel::advanced_apply<max_block_size, subwarp_size, warps_per_block>(
-            grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme,
-            block_pointers, num_blocks, alpha, b, b_stride, x, x_stride);
+            grid_size, block_size, 0, exec->get_queue(), as_device_type(blocks),
+            storage_scheme, block_pointers, num_blocks, as_device_type(alpha),
+            as_device_type(b), b_stride, as_device_type(x), x_stride);
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 4ae013b1141..537e2b9cfa5 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -60,10 +60,9 @@ void apply(std::shared_ptr<const DpcppExecutor> exec, size_type num_blocks,
             syn::value_list<int, config::min_warps_per_block>(),
             syn::type_list<>(), exec, num_blocks,
             block_precisions.get_const_data(), block_pointers.get_const_data(),
-            blocks.get_const_data(), storage_scheme,
-            as_device_type(alpha->get_const_values()),
-            as_device_type(b->get_const_values()) + col, b->get_stride(),
-            as_device_type(x->get_values()) + col, x->get_stride());
+            blocks.get_const_data(), storage_scheme, alpha->get_const_values(),
+            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
+            x->get_stride());
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index adf8452fe4b..9bea9f71a72 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -15,6 +15,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/diagonal_block_manipulation.dp.hpp"
@@ -366,15 +367,16 @@ void generate(syn::value_list<int, max_block_size>,
                                   warps_per_block>(
             grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0],
             mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            as_device_type(mtx->get_const_values()), accuracy, block_data,
-            storage_scheme, conditioning, block_precisions, block_ptrs,
+            as_device_type(mtx->get_const_values()), as_device_type(accuracy),
+            as_device_type(block_data), storage_scheme,
+            as_device_type(conditioning), block_precisions, block_ptrs,
             num_blocks);
     } else {
         kernel::generate<max_block_size, subwarp_size, warps_per_block>(
             grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0],
             mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            as_device_type(mtx->get_const_values()), block_data, storage_scheme,
-            block_ptrs, num_blocks);
+            as_device_type(mtx->get_const_values()), as_device_type(block_data),
+            storage_scheme, block_ptrs, num_blocks);
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index 8eafc3af69d..84ea1a8a96b 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -170,13 +170,14 @@ void apply(syn::value_list<int, max_block_size>,
 
     if (block_precisions) {
         kernel::adaptive_apply<max_block_size, subwarp_size, warps_per_block>(
-            grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme,
-            block_precisions, block_pointers, num_blocks, b, b_stride, x,
-            x_stride);
+            grid_size, block_size, 0, exec->get_queue(), as_device_type(blocks),
+            storage_scheme, block_precisions, block_pointers, num_blocks,
+            as_device_type(b), b_stride, as_device_type(x), x_stride);
     } else {
         kernel::apply<max_block_size, subwarp_size, warps_per_block>(
-            grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme,
-            block_pointers, num_blocks, b, b_stride, x, x_stride);
+            grid_size, block_size, 0, exec->get_queue(), as_device_type(blocks),
+            storage_scheme, block_pointers, num_blocks, as_device_type(b),
+            b_stride, as_device_type(x), x_stride);
     }
 }
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index bf2fb49d2ae..610dbe1c8a6 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -56,8 +56,8 @@ void simple_apply(
             syn::type_list<>(), exec, num_blocks,
             block_precisions.get_const_data(), block_pointers.get_const_data(),
             blocks.get_const_data(), storage_scheme,
-            as_device_type(b->get_const_values()) + col, b->get_stride(),
-            as_device_type(x->get_values()) + col, x->get_stride());
+            b->get_const_values() + col, b->get_stride(), x->get_values() + col,
+            x->get_stride());
     }
 }
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index e86eec5f21b..ed7ad3fafd5 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -13,10 +13,11 @@
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 #include "dpcpp/solver/batch_bicgstab_launch.hpp"
 
-
 namespace gko {
 namespace kernels {
 namespace dpcpp {
@@ -37,6 +38,8 @@ int get_group_size(int value, int subgroup_size = config::warp_size)
 template <typename ValueType>
 class kernel_caller {
 public:
+    using sycl_value_type = sycl_type<ValueType>;
+
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
                   const settings<remove_complex<ValueType>> settings)
         : exec_{std::move(exec)}, settings_{settings}
@@ -46,10 +49,18 @@ class kernel_caller {
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
-        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
+        const gko::batch::multi_vector::uniform_batch<const sycl_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<sycl_value_type>& x) const
     {
-        using real_type = gko::remove_complex<ValueType>;
+        using real_type = gko::remove_complex<sycl_value_type>;
+        if constexpr (std::is_same_v<ValueType, half>) {
+            static_assert(
+                std::is_same_v<typename StopType::real_type, sycl::half>,
+                "fail!");
+            static_assert(
+                !std::is_same_v<typename StopType::real_type, gko::half>,
+                "fail!");
+        }
         const size_type num_batch_items = mat.num_batch_items;
         const auto num_rows = mat.num_rows;
         const auto num_rhs = b.num_rhs;
@@ -67,7 +78,7 @@ class kernel_caller {
         // alpha, omega, temp
         // If the value available is negative, then set it to 0
         const int static_var_mem =
-            5 * sizeof(ValueType) + 2 * sizeof(real_type);
+            5 * sizeof(sycl_value_type) + 2 * sizeof(real_type);
         int shmem_per_blk = std::max(
             static_cast<int>(
                 device.get_info<sycl::info::device::local_mem_size>()) -
@@ -76,19 +87,18 @@ class kernel_caller {
         const int padded_num_rows = num_rows;
         const size_type prec_size = PrecType::dynamic_work_size(
             padded_num_rows, mat.get_single_item_num_nnz());
-        const auto sconf =
-            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
-                                                                 ValueType>(
-                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
-                b.num_rhs);
+        const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage<
+            PrecType, sycl_value_type>(shmem_per_blk, padded_num_rows,
+                                       mat.get_single_item_num_nnz(),
+                                       b.num_rhs);
         const size_t shared_size = sconf.n_shared * padded_num_rows +
                                    (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<ValueType>(
-            exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(ValueType));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(ValueType) == 0);
+        auto workspace = gko::array<sycl_value_type>(
+            exec_, sconf.gmem_stride_bytes * num_batch_items /
+                       sizeof(sycl_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(sycl_value_type) == 0);
 
-        ValueType* const workspace_data = workspace.get_data();
+        sycl_value_type* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
         // launch_apply_kernel<StopType, subgroup_size, n_shared_total>
diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp
index a9c78b9df45..47ed6e83d27 100644
--- a/dpcpp/solver/batch_bicgstab_launch.hpp
+++ b/dpcpp/solver/batch_bicgstab_launch.hpp
@@ -33,25 +33,26 @@ void launch_apply_kernel(
     const gko::kernels::batch_bicgstab::storage_config& sconf,
     const settings<remove_complex<ValueType>>& settings, LogType& logger,
     PrecType& prec, const BatchMatrixType& mat,
-    const ValueType* const __restrict__ b_values,
-    ValueType* const __restrict__ x_values,
-    ValueType* const __restrict__ workspace, const int& group_size,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace, const int& group_size,
     const int& shared_size);
 
 
 #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _subgroup_size, _n_shared, \
                                           mat_t, log_t, pre_t, stop_t)       \
-    void                                                                     \
-    launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>(  \
+    void launch_apply_kernel<_vtype, stop_t<device_type<_vtype>>,            \
+                             _subgroup_size, _n_shared>(                     \
         std::shared_ptr<const DefaultExecutor> exec,                         \
         const gko::kernels::batch_bicgstab::storage_config& sconf,           \
         const settings<remove_complex<_vtype>>& settings,                    \
-        log_t<gko::remove_complex<_vtype>>& logger, pre_t<_vtype>& prec,     \
-        const mat_t<const _vtype>& mat,                                      \
-        const _vtype* const __restrict__ b_values,                           \
-        _vtype* const __restrict__ x_values,                                 \
-        _vtype* const __restrict__ workspace_data, const int& block_size,    \
-        const int& shared_size)
+        log_t<gko::remove_complex<device_type<_vtype>>>& logger,             \
+        pre_t<device_type<_vtype>>& prec,                                    \
+        const mat_t<const device_type<_vtype>>& mat,                         \
+        const device_type<_vtype>* const __restrict__ b_values,              \
+        device_type<_vtype>* const __restrict__ x_values,                    \
+        device_type<_vtype>* const __restrict__ workspace_data,              \
+        const int& block_size, const int& shared_size)
 
 #define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0 \
     GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, 32, 0)
diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
index b45d6409575..69afac52b70 100644
--- a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
@@ -16,6 +16,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -37,9 +39,9 @@ void launch_apply_kernel(
     const gko::kernels::batch_bicgstab::storage_config& sconf,
     const settings<remove_complex<ValueType>>& settings, LogType& logger,
     PrecType& prec, const BatchMatrixType& mat,
-    const ValueType* const __restrict__ b_values,
-    ValueType* const __restrict__ x_values,
-    ValueType* const __restrict__ workspace, const int& group_size,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace, const int& group_size,
     const int& shared_size)
 {
     auto num_rows = mat.num_rows;
@@ -48,10 +50,10 @@ void launch_apply_kernel(
     const dim3 grid(mat.num_batch_items);
 
     auto max_iters = settings.max_iterations;
-    auto res_tol = settings.residual_tol;
+    auto res_tol = as_device_type(settings.residual_tol);
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        sycl::local_accessor<ValueType, 1> slm_values(
+        sycl::local_accessor<device_type<ValueType>, 1> slm_values(
             sycl::range<1>(shared_size), cgh);
 
         cgh.parallel_for(
@@ -61,18 +63,19 @@ void launch_apply_kernel(
                 auto batch_id = item_ct1.get_group_linear_id();
                 const auto mat_global_entry =
                     gko::batch::matrix::extract_batch_item(mat, batch_id);
-                const ValueType* const b_global_entry =
+                const device_type<ValueType>* const b_global_entry =
                     gko::batch::multi_vector::batch_item_ptr(
                         b_values, 1, num_rows, batch_id);
-                ValueType* const x_global_entry =
+                device_type<ValueType>* const x_global_entry =
                     gko::batch::multi_vector::batch_item_ptr(
                         x_values, 1, num_rows, batch_id);
                 batch_single_kernels::apply_kernel<StopType, n_shared_total>(
                     sconf, max_iters, res_tol, logger, prec, mat_global_entry,
                     b_global_entry, x_global_entry, num_rows,
                     mat.get_single_item_num_nnz(),
-                    static_cast<ValueType*>(slm_values.get_pointer()), item_ct1,
-                    workspace);
+                    static_cast<device_type<ValueType>*>(
+                        slm_values.get_pointer()),
+                    item_ct1, workspace);
             });
     });
 }
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 5ded4a53978..7f173a1dfd0 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -13,6 +13,8 @@
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 #include "dpcpp/solver/batch_cg_kernels.hpp"
 #include "dpcpp/solver/batch_cg_launch.hpp"
@@ -38,6 +40,7 @@ int get_group_size(int value, int subgroup_size = config::warp_size)
 template <typename ValueType>
 class kernel_caller {
 public:
+    using sycl_value_type = sycl_type<ValueType>;
     kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
                   const settings<remove_complex<ValueType>> settings)
         : exec_{std::move(exec)}, settings_{settings}
@@ -47,8 +50,8 @@ class kernel_caller {
               typename LogType>
     void call_kernel(
         LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
-        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
+        const gko::batch::multi_vector::uniform_batch<const sycl_value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<sycl_value_type>& x) const
     {
         using real_type = typename gko::remove_complex<ValueType>;
         const size_type num_batch_items = mat.num_batch_items;
@@ -68,7 +71,7 @@ class kernel_caller {
         // alpha and two norms
         // If the value available is negative, then set it to 0
         const int static_var_mem =
-            3 * sizeof(ValueType) + 2 * sizeof(real_type);
+            3 * sizeof(sycl_value_type) + 2 * sizeof(real_type);
         int shmem_per_blk = std::max(
             static_cast<int>(
                 device.get_info<sycl::info::device::local_mem_size>()) -
@@ -78,17 +81,18 @@ class kernel_caller {
         const size_type prec_size = PrecType::dynamic_work_size(
             padded_num_rows, mat.get_single_item_num_nnz());
         const auto sconf =
-            gko::kernels::batch_cg::compute_shared_storage<PrecType, ValueType>(
+            gko::kernels::batch_cg::compute_shared_storage<PrecType,
+                                                           sycl_value_type>(
                 shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
                 b.num_rhs);
         const size_t shared_size = sconf.n_shared * padded_num_rows +
                                    (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array<ValueType>(
-            exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(ValueType));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(ValueType) == 0);
+        auto workspace = gko::array<sycl_value_type>(
+            exec_, sconf.gmem_stride_bytes * num_batch_items /
+                       sizeof(sycl_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(sycl_value_type) == 0);
 
-        ValueType* const workspace_data = workspace.get_data();
+        sycl_value_type* const workspace_data = workspace.get_data();
         int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
 
         // template
diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp
index c5f8e0d5dba..329671cb1c1 100644
--- a/dpcpp/solver/batch_cg_launch.hpp
+++ b/dpcpp/solver/batch_cg_launch.hpp
@@ -27,29 +27,30 @@ using settings = gko::kernels::batch_cg::settings<T>;
 template <typename ValueType, typename StopType, const int subgroup_size,
           const int n_shared_total, typename PrecType, typename LogType,
           typename BatchMatrixType>
-void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
-                         const gko::kernels::batch_cg::storage_config& sconf,
-                         const settings<remove_complex<ValueType>>& settings,
-                         LogType& logger, PrecType& prec,
-                         const BatchMatrixType& mat,
-                         const ValueType* const __restrict__ b_values,
-                         ValueType* const __restrict__ x_values,
-                         ValueType* const __restrict__ workspace,
-                         const int& group_size, const int& shared_size);
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace, const int& group_size,
+    const int& shared_size);
 
 #define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _subgroup_size, _n_shared, mat_t, \
                                     log_t, pre_t, stop_t)                     \
-    void                                                                      \
-    launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>(   \
+    void launch_apply_kernel<_vtype, stop_t<device_type<_vtype>>,             \
+                             _subgroup_size, _n_shared>(                      \
         std::shared_ptr<const DefaultExecutor> exec,                          \
         const gko::kernels::batch_cg::storage_config& sconf,                  \
         const settings<remove_complex<_vtype>>& settings,                     \
-        log_t<gko::remove_complex<_vtype>>& logger, pre_t<_vtype>& prec,      \
-        const mat_t<const _vtype>& mat,                                       \
-        const _vtype* const __restrict__ b_values,                            \
-        _vtype* const __restrict__ x_values,                                  \
-        _vtype* const __restrict__ workspace_data, const int& block_size,     \
-        const int& shared_size)
+        log_t<gko::remove_complex<device_type<_vtype>>>& logger,              \
+        pre_t<device_type<_vtype>>& prec,                                     \
+        const mat_t<const device_type<_vtype>>& mat,                          \
+        const device_type<_vtype>* const __restrict__ b_values,               \
+        device_type<_vtype>* const __restrict__ x_values,                     \
+        device_type<_vtype>* const __restrict__ workspace_data,               \
+        const int& block_size, const int& shared_size)
 
 #define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0 \
     GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, 32, 0)
diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
index ba887c8aeb5..12f2983e846 100644
--- a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
@@ -19,6 +19,8 @@
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
@@ -39,15 +41,15 @@ namespace batch_cg {
 template <typename ValueType, typename StopType, const int subgroup_size,
           const int n_shared_total, typename PrecType, typename LogType,
           typename BatchMatrixType>
-void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
-                         const gko::kernels::batch_cg::storage_config& sconf,
-                         const settings<remove_complex<ValueType>>& settings,
-                         LogType& logger, PrecType& prec,
-                         const BatchMatrixType& mat,
-                         const ValueType* const __restrict__ b_values,
-                         ValueType* const __restrict__ x_values,
-                         ValueType* const __restrict__ workspace,
-                         const int& group_size, const int& shared_size)
+void launch_apply_kernel(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings<remove_complex<ValueType>>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const device_type<ValueType>* const __restrict__ b_values,
+    device_type<ValueType>* const __restrict__ x_values,
+    device_type<ValueType>* const __restrict__ workspace, const int& group_size,
+    const int& shared_size)
 {
     auto num_rows = mat.num_rows;
 
@@ -55,10 +57,10 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
     const dim3 grid(mat.num_batch_items);
 
     auto max_iters = settings.max_iterations;
-    auto res_tol = settings.residual_tol;
+    auto res_tol = as_device_type(settings.residual_tol);
 
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        sycl::local_accessor<ValueType, 1> slm_values(
+        sycl::local_accessor<device_type<ValueType>, 1> slm_values(
             sycl::range<1>(shared_size), cgh);
 
         cgh.parallel_for(
@@ -68,18 +70,19 @@ void launch_apply_kernel(std::shared_ptr<const DefaultExecutor> exec,
                 auto batch_id = item_ct1.get_group_linear_id();
                 const auto mat_global_entry =
                     gko::batch::matrix::extract_batch_item(mat, batch_id);
-                const ValueType* const b_global_entry =
+                const device_type<ValueType>* const b_global_entry =
                     gko::batch::multi_vector::batch_item_ptr(
                         b_values, 1, num_rows, batch_id);
-                ValueType* const x_global_entry =
+                device_type<ValueType>* const x_global_entry =
                     gko::batch::multi_vector::batch_item_ptr(
                         x_values, 1, num_rows, batch_id);
                 batch_single_kernels::apply_kernel<StopType, n_shared_total>(
                     sconf, max_iters, res_tol, logger, prec, mat_global_entry,
                     b_global_entry, x_global_entry, num_rows,
                     mat.get_single_item_num_nnz(),
-                    static_cast<ValueType*>(slm_values.get_pointer()), item_ct1,
-                    workspace);
+                    static_cast<device_type<ValueType>*>(
+                        slm_values.get_pointer()),
+                    item_ct1, workspace);
             });
     });
 }
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index 319dff53771..f0b589739bb 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -18,6 +18,7 @@
 #include "core/components/fill_array_kernels.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/onemkl_bindings.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/atomic.dp.hpp"
@@ -672,16 +673,26 @@ void update_g_and_u(std::shared_ptr<const DpcppExecutor> exec,
 
     for (size_type i = 0; i < k; i++) {
         const auto p_i = as_device_type(p->get_const_values()) + i * p_stride;
-        if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, as_device_type(alpha->get_values()),
-                                   nrhs, zero<ValueType>());
+        auto gko_impl = [&]() {
+            components::fill_array(exec, alpha->get_values(), nrhs,
+                                   zero<ValueType>());
             multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
-                            nrhs, p_i, g_k->get_values(), g_k->get_stride(),
+                            nrhs, p_i, as_device_type(g_k->get_values()),
+                            g_k->get_stride(),
                             as_device_type(alpha->get_values()),
                             stop_status->get_const_data());
+        };
+        if constexpr (std::is_same_v<ValueType, half> ||
+                      is_complex<ValueType>()) {
+            gko_impl();
         } else {
-            onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_values(),
-                        g_k->get_stride(), as_device_type(alpha->get_values()));
+            if (nrhs > 1) {
+                gko_impl();
+            } else {
+                onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_values(),
+                            g_k->get_stride(),
+                            as_device_type(alpha->get_values()));
+            }
         }
         update_g_k_and_u_kernel<default_block_size>(
             ceildiv(size * g_k->get_stride(), default_block_size),
@@ -689,14 +700,14 @@ void update_g_and_u(std::shared_ptr<const DpcppExecutor> exec,
             as_device_type(alpha->get_const_values()),
             as_device_type(m->get_const_values()), m->get_stride(),
             as_device_type(g->get_const_values()), g->get_stride(),
-            g_k->get_values(), g_k->get_stride(),
+            as_device_type(g_k->get_values()), g_k->get_stride(),
             as_device_type(u->get_values()), u->get_stride(),
             stop_status->get_const_data());
     }
     update_g_kernel<default_block_size>(
         ceildiv(size * g_k->get_stride(), default_block_size),
         default_block_size, 0, exec->get_queue(), k, size, nrhs,
-        g_k->get_const_values(), g_k->get_stride(),
+        as_device_type(g_k->get_const_values()), g_k->get_stride(),
         as_device_type(g->get_values()), g->get_stride(),
         stop_status->get_const_data());
 }
@@ -718,17 +729,26 @@ void update_m(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     const dim3 block_dim(default_dot_dim, default_dot_dim);
 
     for (size_type i = k; i < subspace_dim; i++) {
-        const auto p_i = as_device_type(p->get_const_values()) + i * p_stride;
-        auto m_i = as_device_type(m->get_values()) + i * m_stride + k * nrhs;
-        if (nrhs > 1 || is_complex<ValueType>()) {
+        const auto p_i = p->get_const_values() + i * p_stride;
+        auto m_i = m->get_values() + i * m_stride + k * nrhs;
+        auto gko_impl = [&]() {
             components::fill_array(exec, m_i, nrhs, zero<ValueType>());
             multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
-                            nrhs, p_i, g_k->get_const_values(),
-                            g_k->get_stride(), m_i,
+                            nrhs, as_device_type(p_i),
+                            as_device_type(g_k->get_const_values()),
+                            g_k->get_stride(), as_device_type(m_i),
                             stop_status->get_const_data());
+        };
+        if constexpr (std::is_same_v<ValueType, half> ||
+                      is_complex<ValueType>()) {
+            gko_impl();
         } else {
-            onemkl::dot(*exec->get_queue(), size, p_i, 1,
-                        g_k->get_const_values(), g_k->get_stride(), m_i);
+            if (nrhs > 1) {
+                gko_impl();
+            } else {
+                onemkl::dot(*exec->get_queue(), size, as_device_type(p_i), 1,
+                            g_k->get_const_values(), g_k->get_stride(), m_i);
+            }
         }
     }
 }
@@ -757,9 +777,8 @@ void update_x_r_and_f(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(r->get_values()), r->get_stride(),
         as_device_type(x->get_values()), x->get_stride(),
         stop_status->get_const_data());
-    components::fill_array(
-        exec, as_device_type(f->get_values()) + k * f->get_stride(), nrhs,
-        zero<ValueType>());
+    components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
+                           zero<ValueType>());
 }
 
 
@@ -825,7 +844,7 @@ void step_2(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     step_2_kernel(grid_dim, default_block_size, 0, exec->get_queue(), k,
                   num_rows, subspace_dim, nrhs,
                   as_device_type(omega->get_const_values()),
-                  preconditioned_vector->get_const_values(),
+                  as_device_type(preconditioned_vector->get_const_values()),
                   preconditioned_vector->get_stride(),
                   as_device_type(c->get_const_values()), c->get_stride(),
                   as_device_type(u->get_values()), u->get_stride(),
@@ -860,11 +879,11 @@ void compute_omega(
     matrix::Dense<ValueType>* omega, const array<stopping_status>* stop_status)
 {
     const auto grid_dim = ceildiv(nrhs, config::warp_size);
-    compute_omega_kernel(grid_dim, config::warp_size, 0, exec->get_queue(),
-                         nrhs, kappa, as_device_type(tht->get_const_values()),
-                         residual_norm->get_const_values(),
-                         as_device_type(omega->get_values()),
-                         stop_status->get_const_data());
+    compute_omega_kernel(
+        grid_dim, config::warp_size, 0, exec->get_queue(), nrhs,
+        as_device_type(kappa), as_device_type(tht->get_const_values()),
+        as_device_type(residual_norm->get_const_values()),
+        as_device_type(omega->get_values()), stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 3da9bfd3a75..ff251cdc943 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -12,6 +12,7 @@
 
 #include "core/base/array_access.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 

From cce61ee2651e611e00c47f6e7536b0fb108e0db7 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sun, 24 Nov 2024 19:05:21 +0100
Subject: [PATCH 417/448] fix template expect argument

---
 core/test/mpi/distributed/matrix.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp
index 4062393564c..2b027face11 100644
--- a/core/test/mpi/distributed/matrix.cpp
+++ b/core/test/mpi/distributed/matrix.cpp
@@ -186,9 +186,9 @@ TYPED_TEST(MatrixBuilder, BuildWithLocal)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::local_index_type;
     using dist_mat_type = typename TestFixture::dist_mtx_type;
-    this->template forall_matrix_types([this](auto with_matrix_type,
-                                              auto expected_type_ptr,
-                                              auto additional_test) {
+    this->forall_matrix_types([this](auto with_matrix_type,
+                                     auto expected_type_ptr,
+                                     auto additional_test) {
         using expected_type = typename std::remove_pointer<
             decltype(expected_type_ptr.get())>::type;
 
@@ -209,9 +209,9 @@ TYPED_TEST(MatrixBuilder, BuildWithLocalAndNonLocal)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::local_index_type;
     using dist_mat_type = typename TestFixture::dist_mtx_type;
-    this->template forall_matrix_types([this](auto with_local_matrix_type,
-                                              auto expected_local_type_ptr,
-                                              auto additional_local_test) {
+    this->forall_matrix_types([this](auto with_local_matrix_type,
+                                     auto expected_local_type_ptr,
+                                     auto additional_local_test) {
         using expected_local_type = typename std::remove_pointer<
             decltype(expected_local_type_ptr.get())>::type;
         this->forall_matrix_types([&](auto with_non_local_matrix_type,
@@ -289,9 +289,9 @@ TYPED_TEST(MatrixBuilder, BuildFromLinOpLocal)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::local_index_type;
     using dist_mat_type = typename TestFixture::dist_mtx_type;
-    this->template forall_matrix_types([this](auto with_matrix_type,
-                                              auto expected_type_ptr,
-                                              auto additional_test) {
+    this->forall_matrix_types([this](auto with_matrix_type,
+                                     auto expected_type_ptr,
+                                     auto additional_test) {
         using expected_type = typename std::remove_pointer<
             decltype(expected_type_ptr.get())>::type;
 
@@ -312,9 +312,9 @@ TYPED_TEST(MatrixBuilder, BuildFromLinOpLocalAndNonLocal)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::local_index_type;
     using dist_mat_type = typename TestFixture::dist_mtx_type;
-    this->template forall_matrix_types([this](auto with_local_matrix_type,
-                                              auto expected_local_type_ptr,
-                                              auto additional_local_test) {
+    this->forall_matrix_types([this](auto with_local_matrix_type,
+                                     auto expected_local_type_ptr,
+                                     auto additional_local_test) {
         using expected_local_type = typename std::remove_pointer<
             decltype(expected_local_type_ptr.get())>::type;
         this->forall_matrix_types([&](auto with_non_local_matrix_type,

From 74dab1cd273bb5e59e90cf3b478eceeeeecbb024 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 22 Nov 2024 19:17:43 +0100
Subject: [PATCH 418/448] intel sycl introduces a silent complex header loading
 chain. To adapt to it, we need to apply the same trick so that we can provide
 the implementation before the complex header is loaded.

---
 CMakeLists.txt                                |   2 +
 cmake/sycl.cmake                              |  15 +-
 dpcpp/CMakeLists.txt                          |   5 +
 dpcpp/base/complex.hpp                        | 223 ++++++++++++++++++
 dpcpp/base/math.hpp                           | 152 +-----------
 .../par_ilut_spgeam_kernel.dp.cpp             |   1 +
 dpcpp/matrix/diagonal_kernels.dp.cpp          |   1 +
 dpcpp/preconditioner/batch_block_jacobi.hpp   |   2 +-
 ...cobi_advanced_apply_instantiate.inc.dp.cpp |   2 +
 ...jacobi_simple_apply_instantiate.inc.dp.cpp |   2 +
 include/ginkgo/config.hpp.in                  |   1 +
 11 files changed, 248 insertions(+), 158 deletions(-)
 create mode 100644 dpcpp/base/complex.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90c1c3ba4e3..8a347d010dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -304,9 +304,11 @@ endif()
 
 if(GINKGO_BUILD_SYCL)
     ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION)
+    ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MINOR_VERSION __LIBSYCL_MINOR_VERSION)
     ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION)
 else()
     set(GINKGO_DPCPP_MAJOR_VERSION "0")
+    set(GINKGO_DPCPP_MINOR_VERSION "0")
 endif()
 configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
     ${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)
diff --git a/cmake/sycl.cmake b/cmake/sycl.cmake
index b0f4eab91f1..5289ee253e7 100644
--- a/cmake/sycl.cmake
+++ b/cmake/sycl.cmake
@@ -15,11 +15,6 @@ endif()
 
 # Provide a uniform way for those package without add_sycl_to_target
 function(gko_add_sycl_to_target)
-    if(COMMAND add_sycl_to_target)
-        add_sycl_to_target(${ARGN})
-        return()
-    endif()
-    # We handle them by adding SYCL_FLAGS to compile and link to the target
     set(one_value_args TARGET)
     set(multi_value_args SOURCES)
     cmake_parse_arguments(SYCL
@@ -27,7 +22,15 @@ function(gko_add_sycl_to_target)
         "${one_value_args}"
         "${multi_value_args}"
         ${ARGN})
+    # trick for complex header chain
+    if("${GINKGO_DPCPP_MAJOR_VERSION}.${GINKGO_DPCPP_MINOR_VERSION}" VERSION_GREATER_EQUAL 7.1)
+        target_include_directories(${SYCL_TARGET} PRIVATE "${PROJECT_BINARY_DIR}/dpcpp/base")
+    endif()
+    if(COMMAND add_sycl_to_target)
+        add_sycl_to_target(${ARGN})
+        return()
+    endif()
+    # We handle them by adding SYCL_FLAGS to compile and link to the target
     target_compile_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}")
     target_link_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}")
 endfunction()
-
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 81a2a6034ea..8da2620ed41 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -4,6 +4,11 @@ find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE)
 set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE)
 
+# trick for complex header chain: copy the header verbatim (no substitution)
+if("${GINKGO_DPCPP_MAJOR_VERSION}.${GINKGO_DPCPP_MINOR_VERSION}" VERSION_GREATER_EQUAL 7.1)
+    configure_file(base/complex.hpp ${CMAKE_CURRENT_BINARY_DIR}/base/complex COPYONLY)
+endif()
+
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE)
 add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.dp.cpp BATCH_BICGSTAB_INSTANTIATE)
diff --git a/dpcpp/base/complex.hpp b/dpcpp/base/complex.hpp
new file mode 100644
index 00000000000..56ce347afdd
--- /dev/null
+++ b/dpcpp/base/complex.hpp
@@ -0,0 +1,223 @@
+// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_DPCPP_BASE_COMPLEX_HPP_
+#define GKO_DPCPP_BASE_COMPLEX_HPP_
+
+#include <sycl/half_type.hpp>
+
+#include <ginkgo/config.hpp>
+
+// This file works around the way Intel SYCL loads the <complex> header.
+// Intel SYCL ships its own <complex> and the matching search path. When users
+// include <complex> with -fsycl, the compiler loads Intel's <complex> header
+// first and then the usual <complex> header. However, it implicitly
+// instantiates and uses std::complex<sycl::half>, so we need to provide the
+// implementation before that. Ginkgo always includes <complex> in the public
+// interface, which comes before the sycl backend, so we have no normal way
+// to provide the std::complex<sycl::half> implementation in sycl.
+// We apply the same trick: this file is loaded first and their header is
+// loaded afterwards. We also configure this file as <complex> and provide the
+// search path in the sycl module.
+// They started doing this with LIBSYCL 7.1.0.
+
+namespace std {
+
+template <typename>
+class complex;
+
+// implement std::complex<sycl::half> before knowing std::complex<float>
+template <>
+class complex<sycl::half> {
+public:
+    using value_type = sycl::half;
+
+    complex(const value_type& real = value_type(0.f),
+            const value_type& imag = value_type(0.f))
+        : real_(real), imag_(imag)
+    {}
+
+    template <typename T, typename U,
+              typename = std::enable_if_t<std::is_scalar<T>::value &&
+                                          std::is_scalar<U>::value>>
+    explicit complex(const T& real, const U& imag)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(imag))
+    {}
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    complex(const T& real)
+        : real_(static_cast<value_type>(real)),
+          imag_(static_cast<value_type>(0.f))
+    {}
+
+    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    complex(const complex<T>& other)
+        : real_(static_cast<value_type>(other.real())),
+          imag_(static_cast<value_type>(other.imag()))
+    {}
+
+    value_type real() const noexcept { return real_; }
+
+    value_type imag() const noexcept { return imag_; }
+
+    inline operator std::complex<float>() const noexcept;
+
+    template <typename V>
+    complex& operator=(const V& val)
+    {
+        real_ = val;
+        imag_ = value_type();
+        return *this;
+    }
+
+    template <typename V>
+    complex& operator=(const std::complex<V>& val)
+    {
+        real_ = val.real();
+        imag_ = val.imag();
+        return *this;
+    }
+
+    complex& operator+=(const value_type& real)
+    {
+        real_ += real;
+        return *this;
+    }
+
+    complex& operator-=(const value_type& real)
+    {
+        real_ -= real;
+        return *this;
+    }
+
+    complex& operator*=(const value_type& real)
+    {
+        real_ *= real;
+        imag_ *= real;
+        return *this;
+    }
+
+    complex& operator/=(const value_type& real)
+    {
+        real_ /= real;
+        imag_ /= real;
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator+=(const complex<T>& val)
+    {
+        real_ += val.real();
+        imag_ += val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    complex& operator-=(const complex<T>& val)
+    {
+        real_ -= val.real();
+        imag_ -= val.imag();
+        return *this;
+    }
+
+    template <typename T>
+    inline complex& operator*=(const complex<T>& val);
+
+    template <typename T>
+    inline complex& operator/=(const complex<T>& val);
+
+// It's for MacOS.
+// TODO: check whether mac compiler always use complex version even when real
+// half
+#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                  \
+    friend complex<sycl::half> operator _op(const complex<sycl::half> lhf, \
+                                            const complex<sycl::half> rhf) \
+    {                                                                      \
+        auto a = lhf;                                                      \
+        a _opeq rhf;                                                       \
+        return a;                                                          \
+    }
+
+    COMPLEX_HALF_OPERATOR(+, +=)
+    COMPLEX_HALF_OPERATOR(-, -=)
+    COMPLEX_HALF_OPERATOR(*, *=)
+    COMPLEX_HALF_OPERATOR(/, /=)
+
+#undef COMPLEX_HALF_OPERATOR
+
+private:
+    value_type real_;
+    value_type imag_;
+};
+
+}  // namespace std
+
+
+// after providing std::complex<sycl::half>, we can load their <complex> to
+// complete the header chain.
+
+#if GINKGO_DPCPP_MAJOR_VERSION > 7 || \
+    (GINKGO_DPCPP_MAJOR_VERSION == 7 && GINKGO_DPCPP_MINOR_VERSION >= 1)
+
+#if defined(__has_include_next)
+// GCC and clang support #include_next and take this path.
+#include_next <complex>
+#else
+// MSVC doesn't support "#include_next", so we take the same workaround in
+// stl_wrappers/complex.
+#include <../stl_wrappers/complex>
+#endif
+
+#else
+
+
+#include <complex>
+
+
+#endif
+
+
+// we know the complex<float> now, so we implement those functions requiring
+// complex<float>
+namespace std {
+
+
+inline complex<sycl::half>::operator complex<float>() const noexcept
+{
+    return std::complex<float>(static_cast<float>(real_),
+                               static_cast<float>(imag_));
+}
+
+
+template <typename T>
+inline complex<sycl::half>& complex<sycl::half>::operator*=(
+    const complex<T>& val)
+{
+    auto val_f = static_cast<std::complex<float>>(val);
+    auto result_f = static_cast<std::complex<float>>(*this);
+    result_f *= val_f;
+    real_ = result_f.real();
+    imag_ = result_f.imag();
+    return *this;
+}
+
+
+template <typename T>
+inline complex<sycl::half>& complex<sycl::half>::operator/=(
+    const complex<T>& val)
+{
+    auto val_f = static_cast<std::complex<float>>(val);
+    auto result_f = static_cast<std::complex<float>>(*this);
+    result_f /= val_f;
+    real_ = result_f.real();
+    imag_ = result_f.imag();
+    return *this;
+}
+
+
+}  // namespace std
+
+
+#endif  // GKO_DPCPP_BASE_COMPLEX_HPP_
diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp
index 0588f844d83..2d8e955487d 100644
--- a/dpcpp/base/math.hpp
+++ b/dpcpp/base/math.hpp
@@ -12,159 +12,9 @@
 
 #include <ginkgo/core/base/math.hpp>
 
+#include "dpcpp/base/complex.hpp"
 #include "dpcpp/base/dpct.hpp"
 
-
-namespace std {
-
-
-template <>
-class complex<sycl::half> {
-public:
-    using value_type = sycl::half;
-
-    complex(const value_type& real = value_type(0.f),
-            const value_type& imag = value_type(0.f))
-        : real_(real), imag_(imag)
-    {}
-
-    template <typename T, typename U,
-              typename = std::enable_if_t<std::is_scalar<T>::value &&
-                                          std::is_scalar<U>::value>>
-    explicit complex(const T& real, const U& imag)
-        : real_(static_cast<value_type>(real)),
-          imag_(static_cast<value_type>(imag))
-    {}
-
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
-    complex(const T& real)
-        : real_(static_cast<value_type>(real)),
-          imag_(static_cast<value_type>(0.f))
-    {}
-
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
-    complex(const complex<T>& other)
-        : real_(static_cast<value_type>(other.real())),
-          imag_(static_cast<value_type>(other.imag()))
-    {}
-
-    value_type real() const noexcept { return real_; }
-
-    value_type imag() const noexcept { return imag_; }
-
-    operator std::complex<float>() const noexcept
-    {
-        return std::complex<float>(static_cast<float>(real_),
-                                   static_cast<float>(imag_));
-    }
-
-    template <typename V>
-    complex& operator=(const V& val)
-    {
-        real_ = val;
-        imag_ = value_type();
-        return *this;
-    }
-
-    template <typename V>
-    complex& operator=(const std::complex<V>& val)
-    {
-        real_ = val.real();
-        imag_ = val.imag();
-        return *this;
-    }
-
-    complex& operator+=(const value_type& real)
-    {
-        real_ += real;
-        return *this;
-    }
-
-    complex& operator-=(const value_type& real)
-    {
-        real_ -= real;
-        return *this;
-    }
-
-    complex& operator*=(const value_type& real)
-    {
-        real_ *= real;
-        imag_ *= real;
-        return *this;
-    }
-
-    complex& operator/=(const value_type& real)
-    {
-        real_ /= real;
-        imag_ /= real;
-        return *this;
-    }
-
-    template <typename T>
-    complex& operator+=(const complex<T>& val)
-    {
-        real_ += val.real();
-        imag_ += val.imag();
-        return *this;
-    }
-
-    template <typename T>
-    complex& operator-=(const complex<T>& val)
-    {
-        real_ -= val.real();
-        imag_ -= val.imag();
-        return *this;
-    }
-
-    template <typename T>
-    complex& operator*=(const complex<T>& val)
-    {
-        auto val_f = static_cast<std::complex<float>>(val);
-        auto result_f = static_cast<std::complex<float>>(*this);
-        result_f *= val_f;
-        real_ = result_f.real();
-        imag_ = result_f.imag();
-        return *this;
-    }
-
-    template <typename T>
-    complex& operator/=(const complex<T>& val)
-    {
-        auto val_f = static_cast<std::complex<float>>(val);
-        auto result_f = static_cast<std::complex<float>>(*this);
-        result_f /= val_f;
-        real_ = result_f.real();
-        imag_ = result_f.imag();
-        return *this;
-    }
-
-// It's for MacOS.
-// TODO: check whether mac compiler always use complex version even when real
-// half
-#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                  \
-    friend complex<sycl::half> operator _op(const complex<sycl::half> lhf, \
-                                            const complex<sycl::half> rhf) \
-    {                                                                      \
-        auto a = lhf;                                                      \
-        a _opeq rhf;                                                       \
-        return a;                                                          \
-    }
-
-    COMPLEX_HALF_OPERATOR(+, +=)
-    COMPLEX_HALF_OPERATOR(-, -=)
-    COMPLEX_HALF_OPERATOR(*, *=)
-    COMPLEX_HALF_OPERATOR(/, /=)
-
-#undef COMPLEX_HALF_OPERATOR
-
-private:
-    value_type real_;
-    value_type imag_;
-};
-
-}  // namespace std
-
-
 namespace gko {
 namespace detail {
 
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index 664c74a5603..cf379be51b7 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -19,6 +19,7 @@
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/intrinsics.dp.hpp"
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index b377179183c..96090c40624 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -12,6 +12,7 @@
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/helper.hpp"
+#include "dpcpp/base/math.hpp"
 #include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index 04c21f97991..affc90aa05c 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -132,7 +132,7 @@ class BlockJacobi final {
             // reduction (it does not support half)
             // sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());
             for (int i = sg_size / 2; i > 0; i /= 2) {
-                sum += sg.shuffle_down(sum, i);
+                sum += sycl::shift_group_left(sg, sum, i);
             }
 
             if (sg_tid == 0) {
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index 23f614b3336..797e1173d88 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -13,6 +13,8 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/components/warp_blas.dp.hpp"
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index 84ea1a8a96b..120d73a804c 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -13,6 +13,8 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/math.hpp"
+#include "dpcpp/base/types.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/components/warp_blas.dp.hpp"
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index cf25dcd3c77..d9251e66165 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -51,6 +51,7 @@
 /* What is the major version of dpcpp compiler */
 // clang-format off
 #define GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@
+#define GINKGO_DPCPP_MINOR_VERSION @GINKGO_DPCPP_MINOR_VERSION@
 // clang-format on
 
 

From 75a6846e466ec1f13d6001a51b5b3d15fd2638d9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 01:49:30 +0100
Subject: [PATCH 419/448] replace CL/sycl.hpp by sycl/sycl.hpp

---
 benchmark/utils/dpcpp_timer.dp.cpp                            | 2 +-
 cmake/build_helpers.cmake                                     | 2 +-
 dpcpp/base/batch_multi_vector_kernels.dp.cpp                  | 2 +-
 dpcpp/base/batch_multi_vector_kernels.hpp                     | 2 +-
 dpcpp/base/dim3.dp.hpp                                        | 2 +-
 dpcpp/base/dpct.hpp                                           | 2 +-
 dpcpp/base/executor.dp.cpp                                    | 2 +-
 dpcpp/base/helper.dp.cpp                                      | 2 +-
 dpcpp/base/helper.hpp                                         | 2 +-
 dpcpp/base/kernel_launch.dp.hpp                               | 2 +-
 dpcpp/base/onemkl_bindings.hpp                                | 3 ++-
 dpcpp/base/timer.dp.cpp                                       | 2 +-
 dpcpp/components/atomic.dp.hpp                                | 2 +-
 dpcpp/components/diagonal_block_manipulation.dp.hpp           | 2 +-
 dpcpp/components/format_conversion.dp.hpp                     | 2 +-
 dpcpp/components/intrinsics.dp.hpp                            | 2 +-
 dpcpp/components/merging.dp.hpp                               | 2 +-
 dpcpp/components/prefix_sum.dp.hpp                            | 2 +-
 dpcpp/components/prefix_sum_kernels.dp.cpp                    | 2 +-
 dpcpp/components/reduction.dp.hpp                             | 2 +-
 dpcpp/components/searching.dp.hpp                             | 2 +-
 dpcpp/components/segment_scan.dp.hpp                          | 2 +-
 dpcpp/components/sorting.dp.hpp                               | 2 +-
 dpcpp/components/thread_ids.dp.hpp                            | 2 +-
 dpcpp/components/warp_blas.dp.hpp                             | 2 +-
 dpcpp/factorization/cholesky_kernels.dp.cpp                   | 2 +-
 dpcpp/factorization/factorization_helpers.dp.hpp              | 2 +-
 dpcpp/factorization/factorization_kernels.dp.cpp              | 2 +-
 dpcpp/factorization/par_ic_kernels.dp.cpp                     | 2 +-
 dpcpp/factorization/par_ict_kernels.dp.cpp                    | 2 +-
 dpcpp/factorization/par_ilu_kernels.dp.cpp                    | 2 +-
 dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp      | 2 +-
 dpcpp/factorization/par_ilut_filter_kernel.dp.cpp             | 2 +-
 dpcpp/factorization/par_ilut_kernels.dp.cpp                   | 2 +-
 dpcpp/factorization/par_ilut_select_common.dp.cpp             | 2 +-
 dpcpp/factorization/par_ilut_select_kernel.dp.cpp             | 2 +-
 dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp             | 2 +-
 dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp              | 2 +-
 dpcpp/matrix/batch_csr_kernels.dp.cpp                         | 2 +-
 dpcpp/matrix/batch_csr_kernels.hpp                            | 2 +-
 dpcpp/matrix/batch_dense_kernels.dp.cpp                       | 2 +-
 dpcpp/matrix/batch_dense_kernels.hpp                          | 2 +-
 dpcpp/matrix/batch_ell_kernels.dp.cpp                         | 2 +-
 dpcpp/matrix/batch_ell_kernels.hpp                            | 2 +-
 dpcpp/matrix/coo_kernels.dp.cpp                               | 2 +-
 dpcpp/matrix/csr_kernels.dp.cpp                               | 3 ++-
 dpcpp/matrix/dense_kernels.dp.cpp                             | 3 ++-
 dpcpp/matrix/diagonal_kernels.dp.cpp                          | 2 +-
 dpcpp/matrix/ell_kernels.dp.cpp                               | 2 +-
 dpcpp/matrix/sellp_kernels.dp.cpp                             | 2 +-
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp                      | 2 +-
 dpcpp/preconditioner/batch_block_jacobi.hpp                   | 2 +-
 dpcpp/preconditioner/batch_identity.hpp                       | 2 +-
 dpcpp/preconditioner/batch_jacobi_kernels.hpp                 | 2 +-
 dpcpp/preconditioner/batch_scalar_jacobi.hpp                  | 2 +-
 dpcpp/preconditioner/isai_kernels.dp.cpp                      | 2 +-
 .../jacobi_advanced_apply_instantiate.inc.dp.cpp              | 2 +-
 dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp   | 2 +-
 dpcpp/preconditioner/jacobi_kernels.dp.cpp                    | 2 +-
 .../preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp | 2 +-
 dpcpp/reorder/rcm_kernels.dp.cpp                              | 2 +-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp                    | 2 +-
 dpcpp/solver/batch_bicgstab_kernels.hpp                       | 2 +-
 dpcpp/solver/batch_bicgstab_launch.hpp                        | 2 +-
 dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp         | 2 +-
 dpcpp/solver/batch_cg_kernels.dp.cpp                          | 2 +-
 dpcpp/solver/batch_cg_kernels.hpp                             | 2 +-
 dpcpp/solver/batch_cg_launch.hpp                              | 2 +-
 dpcpp/solver/batch_cg_launch.instantiate.dp.cpp               | 2 +-
 dpcpp/solver/cb_gmres_kernels.dp.cpp                          | 2 +-
 dpcpp/solver/idr_kernels.dp.cpp                               | 2 +-
 dpcpp/solver/lower_trs_kernels.dp.cpp                         | 2 +-
 dpcpp/solver/upper_trs_kernels.dp.cpp                         | 2 +-
 dpcpp/stop/criterion_kernels.dp.cpp                           | 2 +-
 dpcpp/stop/residual_norm_kernels.dp.cpp                       | 2 +-
 dpcpp/test/base/dim3.dp.cpp                                   | 4 ++--
 dpcpp/test/base/executor.dp.cpp                               | 4 ++--
 dpcpp/test/components/cooperative_groups.dp.cpp               | 4 ++--
 dpcpp/test_dpcpp.dp.cpp                                       | 2 +-
 test/solver/idr_kernels.cpp                                   | 2 +-
 80 files changed, 86 insertions(+), 83 deletions(-)

diff --git a/benchmark/utils/dpcpp_timer.dp.cpp b/benchmark/utils/dpcpp_timer.dp.cpp
index c986f2d8fa0..6372d49324b 100644
--- a/benchmark/utils/dpcpp_timer.dp.cpp
+++ b/benchmark/utils/dpcpp_timer.dp.cpp
@@ -4,7 +4,7 @@
 
 #include <iostream>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "benchmark/utils/timer_impl.hpp"
 
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index 0985f089382..4436a5286c6 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -137,7 +137,7 @@ endfunction()
 
 # Extract the DPC++ version
 function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION MACRO_VAR)
-    set(DPCPP_VERSION_PROG "#include <CL/sycl.hpp>\n#include <iostream>\n"
+    set(DPCPP_VERSION_PROG "#include <sycl/sycl.hpp>\n#include <iostream>\n"
         "int main() {std::cout << ${MACRO_VAR} << '\\n'\;"
         "return 0\;}")
     file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp" ${DPCPP_VERSION_PROG})
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 7c49af8d9e8..3dff550bc22 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index 96ada23f42c..81fa63e51a1 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "dpcpp/base/batch_struct.hpp"
diff --git a/dpcpp/base/dim3.dp.hpp b/dpcpp/base/dim3.dp.hpp
index bff818e2373..6ff3fac143a 100644
--- a/dpcpp/base/dim3.dp.hpp
+++ b/dpcpp/base/dim3.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_BASE_DIM3_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 
 namespace gko {
diff --git a/dpcpp/base/dpct.hpp b/dpcpp/base/dpct.hpp
index 8e85033c911..18b689cbc21 100644
--- a/dpcpp/base/dpct.hpp
+++ b/dpcpp/base/dpct.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_BASE_DPCT_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 
 // This is partial extraction from dpct/dpct.hpp of Intel
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 5abb745a91d..06a2643f926 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -10,7 +10,7 @@
 #include <map>
 #include <string>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp
index f4ae9f0560d..1d2fb922ef8 100644
--- a/dpcpp/base/helper.dp.cpp
+++ b/dpcpp/base/helper.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "dpcpp/base/helper.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 
 namespace gko {
diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index b8cf1a8451c..c092a42d7b8 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -8,7 +8,7 @@
 
 #include <utility>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 7aa117692f7..3a30ec4edce 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -10,7 +10,7 @@
 
 #include <tuple>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 
 namespace gko {
diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp
index 004c296553c..90869a3402e 100644
--- a/dpcpp/base/onemkl_bindings.hpp
+++ b/dpcpp/base/onemkl_bindings.hpp
@@ -8,9 +8,10 @@
 
 #include <type_traits>
 
-#include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
+#include <sycl/sycl.hpp>
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
diff --git a/dpcpp/base/timer.dp.cpp b/dpcpp/base/timer.dp.cpp
index ed21e1b79a5..cf95732a488 100644
--- a/dpcpp/base/timer.dp.cpp
+++ b/dpcpp/base/timer.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "ginkgo/core/base/timer.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 2066deeef64..86b37434417 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/dpct.hpp"
 
diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp
index 626a225c4fa..c24d6bef463 100644
--- a/dpcpp/components/diagonal_block_manipulation.dp.hpp
+++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/format_conversion.dp.hpp b/dpcpp/components/format_conversion.dp.hpp
index 17cf55389df..7724830ad19 100644
--- a/dpcpp/components/format_conversion.dp.hpp
+++ b/dpcpp/components/format_conversion.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
index 369a3dff8b9..8c426fea8ef 100644
--- a/dpcpp/components/intrinsics.dp.hpp
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
index 8d2f96e70bf..60b9412f6ff 100644
--- a/dpcpp/components/merging.dp.hpp
+++ b/dpcpp/components/merging.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/utils.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
index b1ae9da32bb..9671ce90b15 100644
--- a/dpcpp/components/prefix_sum.dp.hpp
+++ b/dpcpp/components/prefix_sum.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp
index a47f45e9565..70f78555d4f 100644
--- a/dpcpp/components/prefix_sum_kernels.dp.cpp
+++ b/dpcpp/components/prefix_sum_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/types.hpp>
 
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 933f6db7817..26e797fe0e2 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -8,7 +8,7 @@
 
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index b4cbd1bb726..b8350aa4f8b 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/segment_scan.dp.hpp b/dpcpp/components/segment_scan.dp.hpp
index b6c26523f30..bcc4766cd0a 100644
--- a/dpcpp/components/segment_scan.dp.hpp
+++ b/dpcpp/components/segment_scan.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_COMPONENTS_SEGMENT_SCAN_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/sorting.dp.hpp b/dpcpp/components/sorting.dp.hpp
index e616903721c..111e0cad910 100644
--- a/dpcpp/components/sorting.dp.hpp
+++ b/dpcpp/components/sorting.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_COMPONENTS_SORTING_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 09f7b24c6ee..8b379722ea8 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -6,7 +6,7 @@
 #define GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
 
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dpct.hpp"
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
index dabc812930f..bdf11b8ba30 100644
--- a/dpcpp/components/warp_blas.dp.hpp
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -9,7 +9,7 @@
 #include <cassert>
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/config.hpp>
 
diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp
index cde772e756b..dfef9becf19 100644
--- a/dpcpp/factorization/cholesky_kernels.dp.cpp
+++ b/dpcpp/factorization/cholesky_kernels.dp.cpp
@@ -7,7 +7,7 @@
 #include <algorithm>
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/matrix/csr.hpp>
 
diff --git a/dpcpp/factorization/factorization_helpers.dp.hpp b/dpcpp/factorization/factorization_helpers.dp.hpp
index 9779e134e77..d7793f9971d 100644
--- a/dpcpp/factorization/factorization_helpers.dp.hpp
+++ b/dpcpp/factorization/factorization_helpers.dp.hpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/factorization/factorization_helpers.hpp"
 #include "dpcpp/base/config.hpp"
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 66292c00643..75f7250f569 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/factorization/factorization_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 36768774821..7c60faabc38 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/factorization/par_ic_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index 13cc1b8974c..8e505053aaf 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp
index 6da7b142fe7..b8df4266672 100644
--- a/dpcpp/factorization/par_ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/factorization/par_ilu_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/matrix/coo.hpp>
 
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index c93e9574f81..417a2d6f3be 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -5,7 +5,7 @@
 #include <algorithm>
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 4eb018d8d31..9e6d8909227 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/factorization/par_ilut_kernels.dp.cpp b/dpcpp/factorization/par_ilut_kernels.dp.cpp
index f682641fc35..9e9b951dd42 100644
--- a/dpcpp/factorization/par_ilut_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilut_kernels.dp.cpp
@@ -9,7 +9,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
diff --git a/dpcpp/factorization/par_ilut_select_common.dp.cpp b/dpcpp/factorization/par_ilut_select_common.dp.cpp
index a9f51233725..f20ae4e280b 100644
--- a/dpcpp/factorization/par_ilut_select_common.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_common.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index 0856a245bc3..f545063ff26 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -5,7 +5,7 @@
 #include <algorithm>
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index cf379be51b7..9add72baff8 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -4,7 +4,7 @@
 
 #include <limits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index d3812094916..7b18458532b 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 1ab3a0494f9..736025075fd 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
diff --git a/dpcpp/matrix/batch_csr_kernels.hpp b/dpcpp/matrix/batch_csr_kernels.hpp
index 37dc5a2c52c..d55de4de90b 100644
--- a/dpcpp/matrix/batch_csr_kernels.hpp
+++ b/dpcpp/matrix/batch_csr_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index 8f3aa8fd24e..bb8272a457c 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp b/dpcpp/matrix/batch_dense_kernels.hpp
index a8f741bc3d0..61e9c32c9c0 100644
--- a/dpcpp/matrix/batch_dense_kernels.hpp
+++ b/dpcpp/matrix/batch_dense_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 809e485311a..f598d273205 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp b/dpcpp/matrix/batch_ell_kernels.hpp
index fb6bd3d8121..b7cc98443e5 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp
+++ b/dpcpp/matrix/batch_ell_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index f22606aea80..636c6fccda4 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/matrix/coo_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 3cf9661902a..f4fb0bc1ec6 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -6,9 +6,10 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
+#include <sycl/sycl.hpp>
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index d75d0defcbe..a36d30cf8e7 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -4,9 +4,10 @@
 
 #include "core/matrix/dense_kernels.hpp"
 
-#include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
 
+#include <sycl/sycl.hpp>
+
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index 96090c40624..7bc42413330 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/matrix/diagonal_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 4a877b788c7..4f6453d4767 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <array>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp
index e83e8f2ce1a..c8b80e77ea9 100644
--- a/dpcpp/matrix/sellp_kernels.dp.cpp
+++ b/dpcpp/matrix/sellp_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/matrix/sellp_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 2686529bd2f..10744ac3b59 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index affc90aa05c..89a03907f6f 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/preconditioner/batch_identity.hpp b/dpcpp/preconditioner/batch_identity.hpp
index 5d6a1cfcb65..3d787602ccb 100644
--- a/dpcpp/preconditioner/batch_identity.hpp
+++ b/dpcpp/preconditioner/batch_identity.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
index 8ac8718c3af..a95576ed0c4 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
index e48188c32c2..e9863d3442c 100644
--- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index f700536f379..47ff2938c6c 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/preconditioner/isai_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index 797e1173d88..01a244f34af 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index 9bea9f71a72..4fe0d9c5031 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
index 63449ba5b4b..3fa743e2cc8 100644
--- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/preconditioner/jacobi_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index 120d73a804c..5bf5f06cf29 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp
index b1cd6fc1319..760679afdf8 100644
--- a/dpcpp/reorder/rcm_kernels.dp.cpp
+++ b/dpcpp/reorder/rcm_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/reorder/rcm_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index ed7ad3fafd5..578446c1cc9 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp b/dpcpp/solver/batch_bicgstab_kernels.hpp
index c670725503e..f50fe82ed2f 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp
index 47ed6e83d27..ebcda13f06e 100644
--- a/dpcpp/solver/batch_bicgstab_launch.hpp
+++ b/dpcpp/solver/batch_bicgstab_launch.hpp
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/solver/batch_bicgstab.hpp>
 
diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
index 69afac52b70..65a3082dcdf 100644
--- a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "dpcpp/solver/batch_bicgstab_launch.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 7f173a1dfd0..ae1018b9e80 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/solver/batch_cg_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/solver/batch_cg.hpp>
 
diff --git a/dpcpp/solver/batch_cg_kernels.hpp b/dpcpp/solver/batch_cg_kernels.hpp
index 1619e64aa2f..bd7105b554d 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp
+++ b/dpcpp/solver/batch_cg_kernels.hpp
@@ -8,7 +8,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp
index 329671cb1c1..5c12db659bf 100644
--- a/dpcpp/solver/batch_cg_launch.hpp
+++ b/dpcpp/solver/batch_cg_launch.hpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/solver/batch_cg.hpp>
 
diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
index 12f2983e846..250d65c933c 100644
--- a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
+++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "dpcpp/solver/batch_cg_launch.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index 226d1a41c90..e5bd911390b 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <algorithm>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range.hpp>
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index f0b589739bb..bbc81fee963 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -10,7 +10,7 @@
 #include <random>
 #include <type_traits>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp
index 62cfe93a59d..f38b74ae240 100644
--- a/dpcpp/solver/lower_trs_kernels.dp.cpp
+++ b/dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp
index 49e0a931e74..fe5381bc12b 100644
--- a/dpcpp/solver/upper_trs_kernels.dp.cpp
+++ b/dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -6,7 +6,7 @@
 
 #include <memory>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/dpcpp/stop/criterion_kernels.dp.cpp b/dpcpp/stop/criterion_kernels.dp.cpp
index 2970263f6ae..baa1742e4ba 100644
--- a/dpcpp/stop/criterion_kernels.dp.cpp
+++ b/dpcpp/stop/criterion_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/stop/criterion_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index ff251cdc943..8f055f693a9 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -4,7 +4,7 @@
 
 #include "core/stop/residual_norm_kernels.hpp"
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
diff --git a/dpcpp/test/base/dim3.dp.cpp b/dpcpp/test/base/dim3.dp.cpp
index cf0e5d1da30..c55d27a7125 100644
--- a/dpcpp/test/base/dim3.dp.cpp
+++ b/dpcpp/test/base/dim3.dp.cpp
@@ -4,10 +4,10 @@
 
 #include "dpcpp/base/dim3.dp.hpp"
 
-#include <CL/sycl.hpp>
-
 #include <gtest/gtest.h>
 
+#include <sycl/sycl.hpp>
+
 
 namespace {
 
diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp
index 83a29a3b6db..077e549d683 100644
--- a/dpcpp/test/base/executor.dp.cpp
+++ b/dpcpp/test/base/executor.dp.cpp
@@ -6,10 +6,10 @@
 #include <memory>
 #include <type_traits>
 
-#include <CL/sycl.hpp>
-
 #include <gtest/gtest.h>
 
+#include <sycl/sycl.hpp>
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp
index eadd99a6ac5..18e62cf4d33 100644
--- a/dpcpp/test/components/cooperative_groups.dp.cpp
+++ b/dpcpp/test/components/cooperative_groups.dp.cpp
@@ -7,10 +7,10 @@
 #include <iostream>
 #include <memory>
 
-#include <CL/sycl.hpp>
-
 #include <gtest/gtest.h>
 
+#include <sycl/sycl.hpp>
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/types.hpp>
diff --git a/dpcpp/test_dpcpp.dp.cpp b/dpcpp/test_dpcpp.dp.cpp
index e4d47745da9..0035d8e7e73 100644
--- a/dpcpp/test_dpcpp.dp.cpp
+++ b/dpcpp/test_dpcpp.dp.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 
 int main()
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index a9857952615..ed32a5494c8 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -11,7 +11,7 @@
 
 
 #ifdef GKO_COMPILING_DPCPP
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #endif
 
 

From 66e440811c40e334506188dacfa18984d4552b32 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 25 Nov 2024 19:23:23 +0100
Subject: [PATCH 420/448] SYCL does not support 16-bit atomics: throw an error
 or fall back to a working version

---
 dpcpp/components/atomic.dp.hpp  | 27 +--------
 dpcpp/matrix/coo_kernels.dp.cpp | 98 ++++++++++++++++++---------------
 dpcpp/matrix/csr_kernels.dp.cpp | 92 +++++++++++++++++--------------
 dpcpp/matrix/ell_kernels.dp.cpp | 78 +++++++++++++++-----------
 dpcpp/solver/idr_kernels.dp.cpp | 47 +++++++---------
 5 files changed, 172 insertions(+), 170 deletions(-)

diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 86b37434417..8258c6924e1 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -169,8 +169,6 @@ __dpct_inline__ ResultType reinterpret(ValueType val)
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 // Support 32-bit ATOMIC_ADD
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
-// Support 16-bit ATOMIC_ADD
-// GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short);
 
 
 #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
@@ -239,8 +237,6 @@ struct atomic_helper<
 GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned long long int);
 // Support 32-bit ATOMIC_MAX
 GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int);
-// Support 16-bit ATOMIC_MAX
-// GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned short);
 
 
 #undef GKO_BIND_ATOMIC_MAX_STRUCTURE
@@ -270,15 +266,7 @@ template <sycl::access::address_space addressSpace = atomic::global_space,
           typename T>
 __dpct_inline__ T atomic_add(T* __restrict__ addr, T val)
 {
-    if constexpr (std::is_same_v<T, sycl::half> ||
-                  std::is_same_v<T, std::complex<sycl::half>>) {
-        // unsupported
-        auto old = *addr;
-        *addr += val;
-        return old;
-    } else {
-        return detail::atomic_helper<addressSpace, T>::atomic_add(addr, val);
-    }
+    return detail::atomic_helper<addressSpace, T>::atomic_add(addr, val);
 }
 
 
@@ -286,18 +274,7 @@ template <sycl::access::address_space addressSpace = atomic::global_space,
           typename T>
 __dpct_inline__ T atomic_max(T* __restrict__ addr, T val)
 {
-    if constexpr (std::is_same_v<T, sycl::half> ||
-                  std::is_same_v<T, std::complex<sycl::half>>) {
-        // unsupported
-        auto old = *addr;
-        if (val > *addr) {
-            *addr = val;
-        }
-        return old;
-    } else {
-        return detail::atomic_max_helper<addressSpace, T>::atomic_max(addr,
-                                                                      val);
-    }
+    return detail::atomic_max_helper<addressSpace, T>::atomic_max(addr, val);
 }
 
 
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index 636c6fccda4..3882a714104 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -291,27 +291,32 @@ void spmv2(std::shared_ptr<const DpcppExecutor> exec,
     const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
 
-    if (nwarps > 0) {
-        if (b_ncols < 4) {
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
-            abstract_spmv(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_lines, as_device_type(a->get_const_values()),
-                          a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          as_device_type(b->get_const_values()),
-                          b->get_stride(), as_device_type(c->get_values()),
-                          c->get_stride());
-        } else {
-            int num_elems =
-                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, config::warp_size));
-            abstract_spmm(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_elems, as_device_type(a->get_const_values()),
-                          a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          b_ncols, as_device_type(b->get_const_values()),
-                          b->get_stride(), as_device_type(c->get_values()),
-                          c->get_stride());
+    // not support 16 bit atomic
+    if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else {
+        if (nwarps > 0) {
+            if (b_ncols < 4) {
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
+                int num_lines = ceildiv(nnz, nwarps * config::warp_size);
+                abstract_spmv(coo_grid, coo_block, 0, exec->get_queue(), nnz,
+                              num_lines, as_device_type(a->get_const_values()),
+                              a->get_const_col_idxs(), a->get_const_row_idxs(),
+                              as_device_type(b->get_const_values()),
+                              b->get_stride(), as_device_type(c->get_values()),
+                              c->get_stride());
+            } else {
+                int num_elems = ceildiv(nnz, nwarps * config::warp_size) *
+                                config::warp_size;
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
+                                    ceildiv(b_ncols, config::warp_size));
+                abstract_spmm(coo_grid, coo_block, 0, exec->get_queue(), nnz,
+                              num_elems, as_device_type(a->get_const_values()),
+                              a->get_const_col_idxs(), a->get_const_row_idxs(),
+                              b_ncols, as_device_type(b->get_const_values()),
+                              b->get_stride(), as_device_type(c->get_values()),
+                              c->get_stride());
+            }
         }
     }
 }
@@ -332,29 +337,34 @@ void advanced_spmv2(std::shared_ptr<const DpcppExecutor> exec,
     const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto b_ncols = b->get_size()[1];
 
-    if (nwarps > 0) {
-        if (b_ncols < 4) {
-            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-            abstract_spmv(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_lines, as_device_type(alpha->get_const_values()),
-                          as_device_type(a->get_const_values()),
-                          a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          as_device_type(b->get_const_values()),
-                          b->get_stride(), as_device_type(c->get_values()),
-                          c->get_stride());
-        } else {
-            int num_elems =
-                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
-            const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, config::warp_size));
-            abstract_spmm(coo_grid, coo_block, 0, exec->get_queue(), nnz,
-                          num_elems, as_device_type(alpha->get_const_values()),
-                          as_device_type(a->get_const_values()),
-                          a->get_const_col_idxs(), a->get_const_row_idxs(),
-                          b_ncols, as_device_type(b->get_const_values()),
-                          b->get_stride(), as_device_type(c->get_values()),
-                          c->get_stride());
+    // not support 16 bit atomic
+    if constexpr (std::is_same_v<remove_complex<ValueType>, gko::half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else {
+        if (nwarps > 0) {
+            if (b_ncols < 4) {
+                int num_lines = ceildiv(nnz, nwarps * config::warp_size);
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
+                abstract_spmv(
+                    coo_grid, coo_block, 0, exec->get_queue(), nnz, num_lines,
+                    as_device_type(alpha->get_const_values()),
+                    as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(), a->get_const_row_idxs(),
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            } else {
+                int num_elems = ceildiv(nnz, nwarps * config::warp_size) *
+                                config::warp_size;
+                const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
+                                    ceildiv(b_ncols, config::warp_size));
+                abstract_spmm(
+                    coo_grid, coo_block, 0, exec->get_queue(), nnz, num_elems,
+                    as_device_type(alpha->get_const_values()),
+                    as_device_type(a->get_const_values()),
+                    a->get_const_col_idxs(), a->get_const_row_idxs(), b_ncols,
+                    as_device_type(b->get_const_values()), b->get_stride(),
+                    as_device_type(c->get_values()), c->get_stride());
+            }
         }
     }
 }
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index f4fb0bc1ec6..d54df253a9b 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1353,7 +1353,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const DpcppExecutor> exec,
+bool load_balance_spmv(std::shared_ptr<const DpcppExecutor> exec,
                        const matrix::Csr<MatrixValueType, IndexType>* a,
                        const matrix::Dense<InputValueType>* b,
                        matrix::Dense<OutputValueType>* c,
@@ -1363,40 +1363,49 @@ void load_balance_spmv(std::shared_ptr<const DpcppExecutor> exec,
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
 
-    if (beta) {
-        dense::scale(exec, beta, c);
+    // not support 16 bit atomic
+    if constexpr (std::is_same_v<remove_complex<OutputValueType>, half>) {
+        return false;
     } else {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    const IndexType nwarps = a->get_num_srow_elements();
-    if (nwarps > 0) {
-        const dim3 csr_block(config::warp_size, warps_in_block, 1);
-        const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]);
-        const auto a_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
-        const auto b_vals =
-            acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
-        auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
-        if (alpha) {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                csr::kernel::abstract_spmv(
-                    csr_grid, csr_block, 0, exec->get_queue(), nwarps,
-                    static_cast<IndexType>(a->get_size()[0]),
-                    as_device_type(alpha->get_const_values()),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    a->get_const_row_ptrs(), a->get_const_srow(),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
-            }
+        if (beta) {
+            dense::scale(exec, beta, c);
         } else {
-            if (csr_grid.x > 0 && csr_grid.y > 0) {
-                csr::kernel::abstract_spmv(
-                    csr_grid, csr_block, 0, exec->get_queue(), nwarps,
-                    static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
-                    a->get_const_row_ptrs(), a->get_const_srow(),
-                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
+            dense::fill(exec, c, zero<OutputValueType>());
+        }
+        const IndexType nwarps = a->get_num_srow_elements();
+        if (nwarps > 0) {
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            const auto a_vals =
+                acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
+            const auto b_vals =
+                acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
+            auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
+            if (alpha) {
+                if (csr_grid.x > 0 && csr_grid.y > 0) {
+                    csr::kernel::abstract_spmv(
+                        csr_grid, csr_block, 0, exec->get_queue(), nwarps,
+                        static_cast<IndexType>(a->get_size()[0]),
+                        as_device_type(alpha->get_const_values()),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        a->get_const_row_ptrs(), a->get_const_srow(),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals));
+                }
+            } else {
+                if (csr_grid.x > 0 && csr_grid.y > 0) {
+                    csr::kernel::abstract_spmv(
+                        csr_grid, csr_block, 0, exec->get_queue(), nwarps,
+                        static_cast<IndexType>(a->get_size()[0]),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                        a->get_const_row_ptrs(), a->get_const_srow(),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals));
+                }
             }
         }
+        return true;
     }
 }
 
@@ -1502,9 +1511,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
         dense::fill(exec, c, zero<OutputValueType>());
         return;
     }
-    if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
+    if (a->get_strategy()->get_name() == "merge_path") {
         using arithmetic_type =
             highest_precision<InputValueType, OutputValueType, MatrixValueType>;
         int items_per_thread =
@@ -1518,8 +1525,10 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
             syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
     } else {
         bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
+        if (a->get_strategy()->get_name() == "load_balance") {
+            use_classical = !host_kernel::load_balance_spmv(exec, a, b, c);
+        } else if (a->get_strategy()->get_name() == "sparselib" ||
+                   a->get_strategy()->get_name() == "cusparse") {
             use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c);
         }
         if (use_classical) {
@@ -1571,9 +1580,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         dense::scale(exec, beta, c);
         return;
     }
-    if (a->get_strategy()->get_name() == "load_balance") {
-        host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
-    } else if (a->get_strategy()->get_name() == "merge_path") {
+    if (a->get_strategy()->get_name() == "merge_path") {
         using arithmetic_type =
             highest_precision<InputValueType, OutputValueType, MatrixValueType>;
         int items_per_thread =
@@ -1588,8 +1595,11 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
             beta);
     } else {
         bool use_classical = true;
-        if (a->get_strategy()->get_name() == "sparselib" ||
-            a->get_strategy()->get_name() == "cusparse") {
+        if (a->get_strategy()->get_name() == "load_balance") {
+            use_classical =
+                !host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta);
+        } else if (a->get_strategy()->get_name() == "sparselib" ||
+                   a->get_strategy()->get_name() == "cusparse") {
             use_classical =
                 !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta);
         }
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 4f6453d4767..cd3cbdd190e 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -97,7 +97,7 @@ void spmv_kernel(
     using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto tidx = thread::get_thread_id_flat(item_ct1);
     const decltype(tidx) column_id = item_ct1.get_group(1);
-    if (num_thread_per_worker == 1) {
+    if constexpr (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
@@ -146,7 +146,7 @@ void spmv_kernel(
         item_ct1.barrier(sycl::access::fence_space::local_space);
         if (runnable && idx_in_worker == 0) {
             const auto c_ind = x * c_stride + column_id;
-            if (atomic) {
+            if constexpr (atomic) {
                 atomic_add(&(c[c_ind]),
                            op(storage[item_ct1.get_local_id(2)], c[c_ind]));
             } else {
@@ -219,7 +219,7 @@ void spmv(
     using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto alpha_val = alpha(0);
     const OutputValueType beta_val = beta[0];
-    if (atomic) {
+    if constexpr (atomic) {
         // Because the atomic operation changes the values of c during
         // computation, it can not directly do alpha * a * b + beta * c
         // operation. The beta * c needs to be done before calling this kernel.
@@ -311,37 +311,49 @@ void abstract_spmv(syn::value_list<int, info>,
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
 
-    const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
-            num_stored_elements_per_row * stride)}},
-        a->get_const_values());
-    const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-
-    if (alpha == nullptr && beta == nullptr) {
-        kernel::spmv<num_thread_per_worker, atomic>(
-            grid_size, block_size, 0, exec->get_queue(), nrows,
-            num_worker_per_row, acc::as_device_range(a_vals),
-            a->get_const_col_idxs(), stride, num_stored_elements_per_row,
-            acc::as_device_range(b_vals), as_device_type(c->get_values()),
-            c->get_stride());
-    } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = gko::acc::range<a_accessor>(
-            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
-        kernel::spmv<num_thread_per_worker, atomic>(
-            grid_size, block_size, 0, exec->get_queue(), nrows,
-            num_worker_per_row, acc::as_device_range(alpha_val),
-            acc::as_device_range(a_vals), a->get_const_col_idxs(), stride,
-            num_stored_elements_per_row, acc::as_device_range(b_vals),
-            as_device_type(beta->get_const_values()),
-            as_device_type(c->get_values()), c->get_stride());
-    } else {
+    // not support 16 bit atomic
+    // We do atomic on shared memory when num_thread_per_worker is not 1.
+    // If atomic is also true, we also do atomic on out_vector.
+    constexpr bool shared_half =
+        std::is_same_v<remove_complex<arithmetic_type>, half>;
+    constexpr bool atomic_half_out =
+        atomic && std::is_same_v<remove_complex<OutputValueType>, half>;
+    if constexpr (num_thread_per_worker != 1 &&
+                  (shared_half || atomic_half_out)) {
         GKO_KERNEL_NOT_FOUND;
+    } else {
+        const auto a_vals = gko::acc::range<a_accessor>(
+            std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
+                num_stored_elements_per_row * stride)}},
+            a->get_const_values());
+        const auto b_vals = gko::acc::range<b_accessor>(
+            std::array<acc::size_type, 2>{
+                {static_cast<acc::size_type>(b->get_size()[0]),
+                 static_cast<acc::size_type>(b->get_size()[1])}},
+            b->get_const_values(),
+            std::array<acc::size_type, 1>{
+                {static_cast<acc::size_type>(b->get_stride())}});
+
+        if (alpha == nullptr && beta == nullptr) {
+            kernel::spmv<num_thread_per_worker, atomic>(
+                grid_size, block_size, 0, exec->get_queue(), nrows,
+                num_worker_per_row, acc::as_device_range(a_vals),
+                a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+                acc::as_device_range(b_vals), as_device_type(c->get_values()),
+                c->get_stride());
+        } else if (alpha != nullptr && beta != nullptr) {
+            const auto alpha_val = gko::acc::range<a_accessor>(
+                std::array<acc::size_type, 1>{1}, alpha->get_const_values());
+            kernel::spmv<num_thread_per_worker, atomic>(
+                grid_size, block_size, 0, exec->get_queue(), nrows,
+                num_worker_per_row, acc::as_device_range(alpha_val),
+                acc::as_device_range(a_vals), a->get_const_col_idxs(), stride,
+                num_stored_elements_per_row, acc::as_device_range(b_vals),
+                as_device_type(beta->get_const_values()),
+                as_device_type(c->get_values()), c->get_stride());
+        } else {
+            GKO_KERNEL_NOT_FOUND;
+        }
     }
 }
 
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index bbc81fee963..b34b7123d45 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -673,21 +673,18 @@ void update_g_and_u(std::shared_ptr<const DpcppExecutor> exec,
 
     for (size_type i = 0; i < k; i++) {
         const auto p_i = as_device_type(p->get_const_values()) + i * p_stride;
-        auto gko_impl = [&]() {
-            components::fill_array(exec, alpha->get_values(), nrhs,
-                                   zero<ValueType>());
-            multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
-                            nrhs, p_i, as_device_type(g_k->get_values()),
-                            g_k->get_stride(),
-                            as_device_type(alpha->get_values()),
-                            stop_status->get_const_data());
-        };
-        if constexpr (std::is_same_v<ValueType, half> ||
-                      is_complex<ValueType>()) {
-            gko_impl();
+        // not support 16 bit atomic
+        if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+            GKO_NOT_SUPPORTED(alpha);
         } else {
-            if (nrhs > 1) {
-                gko_impl();
+            if (nrhs > 1 || is_complex<ValueType>()) {
+                components::fill_array(exec, alpha->get_values(), nrhs,
+                                       zero<ValueType>());
+                multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
+                                nrhs, p_i, as_device_type(g_k->get_values()),
+                                g_k->get_stride(),
+                                as_device_type(alpha->get_values()),
+                                stop_status->get_const_data());
             } else {
                 onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_values(),
                             g_k->get_stride(),
@@ -731,20 +728,16 @@ void update_m(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     for (size_type i = k; i < subspace_dim; i++) {
         const auto p_i = p->get_const_values() + i * p_stride;
         auto m_i = m->get_values() + i * m_stride + k * nrhs;
-        auto gko_impl = [&]() {
-            components::fill_array(exec, m_i, nrhs, zero<ValueType>());
-            multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
-                            nrhs, as_device_type(p_i),
-                            as_device_type(g_k->get_const_values()),
-                            g_k->get_stride(), as_device_type(m_i),
-                            stop_status->get_const_data());
-        };
-        if constexpr (std::is_same_v<ValueType, half> ||
-                      is_complex<ValueType>()) {
-            gko_impl();
+        if constexpr (std::is_same_v<remove_complex<ValueType>, half>) {
+            GKO_NOT_SUPPORTED(m_i);
         } else {
-            if (nrhs > 1) {
-                gko_impl();
+            if (nrhs > 1 || is_complex<ValueType>()) {
+                components::fill_array(exec, m_i, nrhs, zero<ValueType>());
+                multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size,
+                                nrhs, as_device_type(p_i),
+                                as_device_type(g_k->get_const_values()),
+                                g_k->get_stride(), as_device_type(m_i),
+                                stop_status->get_const_data());
             } else {
                 onemkl::dot(*exec->get_queue(), size, as_device_type(p_i), 1,
                             g_k->get_const_values(), g_k->get_stride(), m_i);

From 09469342bab28f69311b1a7e265fe869a0d49caa Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 26 Nov 2024 09:24:05 +0100
Subject: [PATCH 421/448] only provide the custom operation for complex<half>

---
 dpcpp/preconditioner/batch_block_jacobi.hpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index 89a03907f6f..ec3821fce58 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -129,10 +129,14 @@ class BlockJacobi final {
                 sum += block_val * r[dense_block_col + idx_start];
             }
 
-            // reduction (it does not support half)
-            // sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());
-            for (int i = sg_size / 2; i > 0; i /= 2) {
-                sum += sycl::shift_group_left(sg, sum, i);
+            // reduction (it does not support complex<half>)
+            if constexpr (std::is_same_v<value_type,
+                                         std::complex<sycl::half>>) {
+                for (int i = sg_size / 2; i > 0; i /= 2) {
+                    sum += sycl::shift_group_left(sg, sum, i);
+                }
+            } else {
+                sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());
             }
 
             if (sg_tid == 0) {

From 39b904ccfd4030f3cdb0070576683365310cf270 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 29 Nov 2024 00:43:25 +0100
Subject: [PATCH 422/448] move to gko::complex<sycl::half> alone

---
 accessor/sycl_helper.hpp                    |  10 ++
 cmake/sycl.cmake                            |   4 -
 dpcpp/CMakeLists.txt                        |   5 -
 dpcpp/base/complex.hpp                      | 177 +++++++++-----------
 dpcpp/base/math.hpp                         |  45 ++++-
 dpcpp/base/types.hpp                        |   7 +
 dpcpp/preconditioner/batch_block_jacobi.hpp |   2 +-
 7 files changed, 139 insertions(+), 111 deletions(-)

diff --git a/accessor/sycl_helper.hpp b/accessor/sycl_helper.hpp
index 0de68a25c97..a24287f32bf 100644
--- a/accessor/sycl_helper.hpp
+++ b/accessor/sycl_helper.hpp
@@ -33,6 +33,10 @@ namespace gko {
 class half;
 
 
+template <typename V>
+class complex;
+
+
 namespace acc {
 namespace detail {
 
@@ -81,6 +85,12 @@ struct sycl_type<std::complex<T>> {
 };
 
 
+template <>
+struct sycl_type<std::complex<gko::half>> {
+    using type = gko::complex<typename sycl_type<gko::half>::type>;
+};
+
+
 }  // namespace detail
 
 
diff --git a/cmake/sycl.cmake b/cmake/sycl.cmake
index 5289ee253e7..a8acb0cea6c 100644
--- a/cmake/sycl.cmake
+++ b/cmake/sycl.cmake
@@ -22,10 +22,6 @@ function(gko_add_sycl_to_target)
         "${one_value_args}"
         "${multi_value_args}"
         ${ARGN})
-    # trick for complex header chain
-    if("${GINKGO_DPCPP_MAJOR_VERSION}.${GINKGO_DPCPP_MINOR_VERSION}" VERSION_GREATER_EQUAL 7.1)
-        target_include_directories(${SYCL_TARGET} PRIVATE "${PROJECT_BINARY_DIR}/dpcpp/base")
-    endif()
     if(COMMAND add_sycl_to_target)
         add_sycl_to_target(${ARGN})
         return()
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 8da2620ed41..81a2a6034ea 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -4,11 +4,6 @@ find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE)
 set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE)
 
-# trick for complex header chain
-if("${GINKGO_DPCPP_MAJOR_VERSION}.${GINKGO_DPCPP_MINOR_VERSION}" VERSION_GREATER_EQUAL 7.1)
-    configure_file(base/complex.hpp ${CMAKE_CURRENT_BINARY_DIR}/base/complex)
-endif()
-
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE)
 add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.dp.cpp BATCH_BICGSTAB_INSTANTIATE)
diff --git a/dpcpp/base/complex.hpp b/dpcpp/base/complex.hpp
index 56ce347afdd..143227c7b79 100644
--- a/dpcpp/base/complex.hpp
+++ b/dpcpp/base/complex.hpp
@@ -5,29 +5,19 @@
 #ifndef GKO_DPCPP_BASE_COMPLEX_HPP_
 #define GKO_DPCPP_BASE_COMPLEX_HPP_
 
+#include <complex>
+
 #include <sycl/half_type.hpp>
 
 #include <ginkgo/config.hpp>
 
-// this file is to workaround for the intel sycl complex different loading.
-// intel sycl provides complex and the corresponding searching path. When users
-// load complex with -fsycl, the compiler will load intel's <complex> header
-// first and then load usual <complex> header. However, it implicitly
-// instantiates and uses std::complex<sycl::half>, so we need to provide the
-// implementation before that. In ginkgo, we will definitely load <complex> in
-// the public interface, which is before sycl backend, so we have no normal way
-// to provide the std::complex<sycl::half> implementation in sycl.
-// We apply the same trick to load this file first and then load their header
-// later. We will also configure this file as <complex> and provide the search
-// path in sycl module.
-// They start to do this from LIBSYCL 7.1.0.
-
-namespace std {
+
+namespace gko {
 
 template <typename>
 class complex;
 
-// implement std::complex<sycl::half> before knowing std::complex<float>
+
 template <>
 class complex<sycl::half> {
 public:
@@ -53,7 +43,7 @@ class complex<sycl::half> {
     {}
 
     template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
-    complex(const complex<T>& other)
+    complex(const std::complex<T>& other)
         : real_(static_cast<value_type>(other.real())),
           imag_(static_cast<value_type>(other.imag()))
     {}
@@ -62,7 +52,18 @@ class complex<sycl::half> {
 
     value_type imag() const noexcept { return imag_; }
 
-    inline operator std::complex<float>() const noexcept;
+    operator std::complex<float>() const noexcept
+    {
+        return std::complex<float>(static_cast<float>(real_),
+                                   static_cast<float>(imag_));
+    }
+
+    bool operator!=(const complex& r) const { return !this->operator==(r); }
+
+    bool operator==(const complex& r) const
+    {
+        return real_ == r.real() && imag_ == r.imag();
+    }
 
     template <typename V>
     complex& operator=(const V& val)
@@ -107,7 +108,7 @@ class complex<sycl::half> {
     }
 
     template <typename T>
-    complex& operator+=(const complex<T>& val)
+    complex& operator+=(const std::complex<T>& val)
     {
         real_ += val.real();
         imag_ += val.imag();
@@ -115,7 +116,7 @@ class complex<sycl::half> {
     }
 
     template <typename T>
-    complex& operator-=(const complex<T>& val)
+    complex& operator-=(const std::complex<T>& val)
     {
         real_ -= val.real();
         imag_ -= val.imag();
@@ -123,21 +124,67 @@ class complex<sycl::half> {
     }
 
     template <typename T>
-    inline complex& operator*=(const complex<T>& val);
+    complex& operator*=(const std::complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f *= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
 
     template <typename T>
-    inline complex& operator/=(const complex<T>& val);
+    complex& operator/=(const std::complex<T>& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f /= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+    complex& operator+=(const complex& val)
+    {
+        real_ += val.real();
+        imag_ += val.imag();
+        return *this;
+    }
+
+    complex& operator-=(const complex& val)
+    {
+        real_ -= val.real();
+        imag_ -= val.imag();
+        return *this;
+    }
 
-// It's for MacOS.
-// TODO: check whether mac compiler always use complex version even when real
-// half
-#define COMPLEX_HALF_OPERATOR(_op, _opeq)                                  \
-    friend complex<sycl::half> operator _op(const complex<sycl::half> lhf, \
-                                            const complex<sycl::half> rhf) \
-    {                                                                      \
-        auto a = lhf;                                                      \
-        a _opeq rhf;                                                       \
-        return a;                                                          \
+    complex& operator*=(const complex& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f *= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+    complex& operator/=(const complex& val)
+    {
+        auto val_f = static_cast<std::complex<float>>(val);
+        auto result_f = static_cast<std::complex<float>>(*this);
+        result_f /= val_f;
+        real_ = result_f.real();
+        imag_ = result_f.imag();
+        return *this;
+    }
+
+#define COMPLEX_HALF_OPERATOR(_op, _opeq)                               \
+    friend complex operator _op(const complex& lhf, const complex& rhf) \
+    {                                                                   \
+        auto a = lhf;                                                   \
+        a _opeq rhf;                                                    \
+        return a;                                                       \
     }
 
     COMPLEX_HALF_OPERATOR(+, +=)
@@ -147,77 +194,15 @@ class complex<sycl::half> {
 
 #undef COMPLEX_HALF_OPERATOR
 
+    complex operator-() const { return complex(-real_, -imag_); }
+
 private:
     value_type real_;
     value_type imag_;
 };
 
-}  // namespace std
-
-
-// after providing std::complex<sycl::half>, we can load their <complex> to
-// complete the header chain.
-
-#if GINKGO_DPCPP_MAJOR_VERSION > 7 || \
-    (GINKGO_DPCPP_MAJOR_VERSION == 7 && GINKGO_DPCPP_MINOR_VERSION >= 1)
-
-#if defined(__has_include_next)
-// GCC/clang support go through this path.
-#include_next <complex>
-#else
-// MSVC doesn't support "#include_next", so we take the same workaround in
-// stl_wrappers/complex.
-#include <../stl_wrappers/complex>
-#endif
-
-#else
-
-
-#include <complex>
-
-
-#endif
-
-
-// we know the complex<float> now, so we implement those functions requiring
-// complex<float>
-namespace std {
-
-
-inline complex<sycl::half>::operator complex<float>() const noexcept
-{
-    return std::complex<float>(static_cast<float>(real_),
-                               static_cast<float>(imag_));
-}
-
-
-template <typename T>
-inline complex<sycl::half>& complex<sycl::half>::operator*=(
-    const complex<T>& val)
-{
-    auto val_f = static_cast<std::complex<float>>(val);
-    auto result_f = static_cast<std::complex<float>>(*this);
-    result_f *= val_f;
-    real_ = result_f.real();
-    imag_ = result_f.imag();
-    return *this;
-}
-
-
-template <typename T>
-inline complex<sycl::half>& complex<sycl::half>::operator/=(
-    const complex<T>& val)
-{
-    auto val_f = static_cast<std::complex<float>>(val);
-    auto result_f = static_cast<std::complex<float>>(*this);
-    result_f /= val_f;
-    real_ = result_f.real();
-    imag_ = result_f.imag();
-    return *this;
-}
-
 
-}  // namespace std
+}  // namespace gko
 
 
 #endif  // GKO_DPCPP_BASE_COMPLEX_HPP_
diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp
index 2d8e955487d..dd11c31fa10 100644
--- a/dpcpp/base/math.hpp
+++ b/dpcpp/base/math.hpp
@@ -32,6 +32,41 @@ struct basic_float_traits<sycl::half> {
 template <>
 struct is_complex_or_scalar_impl<sycl::half> : public std::true_type {};
 
+template <typename ValueType>
+struct complex_helper {
+    using type = std::complex<ValueType>;
+};
+
+template <>
+struct complex_helper<sycl::half> {
+    using type = gko::complex<sycl::half>;
+};
+
+
+template <typename T>
+struct type_size_impl<gko::complex<T>> {
+    static constexpr auto value = sizeof(T) * byte_size;
+};
+
+
+template <typename T>
+struct remove_complex_impl<gko::complex<T>> {
+    using type = T;
+};
+
+
+template <typename T>
+struct truncate_type_impl<gko::complex<T>> {
+    using type =
+        typename complex_helper<typename truncate_type_impl<T>::type>::type;
+};
+
+template <typename T>
+struct is_complex_impl<gko::complex<T>> : public std::true_type {};
+
+template <typename T>
+struct is_complex_or_scalar_impl<gko::complex<T>>
+    : public is_complex_or_scalar_impl<T> {};
 
 }  // namespace detail
 
@@ -41,7 +76,7 @@ bool __dpct_inline__ is_nan(const sycl::half& val)
     return std::isnan(static_cast<float>(val));
 }
 
-bool __dpct_inline__ is_nan(const std::complex<sycl::half>& val)
+bool __dpct_inline__ is_nan(const gko::complex<sycl::half>& val)
 {
     return is_nan(val.real()) || is_nan(val.imag());
 }
@@ -52,7 +87,7 @@ sycl::half __dpct_inline__ abs(const sycl::half& val)
     return abs(static_cast<float>(val));
 }
 
-sycl::half __dpct_inline__ abs(const std::complex<sycl::half>& val)
+sycl::half __dpct_inline__ abs(const gko::complex<sycl::half>& val)
 {
     return abs(static_cast<std::complex<float>>(val));
 }
@@ -62,8 +97,8 @@ sycl::half __dpct_inline__ sqrt(const sycl::half& val)
     return sqrt(static_cast<float>(val));
 }
 
-std::complex<sycl::half> __dpct_inline__
-sqrt(const std::complex<sycl::half>& val)
+gko::complex<sycl::half> __dpct_inline__
+sqrt(const gko::complex<sycl::half>& val)
 {
     return sqrt(static_cast<std::complex<float>>(val));
 }
@@ -74,7 +109,7 @@ bool __dpct_inline__ is_finite(const sycl::half& value)
     return abs(value) < std::numeric_limits<sycl::half>::infinity();
 }
 
-bool __dpct_inline__ is_finite(const std::complex<sycl::half>& value)
+bool __dpct_inline__ is_finite(const gko::complex<sycl::half>& value)
 {
     return is_finite(value.real()) && is_finite(value.imag());
 }
diff --git a/dpcpp/base/types.hpp b/dpcpp/base/types.hpp
index df30d830c28..2f2934839fc 100644
--- a/dpcpp/base/types.hpp
+++ b/dpcpp/base/types.hpp
@@ -14,6 +14,8 @@
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "dpcpp/base/complex.hpp"
+
 
 namespace gko {
 namespace kernels {
@@ -56,6 +58,11 @@ struct sycl_type_impl<std::complex<T>> {
     using type = std::complex<typename sycl_type_impl<T>::type>;
 };
 
+template <>
+struct sycl_type_impl<std::complex<gko::half>> {
+    using type = gko::complex<typename sycl_type_impl<gko::half>::type>;
+};
+
 template <typename ValueType, typename IndexType>
 struct sycl_type_impl<matrix_data_entry<ValueType, IndexType>> {
     using type =
diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp
index ec3821fce58..1aa6b3cb591 100644
--- a/dpcpp/preconditioner/batch_block_jacobi.hpp
+++ b/dpcpp/preconditioner/batch_block_jacobi.hpp
@@ -131,7 +131,7 @@ class BlockJacobi final {
 
             // reduction (it does not support complex<half>)
             if constexpr (std::is_same_v<value_type,
-                                         std::complex<sycl::half>>) {
+                                         gko::complex<sycl::half>>) {
                 for (int i = sg_size / 2; i > 0; i /= 2) {
                     sum += sycl::shift_group_left(sg, sum, i);
                 }

From f99e203a1615487b2f36624187804b1703405642 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 3 Dec 2024 10:37:48 +0100
Subject: [PATCH 423/448] update documentation

---
 CMakeLists.txt | 4 ++--
 INSTALL.md     | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a347d010dd..8362cfaa277 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,9 +33,9 @@ option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be tim
 option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
 option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
-# We do not support MSVC.
+# We do not support half precision in MSVC.
 if(MSVC)
-    message(STATUS "HALF is not supported in MSVC")
+    message(STATUS "We do not support half precision in MSVC.")
     set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
 endif()
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
diff --git a/INSTALL.md b/INSTALL.md
index 9719bdfb920..87ed9c4f61a 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -25,6 +25,8 @@ Ginkgo adds the following additional switches to control what is being built:
     instead of converting data on the fly, default is `OFF`.
     Enabling this flag increases the library size, but improves performance of
     mixed-precision kernels.
+*   `-DGINKGO_ENABLE_HALF={ON, OFF}` enables half precision support in Ginkgo, default is `ON`.
+    It is always forced to `OFF` when the compiler is MSVC.
 *   `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
     (will download googletest), default is `ON`.
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests

From 169db0fe1e0cc53251f95d5d9509d2f4fda35c28 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Dec 2024 11:44:54 +0100
Subject: [PATCH 424/448] rename precision chain

---
 benchmark/utils/formats.hpp                   |  8 +-
 core/base/batch_multi_vector.cpp              | 10 +--
 core/distributed/matrix.cpp                   |  8 +-
 core/distributed/vector.cpp                   |  4 +-
 core/matrix/batch_csr.cpp                     | 10 +--
 core/matrix/batch_dense.cpp                   | 11 +--
 core/matrix/batch_ell.cpp                     | 10 +--
 core/matrix/coo.cpp                           | 10 +--
 core/matrix/csr.cpp                           | 10 +--
 core/matrix/dense.cpp                         | 15 ++--
 core/matrix/diagonal.cpp                      | 11 +--
 core/matrix/ell.cpp                           | 10 +--
 core/matrix/fbcsr.cpp                         | 11 ++-
 core/matrix/hybrid.cpp                        | 10 +--
 core/matrix/sellp.cpp                         | 10 +--
 core/preconditioner/jacobi.cpp                | 11 +--
 .../ginkgo/core/base/batch_multi_vector.hpp   | 42 +++++-----
 include/ginkgo/core/base/math.hpp             | 36 +++++----
 .../ginkgo/core/base/precision_dispatch.hpp   | 25 +++---
 include/ginkgo/core/distributed/matrix.hpp    | 21 +++--
 include/ginkgo/core/distributed/vector.hpp    | 15 ++--
 include/ginkgo/core/matrix/batch_csr.hpp      | 36 ++++-----
 include/ginkgo/core/matrix/batch_dense.hpp    | 31 ++++----
 include/ginkgo/core/matrix/batch_ell.hpp      | 36 ++++-----
 include/ginkgo/core/matrix/coo.hpp            | 60 ++++++---------
 include/ginkgo/core/matrix/csr.hpp            | 76 ++++++++-----------
 include/ginkgo/core/matrix/dense.hpp          | 35 ++++-----
 include/ginkgo/core/matrix/diagonal.hpp       | 37 ++++-----
 include/ginkgo/core/matrix/ell.hpp            | 60 ++++++---------
 include/ginkgo/core/matrix/fbcsr.hpp          | 66 +++++++---------
 include/ginkgo/core/matrix/hybrid.hpp         | 43 +++++------
 include/ginkgo/core/matrix/sellp.hpp          | 62 +++++++--------
 .../test/base/batch_multi_vector_kernels.cpp  |  8 +-
 reference/test/base/combination.cpp           | 10 +--
 reference/test/base/composition.cpp           | 10 +--
 reference/test/base/perturbation.cpp          | 10 +--
 reference/test/matrix/coo_kernels.cpp         | 19 +++--
 reference/test/matrix/csr_kernels.cpp         | 41 +++++-----
 reference/test/matrix/dense_kernels.cpp       | 15 ++--
 reference/test/matrix/diagonal_kernels.cpp    | 11 ++-
 reference/test/matrix/ell_kernels.cpp         | 41 +++++-----
 reference/test/matrix/fbcsr_kernels.cpp       |  8 +-
 reference/test/matrix/hybrid_kernels.cpp      | 15 ++--
 reference/test/matrix/identity.cpp            |  3 +-
 reference/test/matrix/sellp_kernels.cpp       | 14 ++--
 .../test/matrix/sparsity_csr_kernels.cpp      | 10 +--
 reference/test/preconditioner/ic.cpp          | 10 +--
 reference/test/preconditioner/ilu.cpp         | 12 ++-
 reference/test/preconditioner/jacobi.cpp      |  2 +-
 .../test/preconditioner/jacobi_kernels.cpp    | 13 ++--
 reference/test/reorder/scaled_reordered.cpp   |  4 +-
 reference/test/solver/bicg_kernels.cpp        | 12 ++-
 reference/test/solver/bicgstab_kernels.cpp    | 12 ++-
 reference/test/solver/cb_gmres_kernels.cpp    | 12 +--
 reference/test/solver/cg_kernels.cpp          | 12 ++-
 reference/test/solver/cgs_kernels.cpp         | 12 ++-
 reference/test/solver/fcg_kernels.cpp         | 12 ++-
 reference/test/solver/gcr_kernels.cpp         | 12 ++-
 reference/test/solver/gmres_kernels.cpp       | 12 ++-
 reference/test/solver/idr_kernels.cpp         |  9 +--
 reference/test/solver/ir_kernels.cpp          |  9 +--
 reference/test/solver/lower_trs_kernels.cpp   | 10 +--
 reference/test/solver/multigrid_kernels.cpp   |  6 +-
 reference/test/solver/upper_trs_kernels.cpp   | 10 +--
 test/matrix/csr_kernels2.cpp                  |  2 +-
 test/matrix/matrix.cpp                        |  2 +-
 test/mpi/matrix.cpp                           |  4 +-
 test/mpi/solver/solver.cpp                    |  4 +-
 test/mpi/vector.cpp                           |  4 +-
 test/solver/solver.cpp                        |  2 +-
 70 files changed, 544 insertions(+), 710 deletions(-)

diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index 13f2cee1056..58b66e06f14 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -129,7 +129,7 @@ using hybrid = gko::matrix::Hybrid<etype, itype>;
 using csr = gko::matrix::Csr<etype, itype>;
 using coo = gko::matrix::Coo<etype, itype>;
 using ell = gko::matrix::Ell<etype, itype>;
-using ell_mixed = gko::matrix::Ell<gko::next_precision<etype>, itype>;
+using ell_mixed = gko::matrix::Ell<gko::next_precision_base<etype>, itype>;
 
 
 /**
@@ -274,7 +274,7 @@ std::unique_ptr<gko::LinOp> matrix_factory(
         check_ell_admissibility(data);
     }
     if (format == "ell_mixed") {
-        gko::matrix_data<gko::next_precision<etype>, itype> conv_data;
+        gko::matrix_data<gko::next_precision_base<etype>, itype> conv_data;
         conv_data.size = data.size;
         conv_data.nonzeros.resize(data.nonzeros.size());
         auto it = conv_data.nonzeros.begin();
@@ -284,8 +284,8 @@ std::unique_ptr<gko::LinOp> matrix_factory(
             it->value = el.value;
             ++it;
         }
-        gko::as<gko::ReadableFromMatrixData<gko::next_precision<etype>, itype>>(
-            mat.get())
+        gko::as<gko::ReadableFromMatrixData<gko::next_precision_base<etype>,
+                                            itype>>(mat.get())
             ->read(conv_data);
     } else {
         gko::as<gko::ReadableFromMatrixData<etype, itype>>(mat.get())->read(
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index 1eb3cd8f60d..4cd6b81d5bb 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -281,7 +281,7 @@ void MultiVector<ValueType>::compute_norm2(
 
 template <typename ValueType>
 void MultiVector<ValueType>::convert_to(
-    MultiVector<next_precision_with_half<ValueType>>* result) const
+    MultiVector<next_precision<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -290,7 +290,7 @@ void MultiVector<ValueType>::convert_to(
 
 template <typename ValueType>
 void MultiVector<ValueType>::move_to(
-    MultiVector<next_precision_with_half<ValueType>>* result)
+    MultiVector<next_precision<ValueType>>* result)
 {
     this->convert_to(result);
 }
@@ -299,8 +299,7 @@ void MultiVector<ValueType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType>
 void MultiVector<ValueType>::convert_to(
-    MultiVector<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result) const
+    MultiVector<next_precision<next_precision<ValueType>>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -309,8 +308,7 @@ void MultiVector<ValueType>::convert_to(
 
 template <typename ValueType>
 void MultiVector<ValueType>::move_to(
-    MultiVector<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result)
+    MultiVector<next_precision<next_precision<ValueType>>>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 442771c66b5..191c3cc0add 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -203,8 +203,8 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::create(
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::convert_to(
-    Matrix<next_precision<value_type>, local_index_type, global_index_type>*
-        result) const
+    Matrix<next_precision_base<value_type>, local_index_type,
+           global_index_type>* result) const
 {
     GKO_ASSERT(this->get_communicator().size() ==
                result->get_communicator().size());
@@ -222,8 +222,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::convert_to(
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::move_to(
-    Matrix<next_precision<value_type>, local_index_type, global_index_type>*
-        result)
+    Matrix<next_precision_base<value_type>, local_index_type,
+           global_index_type>* result)
 {
     GKO_ASSERT(this->get_communicator().size() ==
                result->get_communicator().size());
diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp
index ae7ab182a85..0f2aeda7b5e 100644
--- a/core/distributed/vector.cpp
+++ b/core/distributed/vector.cpp
@@ -283,7 +283,7 @@ void Vector<ValueType>::fill(const ValueType value)
 
 template <typename ValueType>
 void Vector<ValueType>::convert_to(
-    Vector<next_precision<ValueType>>* result) const
+    Vector<next_precision_base<ValueType>>* result) const
 {
     GKO_ASSERT(this->get_communicator().size() ==
                result->get_communicator().size());
@@ -293,7 +293,7 @@ void Vector<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Vector<ValueType>::move_to(Vector<next_precision<ValueType>>* result)
+void Vector<ValueType>::move_to(Vector<next_precision_base<ValueType>>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp
index 141c5b86d02..3abd1856ce2 100644
--- a/core/matrix/batch_csr.cpp
+++ b/core/matrix/batch_csr.cpp
@@ -246,7 +246,7 @@ void Csr<ValueType, IndexType>::add_scaled_identity(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision_with_half<ValueType>, IndexType>* result) const
+    Csr<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -257,7 +257,7 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision_with_half<ValueType>, IndexType>* result)
+    Csr<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -266,8 +266,7 @@ void Csr<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result) const
+    Csr<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -278,8 +277,7 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result)
+    Csr<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index 0c1838abb56..c256dad59cc 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -245,7 +245,7 @@ void Dense<ValueType>::add_scaled_identity(
 
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision_with_half<ValueType>>* result) const
+    Dense<next_precision<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -253,8 +253,7 @@ void Dense<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Dense<ValueType>::move_to(
-    Dense<next_precision_with_half<ValueType>>* result)
+void Dense<ValueType>::move_to(Dense<next_precision<ValueType>>* result)
 {
     this->convert_to(result);
 }
@@ -263,8 +262,7 @@ void Dense<ValueType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result) const
+    Dense<next_precision<next_precision<ValueType>>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -273,8 +271,7 @@ void Dense<ValueType>::convert_to(
 
 template <typename ValueType>
 void Dense<ValueType>::move_to(
-    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result)
+    Dense<next_precision<next_precision<ValueType>>>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 3b829d3ba4c..7d18cc1e0ea 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -266,7 +266,7 @@ void Ell<ValueType, IndexType>::add_scaled_identity(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision_with_half<ValueType>, IndexType>* result) const
+    Ell<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -277,7 +277,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision_with_half<ValueType>, IndexType>* result)
+    Ell<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -286,8 +286,7 @@ void Ell<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result) const
+    Ell<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -298,8 +297,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result)
+    Ell<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 7b3b3876295..38894bdb447 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -214,7 +214,7 @@ void Coo<ValueType, IndexType>::apply2_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(
-    Coo<next_precision_with_half<ValueType>, IndexType>* result) const
+    Coo<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->row_idxs_ = this->row_idxs_;
@@ -225,7 +225,7 @@ void Coo<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::move_to(
-    Coo<next_precision_with_half<ValueType>, IndexType>* result)
+    Coo<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -234,8 +234,7 @@ void Coo<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(
-    Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result) const
+    Coo<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->row_idxs_ = this->row_idxs_;
@@ -246,8 +245,7 @@ void Coo<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::move_to(
-    Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result)
+    Coo<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 1bb3e778478..4097fbed1a0 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -304,7 +304,7 @@ void Csr<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision_with_half<ValueType>, IndexType>* result) const
+    Csr<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -316,7 +316,7 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision_with_half<ValueType>, IndexType>* result)
+    Csr<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -324,8 +324,7 @@ void Csr<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(
-    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result) const
+    Csr<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -337,8 +336,7 @@ void Csr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::move_to(
-    Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result)
+    Csr<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 071e689232e..1fd20a1db3d 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -582,7 +582,7 @@ Dense<ValueType>::Dense(Dense<ValueType>&& other) : Dense(other.get_executor())
 
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision_with_half<ValueType>>* result) const
+    Dense<next_precision<ValueType>>* result) const
 {
     if (result->get_size() != this->get_size()) {
         result->set_size(this->get_size());
@@ -597,8 +597,7 @@ void Dense<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Dense<ValueType>::move_to(
-    Dense<next_precision_with_half<ValueType>>* result)
+void Dense<ValueType>::move_to(Dense<next_precision<ValueType>>* result)
 {
     this->convert_to(result);
 }
@@ -607,8 +606,7 @@ void Dense<ValueType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType>
 void Dense<ValueType>::convert_to(
-    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result) const
+    Dense<next_precision<next_precision<ValueType>>>* result) const
 {
     if (result->get_size() != this->get_size()) {
         result->set_size(this->get_size());
@@ -624,8 +622,7 @@ void Dense<ValueType>::convert_to(
 
 template <typename ValueType>
 void Dense<ValueType>::move_to(
-    Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result)
+    Dense<next_precision<next_precision<ValueType>>>* result)
 {
     this->convert_to(result);
 }
@@ -1548,8 +1545,8 @@ template <typename ValueType, typename Function>
 void gather_mixed_real_complex(Function fn, LinOp* out)
 {
 #ifdef GINKGO_MIXED_PRECISION
-    run<matrix::Dense, ValueType, next_precision_with_half<ValueType>,
-        next_precision_with_half<next_precision_with_half<ValueType>>>(out, fn);
+    run<matrix::Dense, ValueType, next_precision<ValueType>,
+        next_precision<next_precision<ValueType>>>(out, fn);
 #else
     precision_dispatch<ValueType>(fn, out);
 #endif
diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp
index 85c5739b529..6a0c4edc1b5 100644
--- a/core/matrix/diagonal.cpp
+++ b/core/matrix/diagonal.cpp
@@ -149,7 +149,7 @@ std::unique_ptr<LinOp> Diagonal<ValueType>::conj_transpose() const
 
 template <typename ValueType>
 void Diagonal<ValueType>::convert_to(
-    Diagonal<next_precision_with_half<ValueType>>* result) const
+    Diagonal<next_precision<ValueType>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -157,8 +157,7 @@ void Diagonal<ValueType>::convert_to(
 
 
 template <typename ValueType>
-void Diagonal<ValueType>::move_to(
-    Diagonal<next_precision_with_half<ValueType>>* result)
+void Diagonal<ValueType>::move_to(Diagonal<next_precision<ValueType>>* result)
 {
     this->convert_to(result);
 }
@@ -167,8 +166,7 @@ void Diagonal<ValueType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType>
 void Diagonal<ValueType>::convert_to(
-    Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result) const
+    Diagonal<next_precision<next_precision<ValueType>>>* result) const
 {
     result->values_ = this->values_;
     result->set_size(this->get_size());
@@ -177,8 +175,7 @@ void Diagonal<ValueType>::convert_to(
 
 template <typename ValueType>
 void Diagonal<ValueType>::move_to(
-    Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
-        result)
+    Diagonal<next_precision<next_precision<ValueType>>>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index eafd9fa9cad..98fbfc94c7d 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -154,7 +154,7 @@ void Ell<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision_with_half<ValueType>, IndexType>* result) const
+    Ell<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -166,7 +166,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision_with_half<ValueType>, IndexType>* result)
+    Ell<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -175,8 +175,7 @@ void Ell<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result) const
+    Ell<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -188,8 +187,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-        IndexType>* result)
+    Ell<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index f1612be10e0..b9e8c6b00b6 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -145,7 +145,7 @@ void Fbcsr<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    Fbcsr<next_precision_with_half<ValueType>, IndexType>* const result) const
+    Fbcsr<next_precision<ValueType>, IndexType>* const result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -158,7 +158,7 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::move_to(
-    Fbcsr<next_precision_with_half<ValueType>, IndexType>* const result)
+    Fbcsr<next_precision<ValueType>, IndexType>* const result)
 {
     this->convert_to(result);
 }
@@ -167,8 +167,8 @@ void Fbcsr<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::convert_to(
-    Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-          IndexType>* const result) const
+    Fbcsr<next_precision<next_precision<ValueType>>, IndexType>* const result)
+    const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -181,8 +181,7 @@ void Fbcsr<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Fbcsr<ValueType, IndexType>::move_to(
-    Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-          IndexType>* const result)
+    Fbcsr<next_precision<next_precision<ValueType>>, IndexType>* const result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index 72137558a10..95a95b3a619 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -203,7 +203,7 @@ void Hybrid<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::convert_to(
-    Hybrid<next_precision_with_half<ValueType>, IndexType>* result) const
+    Hybrid<next_precision<ValueType>, IndexType>* result) const
 {
     this->ell_->convert_to(result->ell_);
     this->coo_->convert_to(result->coo_);
@@ -216,7 +216,7 @@ void Hybrid<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::move_to(
-    Hybrid<next_precision_with_half<ValueType>, IndexType>* result)
+    Hybrid<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -225,8 +225,7 @@ void Hybrid<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::convert_to(
-    Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-           IndexType>* result) const
+    Hybrid<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     this->ell_->convert_to(result->ell_.get());
     this->coo_->convert_to(result->coo_.get());
@@ -239,8 +238,7 @@ void Hybrid<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::move_to(
-    Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-           IndexType>* result)
+    Hybrid<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index bd81b08bada..3422d249b40 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -176,7 +176,7 @@ void Sellp<ValueType, IndexType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::convert_to(
-    Sellp<next_precision_with_half<ValueType>, IndexType>* result) const
+    Sellp<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -190,7 +190,7 @@ void Sellp<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::move_to(
-    Sellp<next_precision_with_half<ValueType>, IndexType>* result)
+    Sellp<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
@@ -199,8 +199,7 @@ void Sellp<ValueType, IndexType>::move_to(
 #if GINKGO_ENABLE_HALF
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::convert_to(
-    Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-          IndexType>* result) const
+    Sellp<next_precision<next_precision<ValueType>>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -214,8 +213,7 @@ void Sellp<ValueType, IndexType>::convert_to(
 
 template <typename ValueType, typename IndexType>
 void Sellp<ValueType, IndexType>::move_to(
-    Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-          IndexType>* result)
+    Sellp<next_precision<next_precision<ValueType>>, IndexType>* result)
 {
     this->convert_to(result);
 }
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index 3f773710ceb..556fb9bc0df 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -328,11 +328,12 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
     if (parameters_.max_block_size == 1) {
         auto diag = share(as<DiagonalLinOpExtractable>(system_matrix)
                               ->extract_diagonal_linop());
-        auto diag_vt = ::gko::detail::
-            temporary_conversion<matrix::Diagonal<ValueType>>::template create<
-                matrix::Diagonal<previous_precision_with_half<ValueType>>,
-                matrix::Diagonal<previous_precision_with_half<
-                    previous_precision_with_half<ValueType>>>>(diag.get());
+        auto diag_vt =
+            ::gko::detail::temporary_conversion<matrix::Diagonal<ValueType>>::
+                template create<matrix::Diagonal<previous_precision<ValueType>>,
+                                matrix::Diagonal<previous_precision<
+                                    previous_precision<ValueType>>>>(
+                    diag.get());
         if (!diag_vt) {
             GKO_NOT_SUPPORTED(system_matrix);
         }
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index bd641f057a1..be47c8a3ee8 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -53,21 +53,19 @@ class MultiVector
     : public EnablePolymorphicObject<MultiVector<ValueType>>,
       public EnablePolymorphicAssignment<MultiVector<ValueType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<MultiVector<
-          next_precision_with_half<next_precision_with_half<ValueType>>>>,
+      public ConvertibleTo<
+          MultiVector<next_precision<next_precision<ValueType>>>>,
 #endif
-      public ConvertibleTo<MultiVector<next_precision_with_half<ValueType>>> {
+      public ConvertibleTo<MultiVector<next_precision<ValueType>>> {
     friend class EnablePolymorphicObject<MultiVector>;
     friend class MultiVector<to_complex<ValueType>>;
-    friend class MultiVector<previous_precision_with_half<ValueType>>;
+    friend class MultiVector<previous_precision<ValueType>>;
 
 public:
     using EnablePolymorphicAssignment<MultiVector>::convert_to;
     using EnablePolymorphicAssignment<MultiVector>::move_to;
-    using ConvertibleTo<
-        MultiVector<next_precision_with_half<ValueType>>>::convert_to;
-    using ConvertibleTo<
-        MultiVector<next_precision_with_half<ValueType>>>::move_to;
+    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::convert_to;
+    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::move_to;
 
     using value_type = ValueType;
     using index_type = int32;
@@ -84,27 +82,23 @@ class MultiVector
     static std::unique_ptr<MultiVector> create_with_config_of(
         ptr_param<const MultiVector> other);
 
-    void convert_to(MultiVector<next_precision_with_half<ValueType>>* result)
-        const override;
+    void convert_to(
+        MultiVector<next_precision<ValueType>>* result) const override;
 
-    void move_to(
-        MultiVector<next_precision_with_half<ValueType>>* result) override;
+    void move_to(MultiVector<next_precision<ValueType>>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class MultiVector<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
-    using ConvertibleTo<MultiVector<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::convert_to;
-    using ConvertibleTo<MultiVector<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::move_to;
+    friend class MultiVector<previous_precision<previous_precision<ValueType>>>;
+    using ConvertibleTo<
+        MultiVector<next_precision<next_precision<ValueType>>>>::convert_to;
+    using ConvertibleTo<
+        MultiVector<next_precision<next_precision<ValueType>>>>::move_to;
 
-    void convert_to(
-        MultiVector<
-            next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) const override;
+    void convert_to(MultiVector<next_precision<next_precision<ValueType>>>*
+                        result) const override;
 
-    void move_to(MultiVector<next_precision_with_half<
-                     next_precision_with_half<ValueType>>>* result) override;
+    void move_to(MultiVector<next_precision<next_precision<ValueType>>>* result)
+        override;
 #endif
 
     /**
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index e308b092ea6..8f063d8c2fe 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -293,46 +293,46 @@ namespace detail {
 
 // singly linked list of all our supported precisions
 template <typename T>
-struct next_precision_impl {};
+struct next_precision_base_impl {};
 
 template <>
-struct next_precision_impl<float> {
+struct next_precision_base_impl<float> {
     using type = double;
 };
 
 template <>
-struct next_precision_impl<double> {
+struct next_precision_base_impl<double> {
     using type = float;
 };
 
 template <typename T>
-struct next_precision_impl<std::complex<T>> {
-    using type = std::complex<typename next_precision_impl<T>::type>;
+struct next_precision_base_impl<std::complex<T>> {
+    using type = std::complex<typename next_precision_base_impl<T>::type>;
 };
 
 
 template <typename T>
-struct next_precision_with_half_impl {};
+struct next_precision_impl {};
 
 
 template <>
-struct next_precision_with_half_impl<gko::half> {
+struct next_precision_impl<gko::half> {
     using type = float;
 };
 
 template <>
-struct next_precision_with_half_impl<float> {
+struct next_precision_impl<float> {
     using type = double;
 };
 
 template <>
-struct next_precision_with_half_impl<double> {
+struct next_precision_impl<double> {
     using type = gko::half;
 };
 
 template <typename T>
-struct next_precision_with_half_impl<std::complex<T>> {
-    using type = std::complex<typename next_precision_with_half_impl<T>::type>;
+struct next_precision_impl<std::complex<T>> {
+    using type = std::complex<typename next_precision_impl<T>::type>;
 };
 
 
@@ -418,7 +418,7 @@ struct highest_precision_variadic<Head> {
  * Obtains the next type in the singly-linked precision list.
  */
 template <typename T>
-using next_precision = typename detail::next_precision_impl<T>::type;
+using next_precision_base = typename detail::next_precision_base_impl<T>::type;
 
 
 /**
@@ -428,26 +428,24 @@ using next_precision = typename detail::next_precision_impl<T>::type;
  *       next_precision.
  */
 template <typename T>
-using previous_precision = next_precision<T>;
+using previous_precision_base = next_precision_base<T>;
 
 /**
  * Obtains the next type in the singly-linked precision list with half.
  */
 #if GINKGO_ENABLE_HALF
 template <typename T>
-using next_precision_with_half =
-    typename detail::next_precision_with_half_impl<T>::type;
+using next_precision = typename detail::next_precision_impl<T>::type;
 
 template <typename T>
-using previous_precision_with_half =
-    next_precision_with_half<next_precision_with_half<T>>;
+using previous_precision = next_precision<next_precision<T>>;
 #else
 // fallback to float/double list
 template <typename T>
-using next_precision_with_half = next_precision<T>;
+using next_precision = next_precision_base<T>;
 
 template <typename T>
-using previous_precision_with_half = previous_precision<T>;
+using previous_precision = previous_precision_base<T>;
 #endif
 
 
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 4adc02763f0..29aa4bfcab1 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -19,9 +19,9 @@ namespace gko {
 /**
  * Convert the given LinOp from matrix::Dense<...> to matrix::Dense<ValueType>.
  * The conversion tries to convert the input LinOp to all Dense types with value
- * type recursively reachable by next_precision<...> starting from the ValueType
- * template parameter. This means that all real-to-real and complex-to-complex
- * conversions for default precisions are being considered.
+ * type recursively reachable by next_precision<...> starting from the ValueType
+ * template parameter. This means that all real-to-real and complex-to-complex
+ * conversions for default precisions are being considered.
  * If the input matrix is non-const, the contents of the modified converted
  * object will be converted back to the input matrix when the returned object is
  * destroyed. This may lead to a loss of precision!
@@ -48,9 +48,9 @@ make_temporary_conversion(Ptr&& matrix)
 {
     using Pointee = detail::pointee<Ptr>;
     using Dense = matrix::Dense<ValueType>;
-    using NextDense = matrix::Dense<next_precision_with_half<ValueType>>;
-    using NextNextDense = matrix::Dense<
-        next_precision_with_half<next_precision_with_half<ValueType>>>;
+    using NextDense = matrix::Dense<next_precision<ValueType>>;
+    using NextNextDense =
+        matrix::Dense<next_precision<next_precision<ValueType>>>;
     using MaybeConstDense =
         std::conditional_t<std::is_const<Pointee>::value, const Dense, Dense>;
     auto result = detail::temporary_conversion<
@@ -201,7 +201,7 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha,
  * If GINKGO_MIXED_PRECISION is defined, this means that the function will be
  * called with its dynamic type as a static type, so the (templated/generic)
  * function will be instantiated with all pairs of Dense<ValueType> and
- * Dense<next_precision<ValueType>> parameter types, and the appropriate
+ * Dense<next_precision<ValueType>> parameter types, and the appropriate
  * overload will be called based on the dynamic type of the parameter.
  *
  * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like
@@ -228,9 +228,8 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out)
 {
 #ifdef GINKGO_MIXED_PRECISION
     using fst_type = matrix::Dense<ValueType>;
-    using snd_type = matrix::Dense<next_precision_with_half<ValueType>>;
-    using trd_type = matrix::Dense<
-        next_precision_with_half<next_precision_with_half<ValueType>>>;
+    using snd_type = matrix::Dense<next_precision<ValueType>>;
+    using trd_type = matrix::Dense<next_precision<next_precision<ValueType>>>;
     auto dispatch_out_vector = [&](auto dense_in) {
         if (auto dense_out = dynamic_cast<fst_type*>(out)) {
             fn(dense_in, dense_out);
@@ -314,7 +313,7 @@ namespace distributed {
  * Convert the given LinOp from experimental::distributed::Vector<...> to
  * experimental::distributed::Vector<ValueType>. The conversion tries to convert
  * the input LinOp to all Dense types with value type recursively reachable by
- * next_precision<...> starting from the ValueType template parameter. This
+ * next_precision_base<...> starting from the ValueType template parameter. This
  * means that all real-to-real and complex-to-complex conversions for default
  * precisions are being considered. If the input matrix is non-const, the
  * contents of the modified converted object will be converted back to the input
@@ -341,7 +340,7 @@ gko::detail::temporary_conversion<Vector<ValueType>> make_temporary_conversion(
 {
     auto result =
         gko::detail::temporary_conversion<Vector<ValueType>>::template create<
-            Vector<next_precision<ValueType>>>(matrix);
+            Vector<next_precision_base<ValueType>>>(matrix);
     if (!result) {
         GKO_NOT_SUPPORTED(matrix);
     }
@@ -357,7 +356,7 @@ gko::detail::temporary_conversion<const Vector<ValueType>>
 make_temporary_conversion(const LinOp* matrix)
 {
     auto result = gko::detail::temporary_conversion<const Vector<ValueType>>::
-        template create<Vector<next_precision<ValueType>>>(matrix);
+        template create<Vector<next_precision_base<ValueType>>>(matrix);
     if (!result) {
         GKO_NOT_SUPPORTED(matrix);
     }
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 2f2f470a4ed..1bb7f2315fd 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -257,14 +257,13 @@ class Vector;
  */
 template <typename ValueType = default_precision,
           typename LocalIndexType = int32, typename GlobalIndexType = int64>
-class Matrix
-    : public EnableDistributedLinOp<
-          Matrix<ValueType, LocalIndexType, GlobalIndexType>>,
-      public ConvertibleTo<
-          Matrix<next_precision<ValueType>, LocalIndexType, GlobalIndexType>>,
-      public DistributedBase {
+class Matrix : public EnableDistributedLinOp<
+                   Matrix<ValueType, LocalIndexType, GlobalIndexType>>,
+               public ConvertibleTo<Matrix<next_precision_base<ValueType>,
+                                           LocalIndexType, GlobalIndexType>>,
+               public DistributedBase {
     friend class EnableDistributedPolymorphicObject<Matrix, LinOp>;
-    friend class Matrix<next_precision<ValueType>, LocalIndexType,
+    friend class Matrix<next_precision_base<ValueType>, LocalIndexType,
                         GlobalIndexType>;
     friend class multigrid::Pgm<ValueType, LocalIndexType>;
 
@@ -279,15 +278,15 @@ class Matrix
 
     using EnableDistributedLinOp<Matrix>::convert_to;
     using EnableDistributedLinOp<Matrix>::move_to;
-    using ConvertibleTo<Matrix<next_precision<ValueType>, LocalIndexType,
+    using ConvertibleTo<Matrix<next_precision_base<ValueType>, LocalIndexType,
                                GlobalIndexType>>::convert_to;
-    using ConvertibleTo<Matrix<next_precision<ValueType>, LocalIndexType,
+    using ConvertibleTo<Matrix<next_precision_base<ValueType>, LocalIndexType,
                                GlobalIndexType>>::move_to;
 
-    void convert_to(Matrix<next_precision<value_type>, local_index_type,
+    void convert_to(Matrix<next_precision_base<value_type>, local_index_type,
                            global_index_type>* result) const override;
 
-    void move_to(Matrix<next_precision<value_type>, local_index_type,
+    void move_to(Matrix<next_precision_base<value_type>, local_index_type,
                         global_index_type>* result) override;
 
     /**
diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp
index 17b8d0f944b..a3dd460bd43 100644
--- a/include/ginkgo/core/distributed/vector.hpp
+++ b/include/ginkgo/core/distributed/vector.hpp
@@ -66,20 +66,20 @@ class Partition;
 template <typename ValueType = double>
 class Vector
     : public EnableDistributedLinOp<Vector<ValueType>>,
-      public ConvertibleTo<Vector<next_precision<ValueType>>>,
+      public ConvertibleTo<Vector<next_precision_base<ValueType>>>,
       public EnableAbsoluteComputation<remove_complex<Vector<ValueType>>>,
       public DistributedBase {
     friend class EnableDistributedPolymorphicObject<Vector, LinOp>;
     friend class Vector<to_complex<ValueType>>;
     friend class Vector<remove_complex<ValueType>>;
-    friend class Vector<next_precision<ValueType>>;
+    friend class Vector<next_precision_base<ValueType>>;
     friend class detail::VectorCache<ValueType>;
 
 public:
     using EnableDistributedLinOp<Vector>::convert_to;
     using EnableDistributedLinOp<Vector>::move_to;
-    using ConvertibleTo<Vector<next_precision<ValueType>>>::convert_to;
-    using ConvertibleTo<Vector<next_precision<ValueType>>>::move_to;
+    using ConvertibleTo<Vector<next_precision_base<ValueType>>>::convert_to;
+    using ConvertibleTo<Vector<next_precision_base<ValueType>>>::move_to;
 
     using value_type = ValueType;
     using absolute_type = remove_complex<Vector>;
@@ -168,9 +168,10 @@ class Vector
     void read_distributed(const matrix_data<ValueType, int32>& data,
                           ptr_param<const Partition<int32, int32>> partition);
 
-    void convert_to(Vector<next_precision<ValueType>>* result) const override;
+    void convert_to(
+        Vector<next_precision_base<ValueType>>* result) const override;
 
-    void move_to(Vector<next_precision<ValueType>>* result) override;
+    void move_to(Vector<next_precision_base<ValueType>>* result) override;
 
     std::unique_ptr<absolute_type> compute_absolute() const override;
 
@@ -672,7 +673,7 @@ template <typename ValueType>
 struct conversion_target_helper<experimental::distributed::Vector<ValueType>> {
     using target_type = experimental::distributed::Vector<ValueType>;
     using source_type =
-        experimental::distributed::Vector<previous_precision<ValueType>>;
+        experimental::distributed::Vector<previous_precision_base<ValueType>>;
 
     static std::unique_ptr<target_type> create_empty(const source_type* source)
     {
diff --git a/include/ginkgo/core/matrix/batch_csr.hpp b/include/ginkgo/core/matrix/batch_csr.hpp
index 49eb5e4d7cd..766ad1facb1 100644
--- a/include/ginkgo/core/matrix/batch_csr.hpp
+++ b/include/ginkgo/core/matrix/batch_csr.hpp
@@ -48,14 +48,12 @@ class Csr final
     : public EnableBatchLinOp<Csr<ValueType, IndexType>>,
 #if GINKGO_ENABLE_HALF
       public ConvertibleTo<
-          Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>,
+          Csr<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<
-          Csr<next_precision_with_half<ValueType>, IndexType>> {
+      public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>> {
     friend class EnablePolymorphicObject<Csr, BatchLinOp>;
     friend class Csr<to_complex<ValueType>, IndexType>;
-    friend class Csr<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Csr<previous_precision<ValueType>, IndexType>;
     static_assert(std::is_same<IndexType, int32>::value,
                   "IndexType must be a 32 bit integer");
 
@@ -69,30 +67,24 @@ class Csr final
     using absolute_type = remove_complex<Csr>;
     using complex_type = to_complex<Csr>;
 
-    void convert_to(Csr<next_precision_with_half<ValueType>, IndexType>* result)
-        const override;
+    void convert_to(
+        Csr<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Csr<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Csr<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Csr<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Csr<previous_precision<previous_precision<ValueType>>,
+                     IndexType>;
     using ConvertibleTo<
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::convert_to;
+        Csr<next_precision<next_precision<ValueType>>, IndexType>>::convert_to;
     using ConvertibleTo<
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::move_to;
+        Csr<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) const override;
+    void convert_to(Csr<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) override;
+    void move_to(Csr<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     /**
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index c1340e482f4..ad4db6d0a84 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -48,13 +48,12 @@ template <typename ValueType = default_precision>
 class Dense final
     : public EnableBatchLinOp<Dense<ValueType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Dense<next_precision_with_half<next_precision_with_half<ValueType>>>>,
+      public ConvertibleTo<Dense<next_precision<next_precision<ValueType>>>>,
 #endif
-      public ConvertibleTo<Dense<next_precision_with_half<ValueType>>> {
+      public ConvertibleTo<Dense<next_precision<ValueType>>> {
     friend class EnablePolymorphicObject<Dense, BatchLinOp>;
     friend class Dense<to_complex<ValueType>>;
-    friend class Dense<previous_precision_with_half<ValueType>>;
+    friend class Dense<previous_precision<ValueType>>;
 
 public:
     using EnableBatchLinOp<Dense>::convert_to;
@@ -67,26 +66,22 @@ class Dense final
     using absolute_type = remove_complex<Dense>;
     using complex_type = to_complex<Dense>;
 
-    void convert_to(
-        Dense<next_precision_with_half<ValueType>>* result) const override;
+    void convert_to(Dense<next_precision<ValueType>>* result) const override;
 
-    void move_to(Dense<next_precision_with_half<ValueType>>* result) override;
+    void move_to(Dense<next_precision<ValueType>>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Dense<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
-    using ConvertibleTo<Dense<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::convert_to;
-    using ConvertibleTo<Dense<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::move_to;
+    friend class Dense<previous_precision<previous_precision<ValueType>>>;
+    using ConvertibleTo<
+        Dense<next_precision<next_precision<ValueType>>>>::convert_to;
+    using ConvertibleTo<
+        Dense<next_precision<next_precision<ValueType>>>>::move_to;
 
-    void convert_to(
-        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) const override;
+    void convert_to(Dense<next_precision<next_precision<ValueType>>>* result)
+        const override;
 
     void move_to(
-        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) override;
+        Dense<next_precision<next_precision<ValueType>>>* result) override;
 #endif
 
     /**
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 872b8ce2db9..c52da8f8f9d 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -53,14 +53,12 @@ class Ell final
     : public EnableBatchLinOp<Ell<ValueType, IndexType>>,
 #if GINKGO_ENABLE_HALF
       public ConvertibleTo<
-          Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>,
+          Ell<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<
-          Ell<next_precision_with_half<ValueType>, IndexType>> {
+      public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>> {
     friend class EnablePolymorphicObject<Ell, BatchLinOp>;
     friend class Ell<to_complex<ValueType>, IndexType>;
-    friend class Ell<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Ell<previous_precision<ValueType>, IndexType>;
     static_assert(std::is_same<IndexType, int32>::value,
                   "IndexType must be a 32 bit integer");
 
@@ -74,30 +72,24 @@ class Ell final
     using absolute_type = remove_complex<Ell>;
     using complex_type = to_complex<Ell>;
 
-    void convert_to(Ell<next_precision_with_half<ValueType>, IndexType>* result)
-        const override;
+    void convert_to(
+        Ell<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Ell<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Ell<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Ell<previous_precision<previous_precision<ValueType>>,
+                     IndexType>;
     using ConvertibleTo<
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::convert_to;
+        Ell<next_precision<next_precision<ValueType>>, IndexType>>::convert_to;
     using ConvertibleTo<
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::move_to;
+        Ell<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) const override;
+    void convert_to(Ell<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) override;
+    void move_to(Ell<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     /**
diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp
index a0edf5aa862..89e94568f0f 100644
--- a/include/ginkgo/core/matrix/coo.hpp
+++ b/include/ginkgo/core/matrix/coo.hpp
@@ -47,21 +47,19 @@ class Hybrid;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Coo
-    : public EnableLinOp<Coo<ValueType, IndexType>>,
-      public ConvertibleTo<Coo<next_precision_with_half<ValueType>, IndexType>>,
+class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
+            public ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>,
+            public ConvertibleTo<
+                Coo<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<Csr<ValueType, IndexType>>,
-      public ConvertibleTo<Dense<ValueType>>,
-      public DiagonalExtractable<ValueType>,
-      public ReadableFromMatrixData<ValueType, IndexType>,
-      public WritableToMatrixData<ValueType, IndexType>,
-      public EnableAbsoluteComputation<
-          remove_complex<Coo<ValueType, IndexType>>> {
+            public ConvertibleTo<Csr<ValueType, IndexType>>,
+            public ConvertibleTo<Dense<ValueType>>,
+            public DiagonalExtractable<ValueType>,
+            public ReadableFromMatrixData<ValueType, IndexType>,
+            public WritableToMatrixData<ValueType, IndexType>,
+            public EnableAbsoluteComputation<
+                remove_complex<Coo<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Coo, LinOp>;
     friend class Csr<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -72,10 +70,8 @@ class Coo
 public:
     using EnableLinOp<Coo>::convert_to;
     using EnableLinOp<Coo>::move_to;
-    using ConvertibleTo<
-        Coo<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Coo<next_precision_with_half<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Coo<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
@@ -88,32 +84,26 @@ class Coo
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Coo>;
 
-    friend class Coo<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Coo<previous_precision<ValueType>, IndexType>;
 
-    void convert_to(Coo<next_precision_with_half<ValueType>, IndexType>* result)
-        const override;
+    void convert_to(
+        Coo<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Coo<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Coo<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Coo<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Coo<previous_precision<previous_precision<ValueType>>,
+                     IndexType>;
     using ConvertibleTo<
-        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::convert_to;
+        Coo<next_precision<next_precision<ValueType>>, IndexType>>::convert_to;
     using ConvertibleTo<
-        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::move_to;
+        Coo<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) const override;
+    void convert_to(Coo<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Coo<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) override;
+    void move_to(Coo<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Csr<ValueType, IndexType>* other) const override;
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 2f66683085f..9c620f10ded 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -98,29 +98,27 @@ void strategy_rebuild_helper(Csr<ValueType, IndexType>* result);
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Csr
-    : public EnableLinOp<Csr<ValueType, IndexType>>,
-      public ConvertibleTo<Csr<next_precision_with_half<ValueType>, IndexType>>,
+class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
+            public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>,
+            public ConvertibleTo<
+                Csr<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<Dense<ValueType>>,
-      public ConvertibleTo<Coo<ValueType, IndexType>>,
-      public ConvertibleTo<Ell<ValueType, IndexType>>,
-      public ConvertibleTo<Fbcsr<ValueType, IndexType>>,
-      public ConvertibleTo<Hybrid<ValueType, IndexType>>,
-      public ConvertibleTo<Sellp<ValueType, IndexType>>,
-      public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
-      public DiagonalExtractable<ValueType>,
-      public ReadableFromMatrixData<ValueType, IndexType>,
-      public WritableToMatrixData<ValueType, IndexType>,
-      public Transposable,
-      public Permutable<IndexType>,
-      public EnableAbsoluteComputation<
-          remove_complex<Csr<ValueType, IndexType>>>,
-      public ScaledIdentityAddable {
+            public ConvertibleTo<Dense<ValueType>>,
+            public ConvertibleTo<Coo<ValueType, IndexType>>,
+            public ConvertibleTo<Ell<ValueType, IndexType>>,
+            public ConvertibleTo<Fbcsr<ValueType, IndexType>>,
+            public ConvertibleTo<Hybrid<ValueType, IndexType>>,
+            public ConvertibleTo<Sellp<ValueType, IndexType>>,
+            public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
+            public DiagonalExtractable<ValueType>,
+            public ReadableFromMatrixData<ValueType, IndexType>,
+            public WritableToMatrixData<ValueType, IndexType>,
+            public Transposable,
+            public Permutable<IndexType>,
+            public EnableAbsoluteComputation<
+                remove_complex<Csr<ValueType, IndexType>>>,
+            public ScaledIdentityAddable {
     friend class EnablePolymorphicObject<Csr, LinOp>;
     friend class Coo<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -136,10 +134,8 @@ class Csr
 public:
     using EnableLinOp<Csr>::convert_to;
     using EnableLinOp<Csr>::move_to;
-    using ConvertibleTo<
-        Csr<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Csr<next_precision_with_half<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Coo<ValueType, IndexType>>::convert_to;
@@ -696,32 +692,26 @@ class Csr
         index_type max_length_per_row_;
     };
 
-    friend class Csr<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Csr<previous_precision<ValueType>, IndexType>;
 
-    void convert_to(Csr<next_precision_with_half<ValueType>, IndexType>* result)
-        const override;
+    void convert_to(
+        Csr<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Csr<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Csr<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Csr<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Csr<previous_precision<previous_precision<ValueType>>,
+                     IndexType>;
     using ConvertibleTo<
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::convert_to;
+        Csr<next_precision<next_precision<ValueType>>, IndexType>>::convert_to;
     using ConvertibleTo<
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::move_to;
+        Csr<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) const override;
+    void convert_to(Csr<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Csr<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) override;
+    void move_to(Csr<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Dense<ValueType>* other) const override;
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index 9ae96ca46d6..9e19109e82a 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -87,10 +87,9 @@ class SparsityCsr;
 template <typename ValueType = default_precision>
 class Dense
     : public EnableLinOp<Dense<ValueType>>,
-      public ConvertibleTo<Dense<next_precision_with_half<ValueType>>>,
+      public ConvertibleTo<Dense<next_precision<ValueType>>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Dense<next_precision_with_half<next_precision_with_half<ValueType>>>>,
+      public ConvertibleTo<Dense<next_precision<next_precision<ValueType>>>>,
 #endif
       public ConvertibleTo<Coo<ValueType, int32>>,
       public ConvertibleTo<Coo<ValueType, int64>>,
@@ -139,8 +138,8 @@ class Dense
 public:
     using EnableLinOp<Dense>::convert_to;
     using EnableLinOp<Dense>::move_to;
-    using ConvertibleTo<Dense<next_precision_with_half<ValueType>>>::convert_to;
-    using ConvertibleTo<Dense<next_precision_with_half<ValueType>>>::move_to;
+    using ConvertibleTo<Dense<next_precision<ValueType>>>::convert_to;
+    using ConvertibleTo<Dense<next_precision<ValueType>>>::move_to;
     using ConvertibleTo<Coo<ValueType, int32>>::convert_to;
     using ConvertibleTo<Coo<ValueType, int32>>::move_to;
     using ConvertibleTo<Coo<ValueType, int64>>::convert_to;
@@ -280,28 +279,24 @@ class Dense
         return other->create_const_view_of_impl();
     }
 
-    friend class Dense<previous_precision_with_half<ValueType>>;
+    friend class Dense<previous_precision<ValueType>>;
 
-    void convert_to(
-        Dense<next_precision_with_half<ValueType>>* result) const override;
+    void convert_to(Dense<next_precision<ValueType>>* result) const override;
 
-    void move_to(Dense<next_precision_with_half<ValueType>>* result) override;
+    void move_to(Dense<next_precision<ValueType>>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Dense<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
-    using ConvertibleTo<Dense<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::convert_to;
-    using ConvertibleTo<Dense<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::move_to;
+    friend class Dense<previous_precision<previous_precision<ValueType>>>;
+    using ConvertibleTo<
+        Dense<next_precision<next_precision<ValueType>>>>::convert_to;
+    using ConvertibleTo<
+        Dense<next_precision<next_precision<ValueType>>>>::move_to;
 
-    void convert_to(
-        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) const override;
+    void convert_to(Dense<next_precision<next_precision<ValueType>>>* result)
+        const override;
 
     void move_to(
-        Dense<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) override;
+        Dense<next_precision<next_precision<ValueType>>>* result) override;
 #endif
 
     void convert_to(Coo<ValueType, int32>* result) const override;
diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp
index 3b11399138b..03a485f3ff3 100644
--- a/include/ginkgo/core/matrix/diagonal.hpp
+++ b/include/ginkgo/core/matrix/diagonal.hpp
@@ -41,10 +41,9 @@ class Diagonal
     : public EnableLinOp<Diagonal<ValueType>>,
       public ConvertibleTo<Csr<ValueType, int32>>,
       public ConvertibleTo<Csr<ValueType, int64>>,
-      public ConvertibleTo<Diagonal<next_precision_with_half<ValueType>>>,
+      public ConvertibleTo<Diagonal<next_precision<ValueType>>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<Diagonal<
-          next_precision_with_half<next_precision_with_half<ValueType>>>>,
+      public ConvertibleTo<Diagonal<next_precision<next_precision<ValueType>>>>,
 #endif
       public Transposable,
       public WritableToMatrixData<ValueType, int32>,
@@ -64,9 +63,8 @@ class Diagonal
     using ConvertibleTo<Csr<ValueType, int32>>::move_to;
     using ConvertibleTo<Csr<ValueType, int64>>::convert_to;
     using ConvertibleTo<Csr<ValueType, int64>>::move_to;
-    using ConvertibleTo<
-        Diagonal<next_precision_with_half<ValueType>>>::convert_to;
-    using ConvertibleTo<Diagonal<next_precision_with_half<ValueType>>>::move_to;
+    using ConvertibleTo<Diagonal<next_precision<ValueType>>>::convert_to;
+    using ConvertibleTo<Diagonal<next_precision<ValueType>>>::move_to;
 
     using value_type = ValueType;
     using index_type = int64;
@@ -76,33 +74,28 @@ class Diagonal
     using device_mat_data32 = device_matrix_data<ValueType, int32>;
     using absolute_type = remove_complex<Diagonal>;
 
-    friend class Diagonal<previous_precision_with_half<ValueType>>;
+    friend class Diagonal<previous_precision<ValueType>>;
 
     std::unique_ptr<LinOp> transpose() const override;
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
-    void convert_to(
-        Diagonal<next_precision_with_half<ValueType>>* result) const override;
+    void convert_to(Diagonal<next_precision<ValueType>>* result) const override;
 
-    void move_to(
-        Diagonal<next_precision_with_half<ValueType>>* result) override;
+    void move_to(Diagonal<next_precision<ValueType>>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Diagonal<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>>;
-    using ConvertibleTo<Diagonal<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::convert_to;
-    using ConvertibleTo<Diagonal<next_precision_with_half<
-        next_precision_with_half<ValueType>>>>::move_to;
+    friend class Diagonal<previous_precision<previous_precision<ValueType>>>;
+    using ConvertibleTo<
+        Diagonal<next_precision<next_precision<ValueType>>>>::convert_to;
+    using ConvertibleTo<
+        Diagonal<next_precision<next_precision<ValueType>>>>::move_to;
 
-    void convert_to(
-        Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) const override;
+    void convert_to(Diagonal<next_precision<next_precision<ValueType>>>* result)
+        const override;
 
     void move_to(
-        Diagonal<next_precision_with_half<next_precision_with_half<ValueType>>>*
-            result) override;
+        Diagonal<next_precision<next_precision<ValueType>>>* result) override;
 #endif
 
     void convert_to(Csr<ValueType, int32>* result) const override;
diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp
index adbd3505855..8a1512e3f51 100644
--- a/include/ginkgo/core/matrix/ell.hpp
+++ b/include/ginkgo/core/matrix/ell.hpp
@@ -49,36 +49,32 @@ class Hybrid;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Ell
-    : public EnableLinOp<Ell<ValueType, IndexType>>,
-      public ConvertibleTo<Ell<next_precision_with_half<ValueType>, IndexType>>,
+class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
+            public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>,
+            public ConvertibleTo<
+                Ell<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<Dense<ValueType>>,
-      public ConvertibleTo<Csr<ValueType, IndexType>>,
-      public DiagonalExtractable<ValueType>,
-      public ReadableFromMatrixData<ValueType, IndexType>,
-      public WritableToMatrixData<ValueType, IndexType>,
-      public EnableAbsoluteComputation<
-          remove_complex<Ell<ValueType, IndexType>>> {
+            public ConvertibleTo<Dense<ValueType>>,
+            public ConvertibleTo<Csr<ValueType, IndexType>>,
+            public DiagonalExtractable<ValueType>,
+            public ReadableFromMatrixData<ValueType, IndexType>,
+            public WritableToMatrixData<ValueType, IndexType>,
+            public EnableAbsoluteComputation<
+                remove_complex<Ell<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Ell, LinOp>;
     friend class Dense<ValueType>;
     friend class Coo<ValueType, IndexType>;
     friend class Csr<ValueType, IndexType>;
     friend class Ell<to_complex<ValueType>, IndexType>;
-    friend class Ell<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Ell<previous_precision<ValueType>, IndexType>;
     friend class Hybrid<ValueType, IndexType>;
 
 public:
     using EnableLinOp<Ell>::convert_to;
     using EnableLinOp<Ell>::move_to;
-    using ConvertibleTo<
-        Ell<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Ell<next_precision_with_half<ValueType>, IndexType>>::move_to;
+    using ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Ell<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -91,30 +87,24 @@ class Ell
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Ell>;
 
-    void convert_to(Ell<next_precision_with_half<ValueType>, IndexType>* result)
-        const override;
+    void convert_to(
+        Ell<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Ell<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Ell<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Ell<previous_precision<previous_precision<ValueType>>,
+                     IndexType>;
     using ConvertibleTo<
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::convert_to;
+        Ell<next_precision<next_precision<ValueType>>, IndexType>>::convert_to;
     using ConvertibleTo<
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>>::move_to;
+        Ell<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) const override;
+    void convert_to(Ell<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Ell<next_precision_with_half<next_precision_with_half<ValueType>>,
-            IndexType>* result) override;
+    void move_to(Ell<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Dense<ValueType>* other) const override;
diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp
index 283807b242c..ae11dec6843 100644
--- a/include/ginkgo/core/matrix/fbcsr.hpp
+++ b/include/ginkgo/core/matrix/fbcsr.hpp
@@ -96,24 +96,21 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size)
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Fbcsr
-    : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
-      public ConvertibleTo<
-          Fbcsr<next_precision_with_half<ValueType>, IndexType>>,
+class Fbcsr : public EnableLinOp<Fbcsr<ValueType, IndexType>>,
+              public ConvertibleTo<Fbcsr<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-                IndexType>>,
+              public ConvertibleTo<
+                  Fbcsr<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<Dense<ValueType>>,
-      public ConvertibleTo<Csr<ValueType, IndexType>>,
-      public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
-      public DiagonalExtractable<ValueType>,
-      public ReadableFromMatrixData<ValueType, IndexType>,
-      public WritableToMatrixData<ValueType, IndexType>,
-      public Transposable,
-      public EnableAbsoluteComputation<
-          remove_complex<Fbcsr<ValueType, IndexType>>> {
+              public ConvertibleTo<Dense<ValueType>>,
+              public ConvertibleTo<Csr<ValueType, IndexType>>,
+              public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
+              public DiagonalExtractable<ValueType>,
+              public ReadableFromMatrixData<ValueType, IndexType>,
+              public WritableToMatrixData<ValueType, IndexType>,
+              public Transposable,
+              public EnableAbsoluteComputation<
+                  remove_complex<Fbcsr<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Fbcsr, LinOp>;
     friend class Csr<ValueType, IndexType>;
     friend class Dense<ValueType>;
@@ -143,9 +140,8 @@ class Fbcsr
     using EnableLinOp<Fbcsr<ValueType, IndexType>>::convert_to;
 
     using ConvertibleTo<
-        Fbcsr<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Fbcsr<next_precision_with_half<ValueType>, IndexType>>::move_to;
+        Fbcsr<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Fbcsr<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -153,32 +149,26 @@ class Fbcsr
     using ConvertibleTo<SparsityCsr<ValueType, IndexType>>::convert_to;
     using ConvertibleTo<SparsityCsr<ValueType, IndexType>>::move_to;
 
-    friend class Fbcsr<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Fbcsr<previous_precision<ValueType>, IndexType>;
 
-    void convert_to(Fbcsr<next_precision_with_half<ValueType>, IndexType>*
-                        result) const override;
+    void convert_to(
+        Fbcsr<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Fbcsr<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Fbcsr<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Fbcsr<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Fbcsr<previous_precision<previous_precision<ValueType>>,
+                       IndexType>;
+    using ConvertibleTo<Fbcsr<next_precision<next_precision<ValueType>>,
+                              IndexType>>::convert_to;
     using ConvertibleTo<
-        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>::convert_to;
-    using ConvertibleTo<
-        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>::move_to;
+        Fbcsr<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>* result) const override;
+    void convert_to(Fbcsr<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Fbcsr<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>* result) override;
+    void move_to(Fbcsr<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Dense<ValueType>* other) const override;
diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp
index 24cb3ed26c7..8432e856319 100644
--- a/include/ginkgo/core/matrix/hybrid.hpp
+++ b/include/ginkgo/core/matrix/hybrid.hpp
@@ -41,12 +41,10 @@ class Csr;
 template <typename ValueType = default_precision, typename IndexType = int32>
 class Hybrid
     : public EnableLinOp<Hybrid<ValueType, IndexType>>,
-      public ConvertibleTo<
-          Hybrid<next_precision_with_half<ValueType>, IndexType>>,
+      public ConvertibleTo<Hybrid<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
       public ConvertibleTo<
-          Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-                 IndexType>>,
+          Hybrid<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
       public ConvertibleTo<Dense<ValueType>>,
       public ConvertibleTo<Csr<ValueType, IndexType>>,
@@ -65,9 +63,8 @@ class Hybrid
     using EnableLinOp<Hybrid>::convert_to;
     using EnableLinOp<Hybrid>::move_to;
     using ConvertibleTo<
-        Hybrid<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Hybrid<next_precision_with_half<ValueType>, IndexType>>::move_to;
+        Hybrid<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Hybrid<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -362,32 +359,26 @@ class Hybrid
         imbalance_bounded_limit strategy_;
     };
 
-    friend class Hybrid<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Hybrid<previous_precision<ValueType>, IndexType>;
 
-    void convert_to(Hybrid<next_precision_with_half<ValueType>, IndexType>*
-                        result) const override;
+    void convert_to(
+        Hybrid<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(Hybrid<next_precision_with_half<ValueType>, IndexType>* result)
-        override;
+    void move_to(Hybrid<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Hybrid<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Hybrid<previous_precision<previous_precision<ValueType>>,
+                        IndexType>;
+    using ConvertibleTo<Hybrid<next_precision<next_precision<ValueType>>,
+                               IndexType>>::convert_to;
     using ConvertibleTo<
-        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-               IndexType>>::convert_to;
-    using ConvertibleTo<
-        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-               IndexType>>::move_to;
+        Hybrid<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-               IndexType>* result) const override;
+    void convert_to(Hybrid<next_precision<next_precision<ValueType>>,
+                           IndexType>* result) const override;
 
-    void move_to(
-        Hybrid<next_precision_with_half<next_precision_with_half<ValueType>>,
-               IndexType>* result) override;
+    void move_to(Hybrid<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Dense<ValueType>* other) const override;
diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp
index 6140a832c85..529df8e9f25 100644
--- a/include/ginkgo/core/matrix/sellp.hpp
+++ b/include/ginkgo/core/matrix/sellp.hpp
@@ -40,22 +40,19 @@ class Csr;
  * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
-class Sellp
-    : public EnableLinOp<Sellp<ValueType, IndexType>>,
-      public ConvertibleTo<
-          Sellp<next_precision_with_half<ValueType>, IndexType>>,
+class Sellp : public EnableLinOp<Sellp<ValueType, IndexType>>,
+              public ConvertibleTo<Sellp<next_precision<ValueType>, IndexType>>,
 #if GINKGO_ENABLE_HALF
-      public ConvertibleTo<
-          Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-                IndexType>>,
+              public ConvertibleTo<
+                  Sellp<next_precision<next_precision<ValueType>>, IndexType>>,
 #endif
-      public ConvertibleTo<Dense<ValueType>>,
-      public ConvertibleTo<Csr<ValueType, IndexType>>,
-      public DiagonalExtractable<ValueType>,
-      public ReadableFromMatrixData<ValueType, IndexType>,
-      public WritableToMatrixData<ValueType, IndexType>,
-      public EnableAbsoluteComputation<
-          remove_complex<Sellp<ValueType, IndexType>>> {
+              public ConvertibleTo<Dense<ValueType>>,
+              public ConvertibleTo<Csr<ValueType, IndexType>>,
+              public DiagonalExtractable<ValueType>,
+              public ReadableFromMatrixData<ValueType, IndexType>,
+              public WritableToMatrixData<ValueType, IndexType>,
+              public EnableAbsoluteComputation<
+                  remove_complex<Sellp<ValueType, IndexType>>> {
     friend class EnablePolymorphicObject<Sellp, LinOp>;
     friend class Dense<ValueType>;
     friend class Csr<ValueType, IndexType>;
@@ -65,9 +62,8 @@ class Sellp
     using EnableLinOp<Sellp>::convert_to;
     using EnableLinOp<Sellp>::move_to;
     using ConvertibleTo<
-        Sellp<next_precision_with_half<ValueType>, IndexType>>::convert_to;
-    using ConvertibleTo<
-        Sellp<next_precision_with_half<ValueType>, IndexType>>::move_to;
+        Sellp<next_precision<ValueType>, IndexType>>::convert_to;
+    using ConvertibleTo<Sellp<next_precision<ValueType>, IndexType>>::move_to;
     using ConvertibleTo<Dense<ValueType>>::convert_to;
     using ConvertibleTo<Dense<ValueType>>::move_to;
     using ConvertibleTo<Csr<ValueType, IndexType>>::convert_to;
@@ -80,32 +76,26 @@ class Sellp
     using device_mat_data = device_matrix_data<ValueType, IndexType>;
     using absolute_type = remove_complex<Sellp>;
 
-    friend class Sellp<previous_precision_with_half<ValueType>, IndexType>;
+    friend class Sellp<previous_precision<ValueType>, IndexType>;
 
-    void convert_to(Sellp<next_precision_with_half<ValueType>, IndexType>*
-                        result) const override;
+    void convert_to(
+        Sellp<next_precision<ValueType>, IndexType>* result) const override;
 
-    void move_to(
-        Sellp<next_precision_with_half<ValueType>, IndexType>* result) override;
+    void move_to(Sellp<next_precision<ValueType>, IndexType>* result) override;
 
 #if GINKGO_ENABLE_HALF
-    friend class Sellp<
-        previous_precision_with_half<previous_precision_with_half<ValueType>>,
-        IndexType>;
+    friend class Sellp<previous_precision<previous_precision<ValueType>>,
+                       IndexType>;
+    using ConvertibleTo<Sellp<next_precision<next_precision<ValueType>>,
+                              IndexType>>::convert_to;
     using ConvertibleTo<
-        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>::convert_to;
-    using ConvertibleTo<
-        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>>::move_to;
+        Sellp<next_precision<next_precision<ValueType>>, IndexType>>::move_to;
 
-    void convert_to(
-        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>* result) const override;
+    void convert_to(Sellp<next_precision<next_precision<ValueType>>, IndexType>*
+                        result) const override;
 
-    void move_to(
-        Sellp<next_precision_with_half<next_precision_with_half<ValueType>>,
-              IndexType>* result) override;
+    void move_to(Sellp<next_precision<next_precision<ValueType>>, IndexType>*
+                     result) override;
 #endif
 
     void convert_to(Dense<ValueType>* other) const override;
diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
index a860c3c4b24..7ef9b77601d 100644
--- a/reference/test/base/batch_multi_vector_kernels.cpp
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -343,7 +343,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto tmp = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -367,7 +367,7 @@ TYPED_TEST(MultiVector, MovesToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto tmp = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -391,7 +391,7 @@ TYPED_TEST(MultiVector, ConvertsEmptyToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto empty = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
@@ -406,7 +406,7 @@ TYPED_TEST(MultiVector, MovesEmptyToPrecision)
 {
     using MultiVector = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
     auto empty = OtherMultiVector::create(this->exec);
     auto res = MultiVector::create(this->exec);
diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp
index 149aaa33256..cfb5abcdba0 100644
--- a/reference/test/base/combination.cpp
+++ b/reference/test/base/combination.cpp
@@ -115,7 +115,7 @@ TYPED_TEST(Combination, AppliesToMixedVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type = gko::next_precision_with_half<TypeParam>;
+    using value_type = gko::next_precision<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -157,8 +157,7 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type =
-        gko::to_complex<gko::next_precision_with_half<TypeParam>>;
+    using value_type = gko::to_complex<gko::next_precision<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -202,7 +201,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using value_type = gko::next_precision_with_half<TypeParam>;
+    using value_type = gko::next_precision<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmb = gko::Combination<TypeParam>::create(
         this->coefficients[0], this->operators[0], this->coefficients[1],
@@ -250,8 +249,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector)
         cmb = [ 8 7 ]
               [ 5 4 ]
     */
-    using MixedDense =
-        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
+    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmb = gko::Combination<TypeParam>::create(
diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp
index d17b8602ce8..9767fab466c 100644
--- a/reference/test/base/composition.cpp
+++ b/reference/test/base/composition.cpp
@@ -143,7 +143,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using Mtx = gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
+    using Mtx = gko::matrix::Dense<gko::next_precision<TypeParam>>;
     using value_type = typename Mtx::value_type;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto x = gko::initialize<Mtx>({1.0, 2.0}, this->exec);
@@ -183,8 +183,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using value_type =
-        gko::next_precision_with_half<gko::to_complex<TypeParam>>;
+    using value_type = gko::next_precision<gko::to_complex<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto x = gko::initialize<Mtx>(
@@ -224,7 +223,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using value_type = gko::next_precision_with_half<TypeParam>;
+    using value_type = gko::next_precision<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
     auto alpha = gko::initialize<Mtx>({3.0}, this->exec);
@@ -269,8 +268,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector)
         cmp = [ -9 -2 ]
               [ 27 26 ]
     */
-    using MixedDense =
-        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
+    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmp = gko::Composition<TypeParam>::create(this->product);
diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp
index 50a5fe7db20..1563f22e175 100644
--- a/reference/test/base/perturbation.cpp
+++ b/reference/test/base/perturbation.cpp
@@ -102,7 +102,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using Mtx = gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
+    using Mtx = gko::matrix::Dense<gko::next_precision<TypeParam>>;
     using value_type = typename Mtx::value_type;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -144,8 +144,7 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using value_type =
-        gko::to_complex<gko::next_precision_with_half<TypeParam>>;
+    using value_type = gko::to_complex<gko::next_precision<TypeParam>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -187,7 +186,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using value_type = gko::next_precision_with_half<TypeParam>;
+    using value_type = gko::next_precision<TypeParam>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
                                                     this->projector);
@@ -234,8 +233,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector)
         cmp = I + 2 * [ 2 ] * [ 3 2 ]
                       [ 1 ]
     */
-    using MixedDense =
-        gko::matrix::Dense<gko::next_precision_with_half<TypeParam>>;
+    using MixedDense = gko::matrix::Dense<gko::next_precision<TypeParam>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using value_type = typename MixedDenseComplex::value_type;
     auto cmp = gko::Perturbation<TypeParam>::create(this->scalar, this->basis,
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index 6ffea5d0e7d..b3f58ec3083 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -32,8 +32,7 @@ class Coo : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Mtx = gko::matrix::Coo<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
 
     Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec))
     {
@@ -81,7 +80,7 @@ TYPED_TEST(Coo, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
@@ -103,7 +102,7 @@ TYPED_TEST(Coo, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto tmp = OtherCoo::create(this->exec);
@@ -216,7 +215,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
@@ -233,7 +232,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Coo = typename TestFixture::Mtx;
     using OtherCoo = gko::matrix::Coo<OtherType, IndexType>;
     auto empty = OtherCoo::create(this->exec);
@@ -704,7 +703,7 @@ TYPED_TEST(Coo, AppliesToComplex)
 TYPED_TEST(Coo, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -760,7 +759,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex)
 TYPED_TEST(Coo, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -818,7 +817,7 @@ TYPED_TEST(Coo, ApplyAddsToComplex)
 TYPED_TEST(Coo, ApplyAddsToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedVec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -875,7 +874,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex)
 TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index b417eb93f52..61018563769 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -46,8 +46,7 @@ class Csr : public ::testing::Test {
     using Ell = gko::matrix::Ell<value_type, index_type>;
     using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
     using Perm = gko::matrix::Permutation<index_type>;
     using ScaledPerm = gko::matrix::ScaledPermutation<value_type, index_type>;
 
@@ -370,7 +369,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = typename gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -385,7 +384,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
@@ -401,9 +400,9 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
-    using Vec2 = gko::matrix::Dense<gko::next_precision_with_half<T>>;
+    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
     auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
@@ -434,7 +433,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     // clang-format off
     auto x = gko::initialize<Vec>(
@@ -458,7 +457,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -483,7 +482,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -524,7 +523,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -541,7 +540,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -559,7 +558,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -597,7 +596,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -621,7 +620,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -641,7 +640,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -790,7 +789,7 @@ TYPED_TEST(Csr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -816,7 +815,7 @@ TYPED_TEST(Csr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto tmp = OtherCsr::create(this->exec);
@@ -994,7 +993,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
@@ -1013,7 +1012,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Csr = typename TestFixture::Mtx;
     using OtherCsr = gko::matrix::Csr<OtherType, IndexType>;
     auto empty = OtherCsr::create(this->exec);
@@ -2050,7 +2049,7 @@ TYPED_TEST(Csr, AppliesToComplex)
 TYPED_TEST(Csr, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -2106,7 +2105,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex)
 TYPED_TEST(Csr, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 3854cd56dff..9ab59b0b4b8 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -37,8 +37,7 @@ class Dense : public ::testing::Test {
 protected:
     using value_type = T;
     using Mtx = gko::matrix::Dense<value_type>;
-    using MixedMtx =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedMtx = gko::matrix::Dense<gko::next_precision<value_type>>;
     using ComplexMtx = gko::to_complex<Mtx>;
     using RealMtx = gko::remove_complex<Mtx>;
     Dense()
@@ -746,7 +745,7 @@ TYPED_TEST(Dense, ConvertsToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -768,7 +767,7 @@ TYPED_TEST(Dense, MovesToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto tmp = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -1067,7 +1066,7 @@ TYPED_TEST(Dense, AppliesToComplex)
 TYPED_TEST(Dense, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -1121,7 +1120,7 @@ TYPED_TEST(Dense, AdvancedAppliesToComplex)
 TYPED_TEST(Dense, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
@@ -2014,7 +2013,7 @@ TYPED_TEST(Dense, ConvertsEmptyToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto empty = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
@@ -2029,7 +2028,7 @@ TYPED_TEST(Dense, MovesEmptyToPrecision)
 {
     using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision_with_half<T>;
+    using OtherT = typename gko::next_precision<T>;
     using OtherDense = typename gko::matrix::Dense<OtherT>;
     auto empty = OtherDense::create(this->exec);
     auto res = Dense::create(this->exec);
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index d1208e96178..b25c3593595 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -30,8 +30,7 @@ class Diagonal : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type>;
     using Diag = gko::matrix::Diagonal<value_type>;
     using Dense = gko::matrix::Dense<value_type>;
-    using MixedDense =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedDense = gko::matrix::Dense<gko::next_precision<value_type>>;
 
     Diagonal()
         : exec(gko::ReferenceExecutor::create()),
@@ -87,7 +86,7 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf,
 TYPED_TEST(Diagonal, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
@@ -109,7 +108,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision)
 TYPED_TEST(Diagonal, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Diagonal = typename TestFixture::Diag;
     using OtherDiagonal = gko::matrix::Diagonal<OtherType>;
     auto tmp = OtherDiagonal::create(this->exec);
@@ -576,7 +575,7 @@ TYPED_TEST(Diagonal, AppliesToComplex)
 TYPED_TEST(Diagonal, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -636,7 +635,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex)
 TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     using Scalar = gko::matrix::Dense<mixed_value_type>;
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 7f3c770c603..4f96742d3f7 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -30,8 +30,7 @@ class Ell : public ::testing::Test {
     using Mtx = gko::matrix::Ell<value_type, index_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
 
     Ell()
         : exec(gko::ReferenceExecutor::create()),
@@ -93,7 +92,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = typename gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -108,7 +107,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
@@ -124,9 +123,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
-    using Vec2 = gko::matrix::Dense<gko::next_precision_with_half<T>>;
+    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
     auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
@@ -162,7 +161,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     // clang-format off
     auto x = gko::initialize<Vec>(
@@ -186,7 +185,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -211,7 +210,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     // clang-format off
@@ -250,7 +249,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -267,7 +266,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -285,7 +284,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -329,7 +328,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1)
 {
     // Both vectors have the same value type which differs from the matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -357,7 +356,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2)
 {
     // Input vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
@@ -386,7 +385,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3)
 {
     // Output vector has same value type as matrix
     using T = typename TestFixture::value_type;
-    using next_T = gko::next_precision_with_half<T>;
+    using next_T = gko::next_precision<T>;
     using Vec1 = typename TestFixture::Vec;
     using Vec2 = gko::matrix::Dense<next_T>;
     auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
@@ -445,7 +444,7 @@ TYPED_TEST(Ell, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
@@ -468,7 +467,7 @@ TYPED_TEST(Ell, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto tmp = OtherEll::create(this->exec);
@@ -737,7 +736,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
@@ -754,7 +753,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Ell = typename TestFixture::Mtx;
     using OtherEll = gko::matrix::Ell<OtherType, IndexType>;
     auto empty = Ell::create(this->exec);
@@ -899,7 +898,7 @@ TYPED_TEST(Ell, AppliesToComplex)
 TYPED_TEST(Ell, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -956,7 +955,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex)
 TYPED_TEST(Ell, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index 9d9e2144cc3..decae21b986 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -272,7 +272,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
@@ -295,7 +295,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto tmp = OtherFbcsr::create(this->exec);
@@ -393,7 +393,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
@@ -412,7 +412,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Fbcsr = typename TestFixture::Mtx;
     using OtherFbcsr = gko::matrix::Fbcsr<OtherType, IndexType>;
     auto empty = OtherFbcsr::create(this->exec);
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index c5e6496dce1..bdc2724e1de 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -32,8 +32,7 @@ class Hybrid : public ::testing::Test {
     using Mtx = gko::matrix::Hybrid<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
-    using MixedVec =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
 
     Hybrid()
         : exec(gko::ReferenceExecutor::create()),
@@ -235,7 +234,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
@@ -258,7 +257,7 @@ TYPED_TEST(Hybrid, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto tmp = OtherHybrid::create(this->exec);
@@ -369,7 +368,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
@@ -386,7 +385,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Hybrid = typename TestFixture::Mtx;
     using OtherHybrid = gko::matrix::Hybrid<OtherType, IndexType>;
     auto other = Hybrid::create(this->exec);
@@ -701,7 +700,7 @@ TYPED_TEST(Hybrid, AppliesToComplex)
 TYPED_TEST(Hybrid, AppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using Vec = gko::matrix::Dense<mixed_complex_type>;
     auto exec = gko::ReferenceExecutor::create();
@@ -758,7 +757,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex)
 TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex)
 {
     using mixed_value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+        gko::next_precision<typename TestFixture::value_type>;
     using mixed_complex_type = gko::to_complex<mixed_value_type>;
     using MixedDense = gko::matrix::Dense<mixed_value_type>;
     using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp
index 82704145978..dc0f799ba70 100644
--- a/reference/test/matrix/identity.cpp
+++ b/reference/test/matrix/identity.cpp
@@ -19,8 +19,7 @@ class Identity : public ::testing::Test {
     using value_type = T;
     using Id = gko::matrix::Identity<value_type>;
     using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec =
-        gko::matrix::Dense<gko::next_precision_with_half<value_type>>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
     using ComplexVec = gko::to_complex<Vec>;
     using MixedComplexVec = gko::to_complex<MixedVec>;
 
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 23251c63b8f..5e056997163 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -68,8 +68,7 @@ TYPED_TEST(Sellp, AppliesToDenseVector)
 
 TYPED_TEST(Sellp, AppliesToMixedDenseVector)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -118,8 +117,7 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector)
 
 TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -192,7 +190,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
@@ -215,7 +213,7 @@ TYPED_TEST(Sellp, MovesToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto tmp = OtherSellp::create(this->exec);
@@ -313,7 +311,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
@@ -332,7 +330,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision)
 {
     using ValueType = typename TestFixture::value_type;
     using IndexType = typename TestFixture::index_type;
-    using OtherType = gko::next_precision_with_half<ValueType>;
+    using OtherType = gko::next_precision<ValueType>;
     using Sellp = typename TestFixture::Mtx;
     using OtherSellp = gko::matrix::Sellp<OtherType, IndexType>;
     auto empty = OtherSellp::create(this->exec);
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index 30805d033ab..03adbece035 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -145,7 +145,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector)
 
 TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector)
 {
-    using T = gko::next_precision_with_half<typename TestFixture::value_type>;
+    using T = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<T>;
     auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
     auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
@@ -192,7 +192,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector)
 
 TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector)
 {
-    using T = gko::next_precision_with_half<typename TestFixture::value_type>;
+    using T = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<T>;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
     auto beta = gko::initialize<Vec>({2.0}, this->exec);
@@ -243,8 +243,8 @@ TYPED_TEST(SparsityCsr, AppliesToComplex)
 
 TYPED_TEST(SparsityCsr, AppliesToMixedComplex)
 {
-    using T = gko::next_precision_with_half<
-        gko::to_complex<typename TestFixture::value_type>>;
+    using T =
+        gko::next_precision<gko::to_complex<typename TestFixture::value_type>>;
     using Vec = gko::matrix::Dense<T>;
     auto x = gko::initialize<Vec>({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}},
                                   this->exec);
@@ -279,7 +279,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex)
 TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex)
 {
     using Vec = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using ComplexVec = gko::to_complex<Vec>;
     using T = typename ComplexVec::value_type;
     auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp
index aabd6c64d73..3084f60ca1c 100644
--- a/reference/test/preconditioner/ic.cpp
+++ b/reference/test/preconditioner/ic.cpp
@@ -246,7 +246,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using T = typename TestFixture::value_type;
-    using Vec = gko::matrix::Dense<gko::next_precision_with_half<T>>;
+    using Vec = gko::matrix::Dense<gko::next_precision<T>>;
     const auto b = gko::initialize<Vec>({1.0, 3.0, 6.0}, this->exec);
     auto x = Vec::create(this->exec, gko::dim<2>{3, 1});
     auto preconditioner =
@@ -279,8 +279,8 @@ TYPED_TEST(Ic, SolvesSingleRhsComplex)
 TYPED_TEST(Ic, SolvesSingleRhsComplexMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
-    using Vec = gko::matrix::Dense<gko::next_precision_with_half<
-        gko::to_complex<typename TestFixture::value_type>>>;
+    using Vec = gko::matrix::Dense<
+        gko::next_precision<gko::to_complex<typename TestFixture::value_type>>>;
     using T = typename Vec::value_type;
     const auto b = gko::initialize<Vec>(
         {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec);
@@ -316,7 +316,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using T = typename TestFixture::value_type;
-    using Vec = gko::matrix::Dense<gko::next_precision_with_half<T>>;
+    using Vec = gko::matrix::Dense<gko::next_precision<T>>;
     const auto b = gko::initialize<Vec>({1.0, 3.0, 6.0}, this->exec);
     const auto alpha = gko::initialize<Vec>({2.0}, this->exec);
     const auto beta = gko::initialize<Vec>({-1.0}, this->exec);
@@ -356,7 +356,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed)
 {
     using ic_prec_type = typename TestFixture::ic_prec_type;
     using MixedDense = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using T = typename MixedDenseComplex::value_type;
     const auto b = gko::initialize<MixedDenseComplex>(
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index e4c4809f084..1c7deddf56c 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -316,7 +316,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx)
 TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx)
 {
     using Mtx = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     const auto b = gko::initialize<Mtx>({1.0, 3.0, 6.0}, this->exec);
     auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
     x->copy_from(b);
@@ -349,8 +349,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx)
 
 TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx)
 {
-    using Mtx = gko::matrix::Dense<gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>>;
+    using Mtx = gko::matrix::Dense<
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>>;
     using T = typename Mtx::value_type;
     const auto b = gko::initialize<Mtx>(
         {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec);
@@ -403,8 +403,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs)
 
 TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     const value_type alpha{2.0};
     const auto alpha_linop = gko::initialize<Mtx>({alpha}, this->exec);
@@ -454,8 +453,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex)
 
 TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using complex_type = gko::to_complex<value_type>;
     using MixedDense = gko::matrix::Dense<value_type>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 1bc0aa37470..2697efc3cd5 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -478,7 +478,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    using next_type = gko::next_precision_with_half<value_type>;
+    using next_type = gko::next_precision<value_type>;
     using Bj = typename TestFixture::Bj;
     auto csr =
         gko::share(gko::matrix::Csr<next_type, index_type>::create(this->exec));
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index 2b75c2a5590..d5f9346eab6 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -649,8 +649,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector)
 
 TYPED_TEST(Jacobi, AppliesToMixedVector)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
     auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
@@ -690,8 +689,8 @@ TYPED_TEST(Jacobi, AppliesToComplexVector)
 
 TYPED_TEST(Jacobi, AppliesToMixedComplexVector)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>(
         {value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0},
@@ -896,8 +895,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector)
 
 TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
     auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
@@ -940,8 +938,7 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector)
 
 TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using MixedDense = gko::matrix::Dense<value_type>;
     using MixedDenseComplex = gko::to_complex<MixedDense>;
     using T = gko::to_complex<value_type>;
diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp
index b9924cd9418..8fb3c885cf3 100644
--- a/reference/test/reorder/scaled_reordered.cpp
+++ b/reference/test/reorder/scaled_reordered.cpp
@@ -451,7 +451,7 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed)
 {
     using SR = typename TestFixture::SR;
     using T = typename TestFixture::value_type;
-    using OtherT = gko::next_precision_with_half<T>;
+    using OtherT = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<OtherT>;
     auto scaled_reordered_fact = SR::build()
                                      .with_row_scaling(this->diag2)
@@ -500,7 +500,7 @@ TYPED_TEST(ScaledReordered,
 {
     using SR = typename TestFixture::SR;
     using T = typename TestFixture::value_type;
-    using OtherT = gko::next_precision_with_half<T>;
+    using OtherT = gko::next_precision<T>;
     using Vec = gko::matrix::Dense<OtherT>;
     auto scaled_reordered_fact = SR::build()
                                      .with_row_scaling(this->diag2)
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index 13d81de0c7a..a8f397aed1e 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -266,8 +266,7 @@ TYPED_TEST(Bicg, SolvesStencilSystem)
 
 TYPED_TEST(Bicg, SolvesStencilSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -303,8 +302,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -359,8 +358,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -402,7 +400,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->bicg_factory->generate(this->mtx);
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index ce17f25f47e..5bbff5b1a5e 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -384,8 +384,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystem)
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -421,8 +420,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex)
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -497,8 +496,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply)
 
 TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->bicgstab_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -540,7 +538,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->bicgstab_factory->generate(this->mtx);
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index ae3ecce9963..cee760f7840 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -160,7 +160,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystem)
 
 TYPED_TEST(CbGmres, SolvesStencilSystemMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_base<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cb_gmres_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
@@ -198,8 +199,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystemComplex)
 
 TYPED_TEST(CbGmres, SolvesStencilSystemMixedComplex)
 {
-    using value_type =
-        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
+    using value_type = gko::to_complex<
+        gko::next_precision_base<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cb_gmres_factory->generate(this->mtx);
     auto b =
@@ -280,7 +281,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type = gko::next_precision<typename TestFixture::value_type>;
+    using value_type =
+        gko::next_precision_base<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cb_gmres_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -324,7 +326,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision<typename TestFixture::value_type>>;
+        gko::next_precision_base<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->cb_gmres_factory->generate(this->mtx);
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index fd708d736bc..8cbc9a09c18 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -228,8 +228,7 @@ TYPED_TEST(Cg, SolvesStencilSystem)
 
 TYPED_TEST(Cg, SolvesStencilSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -265,8 +264,8 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Cg, SolvesStencilSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -321,8 +320,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -364,7 +362,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->cg_factory->generate(this->mtx);
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index a06c087776c..c51a3252540 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -293,8 +293,7 @@ TYPED_TEST(Cgs, SolvesDenseSystem)
 
 TYPED_TEST(Cgs, SolvesDenseSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -330,8 +329,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex)
 
 TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -387,8 +386,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply)
 
 TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->cgs_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -430,7 +428,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->cgs_factory->generate(this->mtx);
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index 88615921f34..4b8484a78c8 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -242,8 +242,7 @@ TYPED_TEST(Fcg, SolvesStencilSystem)
 
 TYPED_TEST(Fcg, SolvesStencilSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -279,8 +278,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex)
 
 TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -335,8 +334,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->fcg_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -378,7 +376,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->fcg_factory->generate(this->mtx);
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index af8e74888d0..be747a2f84b 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -225,8 +225,7 @@ TYPED_TEST(Gcr, SolvesStencilSystem)
 
 TYPED_TEST(Gcr, SolvesStencilSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
@@ -263,8 +262,8 @@ TYPED_TEST(Gcr, SolvesStencilSystemComplex)
 
 TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto b =
@@ -320,8 +319,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gcr_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -364,7 +362,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->gcr_factory->generate(this->mtx);
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index abecc6b2a79..1719dfe1062 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -434,8 +434,7 @@ TYPED_TEST(Gmres, SolvesStencilSystem)
 
 TYPED_TEST(Gmres, SolvesStencilSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({13.0, 7.0, 1.0}, this->exec);
@@ -472,8 +471,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex)
 
 TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto b =
@@ -529,8 +528,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply)
 
 TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->gmres_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -573,7 +571,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->gmres_factory->generate(this->mtx);
diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp
index 420a3f15684..a5aee44029c 100644
--- a/reference/test/solver/idr_kernels.cpp
+++ b/reference/test/solver/idr_kernels.cpp
@@ -80,7 +80,7 @@ TYPED_TEST(Idr, SolvesDenseSystem)
 TYPED_TEST(Idr, SolvesDenseSystemMixed)
 {
     using T = typename TestFixture::value_type;
-    using value_type = gko::next_precision_with_half<T>;
+    using value_type = gko::next_precision<T>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, this->exec);
@@ -118,7 +118,7 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex)
 TYPED_TEST(Idr, SolvesDenseSystemMixedComplex)
 {
     using T = typename TestFixture::value_type;
-    using value_type = gko::to_complex<gko::next_precision_with_half<T>>;
+    using value_type = gko::to_complex<gko::next_precision<T>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -237,8 +237,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply)
 
 TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->idr_factory->generate(this->mtx);
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
@@ -280,7 +279,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->idr_factory->generate(this->mtx);
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index f329a16d932..fc14711bec2 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -82,8 +82,7 @@ TYPED_TEST(Ir, SolvesTriangularSystem)
 
 TYPED_TEST(Ir, SolvesTriangularSystemMixed)
 {
-    using value_type =
-        gko::next_precision_with_half<typename TestFixture::value_type>;
+    using value_type = gko::next_precision<typename TestFixture::value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->ir_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
@@ -119,8 +118,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex)
 
 TYPED_TEST(Ir, SolvesTriangularSystemMixedComplex)
 {
-    using value_type = gko::to_complex<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+    using value_type =
+        gko::to_complex<gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto solver = this->ir_factory->generate(this->mtx);
     auto b = gko::initialize<Mtx>(
@@ -245,7 +244,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using Scalar = gko::matrix::Dense<
-        gko::next_precision_with_half<typename TestFixture::value_type>>;
+        gko::next_precision<typename TestFixture::value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto solver = this->ir_factory->generate(this->mtx);
diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index 6d54efd2913..dcb7d6c6f0c 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -108,7 +108,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision_with_half<other_value_type>;
+    using value_type = gko::next_precision<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({1.0, 2.0, 1.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
@@ -146,8 +146,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar =
-        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
+    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
@@ -218,7 +217,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision_with_half<other_value_type>;
+    using value_type = gko::next_precision<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
     auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
@@ -260,8 +259,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar =
-        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
+    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto alpha = gko::initialize<Scalar>({2.0}, this->exec);
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index 8aad93a1efb..31c27a4551a 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -229,13 +229,11 @@ class Multigrid : public ::testing::Test {
     using Mtx = gko::matrix::Dense<value_type>;
     using Solver = gko::solver::Multigrid;
     using Coarse = gko::multigrid::Pgm<value_type>;
-    using CoarseNext =
-        gko::multigrid::Pgm<gko::next_precision_with_half<value_type>>;
+    using CoarseNext = gko::multigrid::Pgm<gko::next_precision<value_type>>;
     using Smoother = gko::solver::Ir<value_type>;
     using InnerSolver = gko::preconditioner::Jacobi<value_type>;
     using CoarsestSolver = gko::solver::Cg<value_type>;
-    using CoarsestNextSolver =
-        gko::solver::Cg<gko::next_precision_with_half<value_type>>;
+    using CoarsestNextSolver = gko::solver::Cg<gko::next_precision<value_type>>;
     using DummyRPFactory = DummyMultigridLevelWithFactory<value_type>;
     using DummyFactory = DummyLinOpWithFactory<value_type>;
     Multigrid()
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index 870542593ff..15f0f3c2996 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -108,7 +108,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision_with_half<other_value_type>;
+    using value_type = gko::next_precision<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>({4.0, 2.0, 3.0}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
@@ -146,8 +146,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar =
-        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
+    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     std::shared_ptr<Mtx> b = gko::initialize<Mtx>(
@@ -219,7 +218,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed)
 {
     using other_value_type = typename TestFixture::value_type;
-    using value_type = gko::next_precision_with_half<other_value_type>;
+    using value_type = gko::next_precision<other_value_type>;
     using Mtx = gko::matrix::Dense<value_type>;
     auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
     auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
@@ -261,8 +260,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyComplex)
 TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex)
 {
     using other_value_type = typename TestFixture::value_type;
-    using Scalar =
-        gko::matrix::Dense<gko::next_precision_with_half<other_value_type>>;
+    using Scalar = gko::matrix::Dense<gko::next_precision<other_value_type>>;
     using Mtx = gko::to_complex<typename TestFixture::Mtx>;
     using value_type = typename Mtx::value_type;
     auto alpha = gko::initialize<Scalar>({2.0}, this->exec);
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 9b3f09a13fc..27c4a5a9494 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -34,7 +34,7 @@ class Csr : public CommonTestFixture {
 protected:
     using Arr = gko::array<int>;
     using Vec = gko::matrix::Dense<value_type>;
-    using Vec2 = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using Vec2 = gko::matrix::Dense<gko::next_precision_base<value_type>>;
     using Mtx = gko::matrix::Csr<value_type>;
     using ComplexVec = gko::matrix::Dense<std::complex<value_type>>;
     using ComplexMtx = gko::matrix::Csr<std::complex<value_type>>;
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index 0b06f76df85..98025d9a3a8 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -557,7 +557,7 @@ class Matrix : public CommonTestFixture {
     using Mtx = typename T::matrix_type;
     using index_type = typename Mtx::index_type;
     using value_type = typename Mtx::value_type;
-    using mixed_value_type = gko::next_precision<value_type>;
+    using mixed_value_type = gko::next_precision_base<value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     using MixedVec = gko::matrix::Dense<mixed_value_type>;
 
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 88fe4092668..85e0a43d146 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -733,7 +733,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision)
     using csr = typename TestFixture::local_matrix_type;
     using local_index_type = typename TestFixture::local_index_type;
     using global_index_type = typename TestFixture::global_index_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_base<T>;
     using OtherDist = typename gko::experimental::distributed::Matrix<
         OtherT, local_index_type, global_index_type>;
     auto tmp = OtherDist::create(this->ref, this->comm);
@@ -759,7 +759,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision)
     using csr = typename TestFixture::local_matrix_type;
     using local_index_type = typename TestFixture::local_index_type;
     using global_index_type = typename TestFixture::global_index_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_base<T>;
     using OtherDist = typename gko::experimental::distributed::Matrix<
         OtherT, local_index_type, global_index_type>;
     auto tmp = OtherDist::create(this->ref, this->comm);
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index be9f6865c86..80142e2e947 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -45,7 +45,7 @@ template <typename SolverType>
 struct SimpleSolverTest {
     using solver_type = SolverType;
     using value_type = typename solver_type::value_type;
-    using mixed_value_type = gko::next_precision<value_type>;
+    using mixed_value_type = gko::next_precision_base<value_type>;
     using local_index_type = gko::int32;
     using global_index_type = gko::int64;
     using dist_matrix_type =
@@ -229,7 +229,7 @@ class Solver : public CommonMpiTestFixture {
     using local_index_type = typename T::local_index_type;
     using global_index_type = typename T::global_index_type;
     using value_type = typename T::value_type;
-    using mixed_value_type = gko::next_precision<value_type>;
+    using mixed_value_type = gko::next_precision_base<value_type>;
     using Vec = typename T::dist_vector_type;
     using LocalVec = typename T::non_dist_vector_type;
     using MixedVec = typename T::mixed_dist_vector_type;
diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp
index cedd483b0a2..53990650ed7 100644
--- a/test/mpi/vector.cpp
+++ b/test/mpi/vector.cpp
@@ -839,7 +839,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported)
 TYPED_TEST(VectorLocalOps, ConvertsToPrecision)
 {
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_base<T>;
     using OtherVector = typename gko::experimental::distributed::Vector<OtherT>;
     auto local_tmp = OtherVector::local_vector_type::create(this->exec);
     auto tmp = OtherVector::create(this->exec, this->comm);
@@ -855,7 +855,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision)
 TYPED_TEST(VectorLocalOps, MovesToPrecision)
 {
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
+    using OtherT = typename gko::next_precision_base<T>;
     using OtherVector = typename gko::experimental::distributed::Vector<OtherT>;
     auto local_tmp = OtherVector::local_vector_type::create(this->exec);
     auto tmp = OtherVector::create(this->exec, this->comm);
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index 47414f83041..57e93295940 100644
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -520,7 +520,7 @@ class Solver : public CommonTestFixture {
     using Precond = typename T::precond_type;
     using Mtx = typename T::matrix_type;
     using value_type = typename Mtx::value_type;
-    using mixed_value_type = gko::next_precision<value_type>;
+    using mixed_value_type = gko::next_precision_base<value_type>;
     using Vec = gko::matrix::Dense<value_type>;
     using MixedVec = gko::matrix::Dense<mixed_value_type>;
 

From 5852c12365fb388967c7ef108868bfc30bed46d2 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Dec 2024 11:59:13 +0100
Subject: [PATCH 425/448] rename instantiation

---
 .../base/batch_multi_vector_kernels.cpp       |  13 +-
 .../base/device_matrix_data_kernels.cpp       |   6 +-
 .../cuda_hip/distributed/assembly_kernels.cpp |   2 +-
 .../cuda_hip/distributed/matrix_kernels.cpp   |   2 +-
 .../cuda_hip/distributed/vector_kernels.cpp   |   2 +-
 .../factorization/cholesky_kernels.cpp        |  12 +-
 .../factorization/factorization_kernels.cpp   |  10 +-
 common/cuda_hip/factorization/ic_kernels.cpp  |   2 +-
 common/cuda_hip/factorization/ilu_kernels.cpp |   2 +-
 common/cuda_hip/factorization/lu_kernels.cpp  |   6 +-
 .../cuda_hip/factorization/par_ic_kernels.cpp |   4 +-
 .../factorization/par_ict_kernels.cpp         |   4 +-
 .../factorization/par_ilu_kernels.cpp         |   2 +-
 .../par_ilut_approx_filter_kernels.cpp        |   2 +-
 .../factorization/par_ilut_filter_kernels.cpp |   2 +-
 .../factorization/par_ilut_select_common.cpp  |   2 +-
 .../factorization/par_ilut_select_kernels.cpp |   2 +-
 .../factorization/par_ilut_spgeam_kernels.cpp |   2 +-
 .../factorization/par_ilut_sweep_kernels.cpp  |   2 +-
 common/cuda_hip/matrix/batch_csr_kernels.cpp  |   8 +-
 .../cuda_hip/matrix/batch_dense_kernels.cpp   |  12 +-
 common/cuda_hip/matrix/batch_ell_kernels.cpp  |   8 +-
 common/cuda_hip/matrix/coo_kernels.cpp        |  10 +-
 .../matrix/csr_kernels.instantiate.cpp        | 124 ++-
 common/cuda_hip/matrix/dense_kernels.cpp      |  33 +-
 common/cuda_hip/matrix/diagonal_kernels.cpp   |   2 +-
 common/cuda_hip/matrix/ell_kernels.cpp        |   4 +-
 .../matrix/fbcsr_kernels.instantiate.cpp      |  21 +-
 common/cuda_hip/matrix/sellp_kernels.cpp      |   5 +-
 .../cuda_hip/matrix/sparsity_csr_kernels.cpp  |  10 +-
 common/cuda_hip/multigrid/pgm_kernels.cpp     |   5 +-
 .../cuda_hip/preconditioner/isai_kernels.cpp  |  10 +-
 .../jacobi_advanced_apply_kernels.cpp         |   3 +-
 ...obi_advanced_apply_kernels.instantiate.cpp |   2 +-
 .../jacobi_generate_kernels.cpp               |   2 +-
 .../jacobi_generate_kernels.instantiate.cpp   |   2 +-
 .../preconditioner/jacobi_kernels.cpp         |   8 +-
 .../jacobi_simple_apply_kernels.cpp           |   2 +-
 ...acobi_simple_apply_kernels.instantiate.cpp |   2 +-
 .../cuda_hip/preconditioner/sor_kernels.cpp   |   4 +-
 common/cuda_hip/solver/cb_gmres_kernels.cpp   |   3 +-
 common/cuda_hip/solver/idr_kernels.cpp        |  12 +-
 common/cuda_hip/solver/multigrid_kernels.cpp  |   8 +-
 .../cuda_hip/stop/residual_norm_kernels.cpp   |   5 +-
 .../base/device_matrix_data_kernels.cpp       |   4 +-
 .../components/absolute_array_kernels.cpp     |   6 +-
 .../unified/components/fill_array_kernels.cpp |   5 +-
 .../precision_conversion_kernels.cpp          |   3 +-
 .../components/reduce_array_kernels.cpp       |   3 +-
 .../unified/distributed/assembly_kernels.cpp  |   2 +-
 common/unified/matrix/coo_kernels.cpp         |   4 +-
 common/unified/matrix/csr_kernels.cpp         |  16 +-
 .../matrix/dense_kernels.instantiate.cpp      | 100 ++-
 common/unified/matrix/diagonal_kernels.cpp    |  14 +-
 common/unified/matrix/ell_kernels.cpp         |  13 +-
 common/unified/matrix/hybrid_kernels.cpp      |   4 +-
 .../matrix/scaled_permutation_kernels.cpp     |   4 +-
 common/unified/matrix/sellp_kernels.cpp       |  10 +-
 .../unified/matrix/sparsity_csr_kernels.cpp   |   6 +-
 common/unified/multigrid/pgm_kernels.cpp      |   4 +-
 .../unified/preconditioner/jacobi_kernels.cpp |  13 +-
 common/unified/solver/bicg_kernels.cpp        |   7 +-
 common/unified/solver/bicgstab_kernels.cpp    |  15 +-
 common/unified/solver/cg_kernels.cpp          |   6 +-
 common/unified/solver/cgs_kernels.cpp         |   9 +-
 .../unified/solver/common_gmres_kernels.cpp   |   7 +-
 common/unified/solver/fcg_kernels.cpp         |   7 +-
 common/unified/solver/gcr_kernels.cpp         |   7 +-
 common/unified/solver/gmres_kernels.cpp       |   8 +-
 core/base/array.cpp                           |   9 +-
 core/base/batch_instantiation.hpp             |   8 +-
 core/base/batch_multi_vector.cpp              |   2 +-
 core/base/combination.cpp                     |   2 +-
 core/base/composition.cpp                     |   2 +-
 core/base/dense_cache.cpp                     |   2 +-
 core/base/device_matrix_data.cpp              |   3 +-
 core/base/mixed_precision_types.hpp           | 206 +++--
 core/base/mtx_io.cpp                          |  13 +-
 core/base/perturbation.cpp                    |   2 +-
 core/base/segmented_array.cpp                 |   2 +-
 core/config/factorization_config.cpp          |  21 +-
 core/config/multigrid_config.cpp              |   2 +-
 core/config/parse_macro.hpp                   |  29 +-
 core/config/preconditioner_config.cpp         |   7 +-
 core/config/solver_config.cpp                 |  27 +-
 core/config/type_descriptor.cpp               |   2 +-
 core/device_hooks/common_kernels.inc.cpp      | 746 ++++++++----------
 core/distributed/assembly.cpp                 |   2 +-
 core/distributed/matrix.cpp                   |   2 +-
 core/distributed/preconditioner/schwarz.cpp   |   3 +-
 core/distributed/vector.cpp                   |   2 +-
 core/distributed/vector_cache.cpp             |   2 +-
 core/factorization/cholesky.cpp               |   2 +-
 core/factorization/elimination_forest.cpp     |   3 +-
 core/factorization/factorization.cpp          |   3 +-
 core/factorization/ic.cpp                     |   2 +-
 core/factorization/ilu.cpp                    |   2 +-
 core/factorization/lu.cpp                     |   2 +-
 core/factorization/par_ic.cpp                 |   2 +-
 core/factorization/par_ict.cpp                |   2 +-
 core/factorization/par_ilu.cpp                |   2 +-
 core/factorization/par_ilut.cpp               |   2 +-
 core/factorization/symbolic.cpp               |   8 +-
 core/log/batch_logger.cpp                     |   4 +-
 core/log/convergence.cpp                      |   2 +-
 core/log/papi.cpp                             |   2 +-
 core/log/stream.cpp                           |   2 +-
 core/matrix/batch_csr.cpp                     |   2 +-
 core/matrix/batch_dense.cpp                   |   2 +-
 core/matrix/batch_ell.cpp                     |   2 +-
 core/matrix/batch_identity.cpp                |   3 +-
 core/matrix/coo.cpp                           |   2 +-
 core/matrix/csr.cpp                           |   2 +-
 core/matrix/dense.cpp                         |   2 +-
 core/matrix/diagonal.cpp                      |   4 +-
 core/matrix/ell.cpp                           |   2 +-
 core/matrix/fbcsr.cpp                         |   3 +-
 core/matrix/hybrid.cpp                        |   3 +-
 core/matrix/identity.cpp                      |   4 +-
 core/matrix/scaled_permutation.cpp            |   2 +-
 core/matrix/sellp.cpp                         |   3 +-
 core/matrix/sparsity_csr.cpp                  |   3 +-
 core/multigrid/fixed_coarsening.cpp           |   3 +-
 core/multigrid/pgm.cpp                        |   2 +-
 core/preconditioner/batch_jacobi.cpp          |   2 +-
 core/preconditioner/gauss_seidel.cpp          |   3 +-
 core/preconditioner/ic.cpp                    |  12 +-
 core/preconditioner/ilu.cpp                   |  23 +-
 core/preconditioner/isai.cpp                  |   9 +-
 core/preconditioner/jacobi.cpp                |   2 +-
 core/preconditioner/sor.cpp                   |   2 +-
 core/reorder/mc64.cpp                         |  11 +-
 core/reorder/nested_dissection.cpp            |   2 +-
 core/reorder/rcm.cpp                          |   2 +-
 core/reorder/scaled_reordered.cpp             |   3 +-
 core/solver/batch_bicgstab.cpp                |   2 +-
 core/solver/batch_cg.cpp                      |   2 +-
 core/solver/batch_dispatch.hpp                |   2 +-
 core/solver/bicg.cpp                          |   4 +-
 core/solver/bicgstab.cpp                      |   4 +-
 core/solver/cb_gmres.cpp                      |   4 +-
 core/solver/cg.cpp                            |   4 +-
 core/solver/cgs.cpp                           |   4 +-
 core/solver/direct.cpp                        |   5 +-
 core/solver/fcg.cpp                           |   4 +-
 core/solver/gcr.cpp                           |   4 +-
 core/solver/gmres.cpp                         |   4 +-
 core/solver/idr.cpp                           |   4 +-
 core/solver/ir.cpp                            |   4 +-
 core/solver/lower_trs.cpp                     |   5 +-
 core/solver/upper_trs.cpp                     |   5 +-
 core/stop/residual_norm.cpp                   |   5 +-
 cuda/matrix/fft_kernels.cu                    |   6 +-
 cuda/preconditioner/batch_jacobi_kernels.cu   |   4 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   2 +-
 cuda/solver/batch_cg_kernels.cu               |   2 +-
 cuda/solver/lower_trs_kernels.cu              |   4 +-
 cuda/solver/upper_trs_kernels.cu              |   4 +-
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  |  13 +-
 dpcpp/base/device_matrix_data_kernels.dp.cpp  |   6 +-
 dpcpp/components/atomic.dp.hpp                |  16 +-
 dpcpp/distributed/assembly_kernels.dp.cpp     |   2 +-
 dpcpp/distributed/matrix_kernels.dp.cpp       |   2 +-
 dpcpp/distributed/vector_kernels.dp.cpp       |   2 +-
 dpcpp/factorization/cholesky_kernels.dp.cpp   |  12 +-
 .../factorization_kernels.dp.cpp              |  10 +-
 dpcpp/factorization/ic_kernels.dp.cpp         |   2 +-
 dpcpp/factorization/ilu_kernels.dp.cpp        |   2 +-
 dpcpp/factorization/lu_kernels.dp.cpp         |   6 +-
 dpcpp/factorization/par_ic_kernels.dp.cpp     |   4 +-
 dpcpp/factorization/par_ict_kernels.dp.cpp    |   4 +-
 dpcpp/factorization/par_ilu_kernels.dp.cpp    |   2 +-
 .../par_ilut_approx_filter_kernel.dp.cpp      |   2 +-
 .../par_ilut_filter_kernel.dp.cpp             |   2 +-
 dpcpp/factorization/par_ilut_kernels.dp.cpp   |  10 +-
 .../par_ilut_select_common.dp.cpp             |   2 +-
 .../par_ilut_select_kernel.dp.cpp             |   2 +-
 .../par_ilut_spgeam_kernel.dp.cpp             |   2 +-
 .../par_ilut_sweep_kernel.dp.cpp              |   2 +-
 dpcpp/matrix/batch_csr_kernels.dp.cpp         |   8 +-
 dpcpp/matrix/batch_dense_kernels.dp.cpp       |  12 +-
 dpcpp/matrix/batch_ell_kernels.dp.cpp         |   8 +-
 dpcpp/matrix/coo_kernels.dp.cpp               |  10 +-
 dpcpp/matrix/csr_kernels.dp.cpp               |  56 +-
 dpcpp/matrix/dense_kernels.dp.cpp             |  33 +-
 dpcpp/matrix/diagonal_kernels.dp.cpp          |   2 +-
 dpcpp/matrix/ell_kernels.dp.cpp               |   4 +-
 dpcpp/matrix/fbcsr_kernels.dp.cpp             |  21 +-
 dpcpp/matrix/fft_kernels.dp.cpp               |   6 +-
 dpcpp/matrix/sellp_kernels.dp.cpp             |   5 +-
 dpcpp/matrix/sparsity_csr_kernels.dp.cpp      |  10 +-
 dpcpp/multigrid/pgm_kernels.dp.cpp            |   5 +-
 .../batch_jacobi_kernels.dp.cpp               |   4 +-
 dpcpp/preconditioner/isai_kernels.dp.cpp      |  10 +-
 ...cobi_advanced_apply_instantiate.inc.dp.cpp |   2 +-
 .../jacobi_advanced_apply_kernel.dp.cpp       |   3 +-
 .../jacobi_generate_instantiate.inc.dp.cpp    |   2 +-
 .../jacobi_generate_kernel.dp.cpp             |   2 +-
 dpcpp/preconditioner/jacobi_kernels.dp.cpp    |   8 +-
 ...jacobi_simple_apply_instantiate.inc.dp.cpp |   2 +-
 .../jacobi_simple_apply_kernel.dp.cpp         |   2 +-
 dpcpp/preconditioner/sor_kernels.dp.cpp       |   4 +-
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |   2 +-
 dpcpp/solver/batch_cg_kernels.dp.cpp          |   2 +-
 dpcpp/solver/cb_gmres_kernels.dp.cpp          |   3 +-
 dpcpp/solver/idr_kernels.dp.cpp               |  12 +-
 dpcpp/solver/lower_trs_kernels.dp.cpp         |   4 +-
 dpcpp/solver/multigrid_kernels.dp.cpp         |   8 +-
 dpcpp/solver/upper_trs_kernels.dp.cpp         |   4 +-
 dpcpp/stop/residual_norm_kernels.dp.cpp       |   5 +-
 hip/matrix/fft_kernels.hip.cpp                |   6 +-
 hip/matrix/fft_kernels_stub.hip.cpp           |   6 +-
 .../batch_jacobi_kernels.hip.cpp              |   4 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   2 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   2 +-
 hip/solver/lower_trs_kernels.hip.cpp          |   4 +-
 hip/solver/upper_trs_kernels.hip.cpp          |   4 +-
 include/ginkgo/core/base/types.hpp            | 403 +++++-----
 omp/base/batch_multi_vector_kernels.cpp       |  13 +-
 omp/base/device_matrix_data_kernels.cpp       |   6 +-
 omp/distributed/assembly_kernels.cpp          |   2 +-
 omp/distributed/matrix_kernels.cpp            |   2 +-
 omp/distributed/vector_kernels.cpp            |   2 +-
 omp/factorization/cholesky_kernels.cpp        |  12 +-
 omp/factorization/factorization_kernels.cpp   |  10 +-
 omp/factorization/ic_kernels.cpp              |   2 +-
 omp/factorization/ilu_kernels.cpp             |   2 +-
 omp/factorization/lu_kernels.cpp              |   6 +-
 omp/factorization/par_ic_kernels.cpp          |   4 +-
 omp/factorization/par_ict_kernels.cpp         |   4 +-
 omp/factorization/par_ilu_kernels.cpp         |   2 +-
 omp/factorization/par_ilut_kernels.cpp        |  10 +-
 omp/matrix/batch_csr_kernels.cpp              |   8 +-
 omp/matrix/batch_dense_kernels.cpp            |  12 +-
 omp/matrix/batch_ell_kernels.cpp              |   8 +-
 omp/matrix/coo_kernels.cpp                    |  10 +-
 omp/matrix/csr_kernels.cpp                    |  56 +-
 omp/matrix/dense_kernels.cpp                  |  33 +-
 omp/matrix/diagonal_kernels.cpp               |   2 +-
 omp/matrix/ell_kernels.cpp                    |   4 +-
 omp/matrix/fbcsr_kernels.cpp                  |  21 +-
 omp/matrix/fft_kernels.cpp                    |   6 +-
 omp/matrix/sellp_kernels.cpp                  |   5 +-
 omp/matrix/sparsity_csr_kernels.cpp           |  10 +-
 omp/multigrid/pgm_kernels.cpp                 |   5 +-
 omp/preconditioner/batch_jacobi_kernels.cpp   |   4 +-
 omp/preconditioner/isai_kernels.cpp           |  10 +-
 omp/preconditioner/jacobi_kernels.cpp         |  15 +-
 omp/preconditioner/sor_kernels.cpp            |   4 +-
 omp/solver/batch_bicgstab_kernels.cpp         |   2 +-
 omp/solver/batch_cg_kernels.cpp               |   2 +-
 omp/solver/cb_gmres_kernels.cpp               |   3 +-
 omp/solver/idr_kernels.cpp                    |  12 +-
 omp/solver/lower_trs_kernels.cpp              |   4 +-
 omp/solver/multigrid_kernels.cpp              |   8 +-
 omp/solver/upper_trs_kernels.cpp              |   4 +-
 omp/stop/residual_norm_kernels.cpp            |   5 +-
 reference/base/batch_multi_vector_kernels.cpp |  13 +-
 reference/base/device_matrix_data_kernels.cpp |  10 +-
 .../components/absolute_array_kernels.cpp     |   6 +-
 reference/components/fill_array_kernels.cpp   |   5 +-
 .../precision_conversion_kernels.cpp          |   3 +-
 reference/components/reduce_array_kernels.cpp |   3 +-
 reference/distributed/assembly_kernels.cpp    |   4 +-
 reference/distributed/matrix_kernels.cpp      |   2 +-
 reference/distributed/vector_kernels.cpp      |   2 +-
 reference/factorization/cholesky_kernels.cpp  |  12 +-
 .../factorization/factorization_kernels.cpp   |  10 +-
 reference/factorization/ic_kernels.cpp        |   2 +-
 reference/factorization/ilu_kernels.cpp       |   2 +-
 reference/factorization/lu_kernels.cpp        |   6 +-
 reference/factorization/par_ic_kernels.cpp    |   4 +-
 reference/factorization/par_ict_kernels.cpp   |   4 +-
 reference/factorization/par_ilu_kernels.cpp   |   2 +-
 reference/factorization/par_ilut_kernels.cpp  |  10 +-
 reference/matrix/batch_csr_kernels.cpp        |   8 +-
 reference/matrix/batch_dense_kernels.cpp      |  12 +-
 reference/matrix/batch_ell_kernels.cpp        |   8 +-
 reference/matrix/coo_kernels.cpp              |  14 +-
 reference/matrix/csr_kernels.cpp              |  72 +-
 reference/matrix/dense_kernels.cpp            | 132 ++--
 reference/matrix/diagonal_kernels.cpp         |  16 +-
 reference/matrix/ell_kernels.cpp              |  17 +-
 reference/matrix/fbcsr_kernels.cpp            |  21 +-
 reference/matrix/fft_kernels.cpp              |   6 +-
 reference/matrix/hybrid_kernels.cpp           |   4 +-
 .../matrix/scaled_permutation_kernels.cpp     |   4 +-
 reference/matrix/sellp_kernels.cpp            |  15 +-
 reference/matrix/sparsity_csr_kernels.cpp     |  16 +-
 reference/multigrid/pgm_kernels.cpp           |   9 +-
 .../preconditioner/batch_jacobi_kernels.cpp   |   4 +-
 reference/preconditioner/isai_kernels.cpp     |  10 +-
 reference/preconditioner/jacobi_kernels.cpp   |  28 +-
 reference/preconditioner/sor_kernels.cpp      |   4 +-
 reference/solver/batch_bicgstab_kernels.cpp   |   2 +-
 reference/solver/batch_cg_kernels.cpp         |   2 +-
 reference/solver/bicg_kernels.cpp             |   7 +-
 reference/solver/bicgstab_kernels.cpp         |  15 +-
 reference/solver/cb_gmres_kernels.cpp         |   3 +-
 reference/solver/cg_kernels.cpp               |   6 +-
 reference/solver/cgs_kernels.cpp              |   9 +-
 reference/solver/common_gmres_kernels.cpp     |   7 +-
 reference/solver/fcg_kernels.cpp              |   7 +-
 reference/solver/gcr_kernels.cpp              |   7 +-
 reference/solver/gmres_kernels.cpp            |   8 +-
 reference/solver/idr_kernels.cpp              |  12 +-
 reference/solver/lower_trs_kernels.cpp        |   4 +-
 reference/solver/multigrid_kernels.cpp        |   8 +-
 reference/solver/upper_trs_kernels.cpp        |   4 +-
 reference/stop/residual_norm_kernels.cpp      |   5 +-
 310 files changed, 1707 insertions(+), 2024 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 8ff88ddc73b..8154dc440df 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -55,7 +55,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -81,7 +81,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -101,7 +101,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -121,7 +121,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -139,7 +139,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
         x_ub, res_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -156,8 +156,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
             x_ub, result_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp
index ebfed84dba2..6d30e330415 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.cpp
+++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp
@@ -68,7 +68,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -112,7 +112,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -127,7 +127,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                         it + data.get_num_stored_elements(), vals);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/common/cuda_hip/distributed/assembly_kernels.cpp b/common/cuda_hip/distributed/assembly_kernels.cpp
index 81478538477..fb1a8dbc75d 100644
--- a/common/cuda_hip/distributed/assembly_kernels.cpp
+++ b/common/cuda_hip/distributed/assembly_kernels.cpp
@@ -90,7 +90,7 @@ void count_non_owning_entries(
         num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 88988febbb0..bdf189d9785 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -194,7 +194,7 @@ void separate_local_nonlocal(
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/common/cuda_hip/distributed/vector_kernels.cpp b/common/cuda_hip/distributed/vector_kernels.cpp
index 1bacc93489a..668a721d249 100644
--- a/common/cuda_hip/distributed/vector_kernels.cpp
+++ b/common/cuda_hip/distributed/vector_kernels.cpp
@@ -83,7 +83,7 @@ void build_local(
         range_id.get_data(), local_mtx->get_values(), is_local_row);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
 
 
diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
index ef24bb47fe0..7ff1382d8c6 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.cpp
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -262,7 +262,7 @@ void symbolic_factorize(
             postorder, postorder_parent, out_row_ptrs, out_cols);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -321,7 +321,7 @@ void forest_from_factor(
     build_children_from_parents(exec, forest);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -355,8 +355,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                                transpose_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -391,8 +390,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -448,7 +446,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index 8e8893df535..f26ef668d34 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -355,7 +355,7 @@ void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idx_array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -385,7 +385,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -418,7 +418,7 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -446,7 +446,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -483,7 +483,7 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/ic_kernels.cpp b/common/cuda_hip/factorization/ic_kernels.cpp
index c2ed0b17cf0..e84032bac35 100644
--- a/common/cuda_hip/factorization/ic_kernels.cpp
+++ b/common/cuda_hip/factorization/ic_kernels.cpp
@@ -54,7 +54,7 @@ void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
     sparselib::destroy(desc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp
index eb7677e117f..b81f8fb9092 100644
--- a/common/cuda_hip/factorization/ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -54,7 +54,7 @@ void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
     sparselib::destroy(desc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp
index 4d98b611e28..b0d54e44217 100644
--- a/common/cuda_hip/factorization/lu_kernels.cpp
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -253,8 +253,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -287,8 +286,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp
index 87e2fefd823..f3656ac8a29 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ic_kernels.cpp
@@ -110,7 +110,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -144,7 +144,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp
index 0acf0633a2c..a74e45fbe62 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ict_kernels.cpp
@@ -435,7 +435,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
@@ -457,7 +457,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp
index a22bb85275a..5238fcf19c7 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp
@@ -118,7 +118,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
index 475d87b8bda..12d8da9e4f5 100644
--- a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp
@@ -168,7 +168,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         &threshold, m_out, m_out_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
index d6ad2f477eb..25432fb44d2 100644
--- a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
@@ -123,7 +123,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         m_out_coo, lower);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_select_common.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp
index 3bb67d96e4f..6751615ff69 100644
--- a/common/cuda_hip/factorization/par_ilut_select_common.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp
@@ -77,7 +77,7 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
                             unsigned char* oracles, IndexType* partial_counts, \
                             IndexType* total_counts)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
 
 
 template <typename IndexType>
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
index a15adf580e8..81a97197dc5 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
@@ -156,7 +156,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
index 8f7a8af0443..a29cf6f2cb3 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
@@ -389,7 +389,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         u_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
index c0f962a89c8..7c8b1d85781 100644
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
@@ -207,7 +207,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
         u_csc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.cpp
index 0db100363b8..d48cdbaf32a 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -46,7 +46,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -72,7 +72,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -91,7 +91,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.cpp
index e0f1fc5e8dc..ee4d87abaa3 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -45,7 +45,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         mat_ub, b_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -90,8 +90,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -109,8 +108,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, in_out_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -128,7 +126,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.cpp
index dddb53e34ff..38d34707d45 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -46,7 +46,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -72,7 +72,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -91,7 +91,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
             mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         alpha_ub, beta_ub, mat_ub);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp
index 88d6dced504..6e36c62b74e 100644
--- a/common/cuda_hip/matrix/coo_kernels.cpp
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -238,8 +238,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -254,7 +253,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -304,8 +303,7 @@ void spmv2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -358,7 +356,7 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
index 2e28de95f5d..151a7a43ded 100644
--- a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp
@@ -17,136 +17,132 @@ namespace csr {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int32);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
-    GKO_DECLARE_CSR_SPMV_KERNEL, int64);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 // split
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
 
 
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEMM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEAM_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 // end
 
diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp
index d0d4985dd82..d8391ace023 100644
--- a/common/cuda_hip/matrix/dense_kernels.cpp
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -461,7 +461,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -491,7 +491,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -521,7 +521,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -544,7 +544,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -565,7 +565,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -598,7 +598,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -629,7 +629,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -657,7 +657,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -681,7 +681,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -706,7 +706,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -729,7 +729,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -760,8 +760,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -788,7 +787,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -813,8 +812,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -839,8 +837,7 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace dense
diff --git a/common/cuda_hip/matrix/diagonal_kernels.cpp b/common/cuda_hip/matrix/diagonal_kernels.cpp
index baee454c36d..e12d3ed4f9f 100644
--- a/common/cuda_hip/matrix/diagonal_kernels.cpp
+++ b/common/cuda_hip/matrix/diagonal_kernels.cpp
@@ -81,7 +81,7 @@ void apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp
index 23079092162..b1b466dba9d 100644
--- a/common/cuda_hip/matrix/ell_kernels.cpp
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -371,7 +371,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
         b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -405,7 +405,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
         b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
index a7a0263cd35..a3beaac4a85 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp
@@ -17,27 +17,26 @@ namespace fbcsr {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_SPMV_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 // end
 
diff --git a/common/cuda_hip/matrix/sellp_kernels.cpp b/common/cuda_hip/matrix/sellp_kernels.cpp
index 4d37a0452a6..3e8fba395b3 100644
--- a/common/cuda_hip/matrix/sellp_kernels.cpp
+++ b/common/cuda_hip/matrix/sellp_kernels.cpp
@@ -105,8 +105,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -132,7 +131,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
index ddda357fa31..77f11280e5a 100644
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
@@ -138,7 +138,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                matrix::SparsityCsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -246,7 +246,7 @@ void spmv(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -264,7 +264,7 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -297,7 +297,7 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -320,7 +320,7 @@ void is_sorted_by_column_index(
     cpu_array = gpu_array;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp
index 0077b801e46..d3c44cf540e 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.cpp
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -54,8 +54,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -79,7 +78,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
                           vals_it, coarse_key_it, coarse_vals_it);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/common/cuda_hip/preconditioner/isai_kernels.cpp b/common/cuda_hip/preconditioner/isai_kernels.cpp
index 77fdb3c0e23..d6fdd6389fc 100644
--- a/common/cuda_hip/preconditioner/isai_kernels.cpp
+++ b/common/cuda_hip/preconditioner/isai_kernels.cpp
@@ -487,7 +487,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -516,7 +516,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -548,7 +548,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -568,7 +568,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -593,7 +593,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
index fcd86bdba29..27b4f57eb6c 100644
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp
@@ -66,8 +66,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 }  // namespace jacobi
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
index 62d9c1ece43..131c530d2ee 100644
--- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp
@@ -160,7 +160,7 @@ void advanced_apply(
         const preconditioner::block_interleaved_storage_scheme<IndexType>&, \
         const ValueType*, const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
index 7c37e578045..207550ff6b1 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp
@@ -68,7 +68,7 @@ void generate(std::shared_ptr<const DefaultExecutor> exec,
         block_pointers.get_const_data(), num_blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
index 5efd0c40632..fdb0ad11e9e 100644
--- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp
@@ -268,7 +268,7 @@ void generate(syn::value_list<int, max_block_size>,
         remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
         size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_GENERATE_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
index adcc08e37e9..6f2d4ae3974 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -297,7 +297,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
         exec, max_block_size, num_natural_blocks, block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -364,7 +364,7 @@ void transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -388,7 +388,7 @@ void conj_transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -401,7 +401,7 @@ void convert_to_dense(
         storage_scheme,
     ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
index fb73c22ccef..e9b7b10fd88 100644
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp
@@ -57,7 +57,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
index 3a35fbe3f04..faf869718a6 100644
--- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp
@@ -151,7 +151,7 @@ void apply(syn::value_list<int, max_block_size>,
         const preconditioner::block_interleaved_storage_scheme<IndexType>&,   \
         const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION);
 
 
diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
index f75a52b3af2..033a65ea862 100644
--- a/common/cuda_hip/preconditioner/sor_kernels.cpp
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -46,7 +46,7 @@ void initialize_weighted_l(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -94,7 +94,7 @@ void initialize_weighted_l_u(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp
index 02d45a8d31e..bdf6de03f38 100644
--- a/common/cuda_hip/solver/cb_gmres_kernels.cpp
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -633,7 +633,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
             as_device_type(stop_status->get_data()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
+    GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType, typename Accessor3d>
diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp
index 649d8a1769c..9a99d7fa581 100644
--- a/common/cuda_hip/solver/idr_kernels.cpp
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -575,8 +575,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     orthonormalize_subspace_vectors(exec, subspace_vectors);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -603,7 +602,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -630,7 +629,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -647,7 +646,7 @@ void step_3(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
     update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -665,8 +664,7 @@ void compute_omega(
         as_device_type(omega->get_values()), stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp
index b5d8a0f77b9..f172c391864 100644
--- a/common/cuda_hip/solver/multigrid_kernels.cpp
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -141,8 +141,7 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -175,8 +174,7 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -200,7 +198,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     is_stop = get_element(dis_stop, 0);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/common/cuda_hip/stop/residual_norm_kernels.cpp b/common/cuda_hip/stop/residual_norm_kernels.cpp
index 23ca8e5d5f1..9d6db5211e8 100644
--- a/common/cuda_hip/stop/residual_norm_kernels.cpp
+++ b/common/cuda_hip/stop/residual_norm_kernels.cpp
@@ -91,7 +91,7 @@ void residual_norm(std::shared_ptr<const DefaultExecutor> exec,
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -171,8 +171,7 @@ void implicit_residual_norm(
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/common/unified/base/device_matrix_data_kernels.cpp b/common/unified/base/device_matrix_data_kernels.cpp
index b72c6bf3476..d801b47fcd5 100644
--- a/common/unified/base/device_matrix_data_kernels.cpp
+++ b/common/unified/base/device_matrix_data_kernels.cpp
@@ -30,7 +30,7 @@ void soa_to_aos(std::shared_ptr<const DefaultExecutor> exec,
         in.get_const_col_idxs(), in.get_const_values(), out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 
@@ -50,7 +50,7 @@ void aos_to_soa(std::shared_ptr<const DefaultExecutor> exec,
         out.get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
 
 
diff --git a/common/unified/components/absolute_array_kernels.cpp b/common/unified/components/absolute_array_kernels.cpp
index 423fa234c39..c9ab364353c 100644
--- a/common/unified/components/absolute_array_kernels.cpp
+++ b/common/unified/components/absolute_array_kernels.cpp
@@ -23,8 +23,7 @@ void inplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
         data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 template <typename ValueType>
@@ -38,8 +37,7 @@ void outplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
         n, in, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp
index 3e87d782974..2fd51dd939b 100644
--- a/common/unified/components/fill_array_kernels.cpp
+++ b/common/unified/components/fill_array_kernels.cpp
@@ -23,7 +23,7 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,
         array, val);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
 
@@ -44,8 +44,7 @@ void fill_seq_array(std::shared_ptr<const DefaultExecutor> exec,
         n, array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
-    GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp
index 46d14a7ef17..270ee5b3590 100644
--- a/common/unified/components/precision_conversion_kernels.cpp
+++ b/common/unified/components/precision_conversion_kernels.cpp
@@ -25,8 +25,7 @@ void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
         size, in, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
-    GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp
index 1e7d19264cd..bc8da6fa311 100644
--- a/common/unified/components/reduce_array_kernels.cpp
+++ b/common/unified/components/reduce_array_kernels.cpp
@@ -34,8 +34,7 @@ void reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
         arr, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
-    GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/common/unified/distributed/assembly_kernels.cpp b/common/unified/distributed/assembly_kernels.cpp
index a3ac5207f17..a33fca28796 100644
--- a/common/unified/distributed/assembly_kernels.cpp
+++ b/common/unified/distributed/assembly_kernels.cpp
@@ -48,7 +48,7 @@ void fill_send_buffers(
         send_values.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp
index 233dffc6f37..ce13d7500ab 100644
--- a/common/unified/matrix/coo_kernels.cpp
+++ b/common/unified/matrix/coo_kernels.cpp
@@ -38,7 +38,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
@@ -58,7 +58,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_row_idxs(), orig->get_const_col_idxs(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
 
 
diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp
index d5741bb3e1c..5236c1c9da9 100644
--- a/common/unified/matrix/csr_kernels.cpp
+++ b/common/unified/matrix/csr_kernels.cpp
@@ -52,7 +52,7 @@ void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec,
         col_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
 
 
@@ -86,7 +86,7 @@ void inv_col_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
         col_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -102,8 +102,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         x->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -118,8 +117,7 @@ void inv_scale(std::shared_ptr<const DefaultExecutor> exec,
         x->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -154,7 +152,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
         output->get_col_idxs(), output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -185,7 +183,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
         output->get_col_idxs(), output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
@@ -229,7 +227,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
         result->get_coo_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp
index dcf48573fc6..aca8ad5bec4 100644
--- a/common/unified/matrix/dense_kernels.instantiate.cpp
+++ b/common/unified/matrix/dense_kernels.instantiate.cpp
@@ -12,99 +12,87 @@ namespace dense {
 
 
 // begin
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
     GKO_DECLARE_DENSE_COPY_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SCALE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
 // split
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
 // end
 
 
diff --git a/common/unified/matrix/diagonal_kernels.cpp b/common/unified/matrix/diagonal_kernels.cpp
index 75960e800d7..dae037a5134 100644
--- a/common/unified/matrix/diagonal_kernels.cpp
+++ b/common/unified/matrix/diagonal_kernels.cpp
@@ -36,8 +36,7 @@ void apply_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size(), a->get_const_values(), b, c, inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -54,7 +53,7 @@ void right_apply_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size(), a->get_const_values(), b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
 
 
@@ -75,7 +74,7 @@ void right_apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         c->get_const_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
 
 
@@ -96,7 +95,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -121,7 +120,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -138,8 +137,7 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_size()[0], orig->get_const_values(), trans->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace diagonal
diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp
index 24fc90a888e..6d23e08b68b 100644
--- a/common/unified/matrix/ell_kernels.cpp
+++ b/common/unified/matrix/ell_kernels.cpp
@@ -67,7 +67,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -94,7 +94,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_values(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
 
 
@@ -121,8 +121,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -151,7 +150,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -173,7 +172,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
         static_cast<int64>(source->get_stride()), source->get_const_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -199,7 +198,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_values(), diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp
index 79a596febea..8a21a2415f7 100644
--- a/common/unified/matrix/hybrid_kernels.cpp
+++ b/common/unified/matrix/hybrid_kernels.cpp
@@ -89,7 +89,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         result->get_coo_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -150,7 +150,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         coo_row_ptrs, result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp
index 4cdc7974e50..3eaab65e8e6 100644
--- a/common/unified/matrix/scaled_permutation_kernels.cpp
+++ b/common/unified/matrix/scaled_permutation_kernels.cpp
@@ -32,7 +32,7 @@ void invert(std::shared_ptr<const DefaultExecutor> exec,
         size, input_scale, input_permutation, output_scale, output_permutation);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
 
 
@@ -58,7 +58,7 @@ void compose(std::shared_ptr<const DefaultExecutor> exec,
         output_permutation, output_scale);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp
index 23bfe160a69..93b71ff43f2 100644
--- a/common/unified/matrix/sellp_kernels.cpp
+++ b/common/unified/matrix/sellp_kernels.cpp
@@ -87,7 +87,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         output->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -119,7 +119,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_values(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
 
 
@@ -149,7 +149,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
         source->get_const_slice_sets(), source->get_const_col_idxs(), result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -183,7 +183,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
         result->get_col_idxs(), result->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
 
 
@@ -215,7 +215,7 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         orig->get_const_values(), diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp b/common/unified/matrix/sparsity_csr_kernels.cpp
index b3f26358ad3..c5a9c79a89b 100644
--- a/common/unified/matrix/sparsity_csr_kernels.cpp
+++ b/common/unified/matrix/sparsity_csr_kernels.cpp
@@ -41,7 +41,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
         input->get_const_col_idxs(), input->get_const_value(), output);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void diagonal_element_prefix_sum(
     components::prefix_sum_nonnegative(exec, prefix_sum, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
 
 
@@ -106,7 +106,7 @@ void remove_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
         matrix->get_col_idxs());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
 
 
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 2b0c04592a7..409dbc8b9b6 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -217,7 +217,7 @@ void find_strongest_neighbor(
         strongest_neighbor.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
 
 template <typename ValueType, typename IndexType>
@@ -305,7 +305,7 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
 
 
diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp
index 00f3d62f312..dce00fd1366 100644
--- a/common/unified/preconditioner/jacobi_kernels.cpp
+++ b/common/unified/preconditioner/jacobi_kernels.cpp
@@ -32,8 +32,7 @@ void scalar_conj(std::shared_ptr<const DefaultExecutor> exec,
         diag.get_size(), diag, conj_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
 
 
 template <typename ValueType>
@@ -50,8 +49,7 @@ void invert_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         diag.get_size(), diag, inv_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType>
@@ -85,8 +83,7 @@ void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -103,7 +100,7 @@ void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size(), diag, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
 
 
@@ -123,7 +120,7 @@ void scalar_convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,
         result->get_size(), blocks, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp
index 4c6fe8cdc98..7d15718c05d 100644
--- a/common/unified/solver/bicg_kernels.cpp
+++ b/common/unified/solver/bicg_kernels.cpp
@@ -64,8 +64,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -91,7 +90,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -120,7 +119,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         default_stride(q2), row_vector(beta), row_vector(rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp
index ad5b1ed3302..c403da3bf96 100644
--- a/common/unified/solver/bicgstab_kernels.cpp
+++ b/common/unified/solver/bicgstab_kernels.cpp
@@ -69,8 +69,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -99,8 +98,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(alpha), row_vector(omega), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -129,8 +127,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -162,8 +159,7 @@ void step_3(
         row_vector(omega), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -192,8 +188,7 @@ void finalize(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size()[1], *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp
index e77f01de748..822dddf1c3b 100644
--- a/common/unified/solver/cg_kernels.cpp
+++ b/common/unified/solver/cg_kernels.cpp
@@ -57,7 +57,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -80,7 +80,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(rho), row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -106,7 +106,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         default_stride(q), row_vector(beta), row_vector(rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp
index 6ceaa883c9f..0618b8f8208 100644
--- a/common/unified/solver/cgs_kernels.cpp
+++ b/common/unified/solver/cgs_kernels.cpp
@@ -72,8 +72,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -104,7 +103,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -135,7 +134,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(alpha), row_vector(rho), row_vector(gamma), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
 
 template <typename ValueType>
 void step_3(std::shared_ptr<const DefaultExecutor> exec,
@@ -158,7 +157,7 @@ void step_3(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp
index 32fe526d7f6..679aebcfaa2 100644
--- a/common/unified/solver/common_gmres_kernels.cpp
+++ b/common/unified/solver/common_gmres_kernels.cpp
@@ -52,8 +52,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
         b->get_size()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -126,7 +125,7 @@ void hessenberg_qr(std::shared_ptr<const DefaultExecutor> exec,
         stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
 
 
@@ -159,7 +158,7 @@ void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
         residual_norm_collection->get_size()[1]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp
index 01dd3cb3d9a..7853d97c358 100644
--- a/common/unified/solver/fcg_kernels.cpp
+++ b/common/unified/solver/fcg_kernels.cpp
@@ -61,8 +61,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -85,7 +84,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         row_vector(rho_t), row_vector(prev_rho), *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -114,7 +113,7 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
         *stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp
index d5c2e27097d..7adef77dfb1 100644
--- a/common/unified/solver/gcr_kernels.cpp
+++ b/common/unified/solver/gcr_kernels.cpp
@@ -44,8 +44,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -79,7 +78,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -105,7 +104,7 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
         stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 }  // namespace gcr
 }  // namespace GKO_DEVICE_NAMESPACE
diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp
index 38bb935df9f..f24ae445edb 100644
--- a/common/unified/solver/gmres_kernels.cpp
+++ b/common/unified/solver/gmres_kernels.cpp
@@ -56,7 +56,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -92,8 +92,7 @@ void multi_axpy(std::shared_ptr<const DefaultExecutor> exec,
         before_preconditioner->get_size()[1], stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
 
 template <typename ValueType>
@@ -120,8 +119,7 @@ void multi_dot(std::shared_ptr<const DefaultExecutor> exec,
         next_krylov->get_size()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 }  // namespace gmres
 }  // namespace GKO_DEVICE_NAMESPACE
diff --git a/core/base/array.cpp b/core/base/array.cpp
index 7a98223a7b2..a41f7c07e55 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -51,8 +51,7 @@ void convert_data(std::shared_ptr<const Executor> exec, size_type size,
     void convert_data<From, To>(std::shared_ptr<const Executor>, size_type, \
                                 const From*, To*)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
-    GKO_DECLARE_ARRAY_CONVERSION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_ARRAY_CONVERSION);
 
 
 }  // namespace detail
@@ -89,19 +88,19 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_FILL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
 
 
 #define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \
     void reduce_add(const array<_type>& arr, array<_type>& value)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD);
 
 
 #define GKO_DECLARE_ARRAY_REDUCE_ADD2(_type) \
     _type reduce_add(const array<_type>& arr, const _type val)
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD2);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD2);
 
 
 }  // namespace gko
diff --git a/core/base/batch_instantiation.hpp b/core/base/batch_instantiation.hpp
index 652d4cd7ff7..5fb08180a31 100644
--- a/core/base/batch_instantiation.hpp
+++ b/core/base/batch_instantiation.hpp
@@ -42,10 +42,10 @@ namespace batch {
  *
  * @note the second and third arguments only accept the base type.s
  */
-#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(...) \
-    GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX,                         \
-             GKO_BATCH_INSTANTIATE_PRECONDITIONER,                 \
-             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(...) \
+    GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX,                              \
+             GKO_BATCH_INSTANTIATE_PRECONDITIONER,                      \
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
 
 
 }  // namespace batch
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index 4cd6b81d5bb..4fb9eec6845 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -316,7 +316,7 @@ void MultiVector<ValueType>::move_to(
 
 
 #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR);
 
 
 }  // namespace batch
diff --git a/core/base/combination.cpp b/core/base/combination.cpp
index 53af6742f6e..3b30b77d38c 100644
--- a/core/base/combination.cpp
+++ b/core/base/combination.cpp
@@ -168,7 +168,7 @@ void Combination<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_COMBINATION(_type) class Combination<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMBINATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMBINATION);
 
 
 }  // namespace gko
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index f6a7df21e45..82c8152300b 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -222,7 +222,7 @@ void Composition<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_COMPOSITION(_type) class Composition<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMPOSITION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMPOSITION);
 
 
 }  // namespace gko
diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp
index 096ad1f761a..38a0decfa46 100644
--- a/core/base/dense_cache.cpp
+++ b/core/base/dense_cache.cpp
@@ -33,7 +33,7 @@ void DenseCache<ValueType>::init_from(
 
 
 #define GKO_DECLARE_DENSE_CACHE(_type) struct DenseCache<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CACHE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE);
 
 
 }  // namespace detail
diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp
index cb9d332f5ab..4c71fffe275 100644
--- a/core/base/device_matrix_data.cpp
+++ b/core/base/device_matrix_data.cpp
@@ -157,8 +157,7 @@ device_matrix_data<ValueType, IndexType>::empty_out()
 
 #define GKO_DECLARE_DEVICE_MATRIX_DATA(ValueType, IndexType) \
     class device_matrix_data<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DEVICE_MATRIX_DATA);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA);
 
 
 }  // namespace gko
diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
index 5ef5de94e34..3e78453fed1 100644
--- a/core/base/mixed_precision_types.hpp
+++ b/core/base/mixed_precision_types.hpp
@@ -14,49 +14,49 @@
 #ifdef GINKGO_MIXED_PRECISION
 
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
-    template _macro(float, float, float, __VA_ARGS__);                \
-    template _macro(float, float, double, __VA_ARGS__);               \
-    template _macro(float, double, float, __VA_ARGS__);               \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_BASE(_macro, ...) \
+    template _macro(float, float, float, __VA_ARGS__);                     \
+    template _macro(float, float, double, __VA_ARGS__);                    \
+    template _macro(float, double, float, __VA_ARGS__);                    \
     template _macro(float, double, double, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \
-    GKO_ADAPT_HF(template _macro(float, half, half, __VA_ARGS__));         \
-    GKO_ADAPT_HF(template _macro(float, half, float, __VA_ARGS__));        \
-    GKO_ADAPT_HF(template _macro(float, half, double, __VA_ARGS__));       \
-    GKO_ADAPT_HF(template _macro(float, float, half, __VA_ARGS__));        \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...)   \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_BASE(_macro,       \
+                                                          __VA_ARGS__); \
+    GKO_ADAPT_HF(template _macro(float, half, half, __VA_ARGS__));      \
+    GKO_ADAPT_HF(template _macro(float, half, float, __VA_ARGS__));     \
+    GKO_ADAPT_HF(template _macro(float, half, double, __VA_ARGS__));    \
+    GKO_ADAPT_HF(template _macro(float, float, half, __VA_ARGS__));     \
     GKO_ADAPT_HF(template _macro(float, double, half, __VA_ARGS__))
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
-    template _macro(double, float, float, __VA_ARGS__);               \
-    template _macro(double, float, double, __VA_ARGS__);              \
-    template _macro(double, double, float, __VA_ARGS__);              \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_BASE(_macro, ...) \
+    template _macro(double, float, float, __VA_ARGS__);                    \
+    template _macro(double, float, double, __VA_ARGS__);                   \
+    template _macro(double, double, float, __VA_ARGS__);                   \
     template _macro(double, double, double, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \
-    GKO_ADAPT_HF(template _macro(double, half, half, __VA_ARGS__));        \
-    GKO_ADAPT_HF(template _macro(double, half, float, __VA_ARGS__));       \
-    GKO_ADAPT_HF(template _macro(double, half, double, __VA_ARGS__));      \
-    GKO_ADAPT_HF(template _macro(double, float, half, __VA_ARGS__));       \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...)   \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_BASE(_macro,       \
+                                                          __VA_ARGS__); \
+    GKO_ADAPT_HF(template _macro(double, half, half, __VA_ARGS__));     \
+    GKO_ADAPT_HF(template _macro(double, half, float, __VA_ARGS__));    \
+    GKO_ADAPT_HF(template _macro(double, half, double, __VA_ARGS__));   \
+    GKO_ADAPT_HF(template _macro(double, float, half, __VA_ARGS__));    \
     GKO_ADAPT_HF(template _macro(double, double, half, __VA_ARGS__))
 
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
-    template _macro(std::complex<float>, std::complex<float>,         \
-                    std::complex<float>, __VA_ARGS__);                \
-    template _macro(std::complex<float>, std::complex<float>,         \
-                    std::complex<double>, __VA_ARGS__);               \
-    template _macro(std::complex<float>, std::complex<double>,        \
-                    std::complex<float>, __VA_ARGS__);                \
-    template _macro(std::complex<float>, std::complex<double>,        \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_BASE(_macro, ...) \
+    template _macro(std::complex<float>, std::complex<float>,              \
+                    std::complex<float>, __VA_ARGS__);                     \
+    template _macro(std::complex<float>, std::complex<float>,              \
+                    std::complex<double>, __VA_ARGS__);                    \
+    template _macro(std::complex<float>, std::complex<double>,             \
+                    std::complex<float>, __VA_ARGS__);                     \
+    template _macro(std::complex<float>, std::complex<double>,             \
                     std::complex<double>, __VA_ARGS__)
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro,  \
-                                                                   ...)     \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__);  \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...)       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_BASE(_macro,           \
+                                                          __VA_ARGS__);     \
     GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,   \
                                  std::complex<half>, __VA_ARGS__));         \
     GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>,   \
@@ -68,19 +68,19 @@
     GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<double>, \
                                  std::complex<half>, __VA_ARGS__))
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
-    template _macro(std::complex<double>, std::complex<float>,        \
-                    std::complex<float>, __VA_ARGS__);                \
-    template _macro(std::complex<double>, std::complex<float>,        \
-                    std::complex<double>, __VA_ARGS__);               \
-    template _macro(std::complex<double>, std::complex<double>,       \
-                    std::complex<float>, __VA_ARGS__);                \
-    template _macro(std::complex<double>, std::complex<double>,       \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_BASE(_macro, ...) \
+    template _macro(std::complex<double>, std::complex<float>,             \
+                    std::complex<float>, __VA_ARGS__);                     \
+    template _macro(std::complex<double>, std::complex<float>,             \
+                    std::complex<double>, __VA_ARGS__);                    \
+    template _macro(std::complex<double>, std::complex<double>,            \
+                    std::complex<float>, __VA_ARGS__);                     \
+    template _macro(std::complex<double>, std::complex<double>,            \
                     std::complex<double>, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro,   \
-                                                                   ...)      \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__);   \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...)        \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_BASE(_macro,            \
+                                                          __VA_ARGS__);      \
     GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,   \
                                  std::complex<half>, __VA_ARGS__));          \
     GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,   \
@@ -92,20 +92,18 @@
     GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<double>, \
                                  std::complex<half>, __VA_ARGS__))
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__));          \
-    GKO_ADAPT_HF(template _macro(half, half, float, __VA_ARGS__));         \
-    GKO_ADAPT_HF(template _macro(half, half, double, __VA_ARGS__));        \
-    GKO_ADAPT_HF(template _macro(half, float, half, __VA_ARGS__));         \
-    GKO_ADAPT_HF(template _macro(half, float, float, __VA_ARGS__));        \
-    GKO_ADAPT_HF(template _macro(half, float, double, __VA_ARGS__));       \
-    GKO_ADAPT_HF(template _macro(half, double, half, __VA_ARGS__));        \
-    GKO_ADAPT_HF(template _macro(half, double, float, __VA_ARGS__));       \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \
+    GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__));     \
+    GKO_ADAPT_HF(template _macro(half, half, float, __VA_ARGS__));    \
+    GKO_ADAPT_HF(template _macro(half, half, double, __VA_ARGS__));   \
+    GKO_ADAPT_HF(template _macro(half, float, half, __VA_ARGS__));    \
+    GKO_ADAPT_HF(template _macro(half, float, float, __VA_ARGS__));   \
+    GKO_ADAPT_HF(template _macro(half, float, double, __VA_ARGS__));  \
+    GKO_ADAPT_HF(template _macro(half, double, half, __VA_ARGS__));   \
+    GKO_ADAPT_HF(template _macro(half, double, float, __VA_ARGS__));  \
     GKO_ADAPT_HF(template _macro(half, double, double, __VA_ARGS__))
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \
-                                                                   ...)    \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...)      \
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
                                  std::complex<half>, __VA_ARGS__));        \
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
@@ -127,79 +125,70 @@
 
 #else
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_BASE(_macro, ...) \
     template _macro(float, float, float, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_BASE(_macro, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_BASE(_macro, ...) \
     template _macro(double, double, double, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_BASE(_macro, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
-    template _macro(std::complex<float>, std::complex<float>,         \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_BASE(_macro, ...) \
+    template _macro(std::complex<float>, std::complex<float>,              \
                     std::complex<float>, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_BASE(_macro, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
-    template _macro(std::complex<double>, std::complex<double>,       \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_BASE(_macro, ...) \
+    template _macro(std::complex<double>, std::complex<double>,            \
                     std::complex<double>, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_BASE(_macro, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \
-                                                                   ...)    \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \
     GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__))
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \
-                                                                   ...)    \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>,   \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...)    \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>, \
                                  std::complex<half>, __VA_ARGS__))
 
 
 #endif
 
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_BASE(_macro, ...)     \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_BASE(_macro,       \
+                                                          __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_BASE(_macro,       \
+                                                          __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_BASE(_macro,       \
+                                                          __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_BASE(_macro, __VA_ARGS__)
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)             \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__)
-
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, ...)     \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro,       \
-                                                               __VA_ARGS__)
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_BASE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_BASE(_macro, int32);       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_BASE(_macro, int64)
 
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32);       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int32);       \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int64)
-
 #ifdef GINKGO_MIXED_PRECISION
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)             \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, ...)        \
     template _macro(float, float, __VA_ARGS__);                              \
     template _macro(float, double, __VA_ARGS__);                             \
     template _macro(double, float, __VA_ARGS__);                             \
@@ -209,8 +198,8 @@
     template _macro(std::complex<double>, std::complex<float>, __VA_ARGS__); \
     template _macro(std::complex<double>, std::complex<double>, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...)     \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__);          \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)               \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, __VA_ARGS__);     \
     GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__));                    \
     GKO_ADAPT_HF(template _macro(half, float, __VA_ARGS__));                   \
     GKO_ADAPT_HF(template _macro(half, double, __VA_ARGS__));                  \
@@ -227,27 +216,26 @@
     GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>,     \
                                  __VA_ARGS__))
 #else
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)            \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, ...)       \
     template _macro(float, float, __VA_ARGS__);                             \
     template _macro(double, double, __VA_ARGS__);                           \
     template _macro(std::complex<float>, std::complex<float>, __VA_ARGS__); \
     template _macro(std::complex<double>, std::complex<double>, __VA_ARGS__)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...) \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__);      \
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...)           \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, __VA_ARGS__); \
     GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__));                \
     GKO_ADAPT_HF(                                                          \
         template _macro(std::complex<half>, std::complex<half>, __VA_ARGS__))
 #endif
 
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_BASE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, int32);       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_BASE(_macro, int64)
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro) \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int32);       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int64)
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(  \
-    _macro)                                                               \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int32); \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int64)
-
 #endif  // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index 0897349d08c..d84b97a213b 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -984,14 +984,11 @@ void write_raw(std::ostream& os, const matrix_data<ValueType, IndexType>& data,
                           const matrix_data<ValueType, IndexType>& data)
 #define GKO_DECLARE_READ_GENERIC_RAW(ValueType, IndexType) \
     matrix_data<ValueType, IndexType> read_generic_raw(std::istream& is)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_READ_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_WRITE_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_READ_BINARY_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_WRITE_BINARY_RAW);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_READ_GENERIC_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_BINARY_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_BINARY_RAW);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_GENERIC_RAW);
 
 
 }  // namespace gko
diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp
index b17cba209e1..87501361c05 100644
--- a/core/base/perturbation.cpp
+++ b/core/base/perturbation.cpp
@@ -182,7 +182,7 @@ void Perturbation<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 
 
 #define GKO_DECLARE_PERTURBATION(_type) class Perturbation<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PERTURBATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PERTURBATION);
 
 
 }  // namespace gko
diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp
index 4c6356799f9..d113139f8e2 100644
--- a/core/base/segmented_array.cpp
+++ b/core/base/segmented_array.cpp
@@ -180,7 +180,7 @@ segmented_array<T>& segmented_array<T>::operator=(segmented_array&& other)
 
 #define GKO_DECLARE_SEGMENTED_ARRAY(_type) struct segmented_array<_type>
 
-GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(GKO_DECLARE_SEGMENTED_ARRAY);
+GKO_INSTANTIATE_FOR_EACH_POD_TYPE(GKO_DECLARE_SEGMENTED_ARRAY);
 
 
 }  // namespace gko
diff --git a/core/config/factorization_config.cpp b/core/config/factorization_config.cpp
index dae4072cce8..259d32cb872 100644
--- a/core/config/factorization_config.cpp
+++ b/core/config/factorization_config.cpp
@@ -23,18 +23,15 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ic,
-                                         gko::factorization::Ic);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ilu,
-                                         gko::factorization::Ilu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    Cholesky, gko::experimental::factorization::Cholesky);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Lu,
-                                         gko::experimental::factorization::Lu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlu, gko::factorization::ParIlu);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlut, gko::factorization::ParIlut);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIc, gko::factorization::ParIc);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIct, gko::factorization::ParIct);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ic, gko::factorization::Ic);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ilu, gko::factorization::Ilu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Cholesky,
+                               gko::experimental::factorization::Cholesky);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Lu, gko::experimental::factorization::Lu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlu, gko::factorization::ParIlu);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlut, gko::factorization::ParIlut);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIc, gko::factorization::ParIc);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIct, gko::factorization::ParIct);
 
 
 }  // namespace config
diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp
index 8cc4b4e1ca3..83be1a1742b 100644
--- a/core/config/multigrid_config.cpp
+++ b/core/config/multigrid_config.cpp
@@ -10,7 +10,7 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Pgm, gko::multigrid::Pgm);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Pgm, gko::multigrid::Pgm);
 
 
 }  // namespace config
diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp
index e3734e5db7a..f40aa8cb9dd 100644
--- a/core/config/parse_macro.hpp
+++ b/core/config/parse_macro.hpp
@@ -16,7 +16,7 @@
 
 
 // for value_type only
-#define GKO_PARSE_VALUE_TYPE_(_type, _configurator, _value_type_list)        \
+#define GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator, _value_type_list)   \
     template <>                                                              \
     deferred_factory_parameter<gko::LinOpFactory>                            \
     parse<gko::config::LinOpFactoryType::_type>(                             \
@@ -33,16 +33,17 @@
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
-#define GKO_PARSE_VALUE_TYPE(_type, _configurator) \
-    GKO_PARSE_VALUE_TYPE_(_type, _configurator, gko::config::value_type_list())
+#define GKO_PARSE_VALUE_TYPE_BASE(_type, _configurator) \
+    GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator,    \
+                               gko::config::value_type_list())
 
-#define GKO_PARSE_VALUE_TYPE_WITH_HALF(_type, _configurator) \
-    GKO_PARSE_VALUE_TYPE_(_type, _configurator,              \
-                          gko::config::value_type_list_with_half())
+#define GKO_PARSE_VALUE_TYPE(_type, _configurator)   \
+    GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator, \
+                               gko::config::value_type_list_with_half())
 
 // for value_type and index_type
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,                 \
-                                        _value_type_list)                     \
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,            \
+                                             _value_type_list)                \
     template <>                                                               \
     deferred_factory_parameter<gko::LinOpFactory>                             \
     parse<gko::config::LinOpFactoryType::_type>(                              \
@@ -62,13 +63,13 @@
                   "This assert is used to counter the false positive extra "  \
                   "semi-colon warnings")
 
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \
-    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,    \
-                                    gko::config::value_type_list())
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE(_type, _configurator) \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,    \
+                                         gko::config::value_type_list())
 
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(_type, _configurator) \
-    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,              \
-                                    gko::config::value_type_list_with_half())
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(                    \
+        _type, _configurator, gko::config::value_type_list_with_half())
 
 
 #endif  // GKO_CORE_CONFIG_PARSE_MACRO_HPP_
diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp
index 840094b51c9..3db65e76a5f 100644
--- a/core/config/preconditioner_config.cpp
+++ b/core/config/preconditioner_config.cpp
@@ -19,9 +19,10 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_AND_INDEX_TYPE(GaussSeidel, gko::preconditioner::GaussSeidel);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Jacobi, gko::preconditioner::Jacobi);
-GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE(GaussSeidel,
+                                    gko::preconditioner::GaussSeidel);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Jacobi, gko::preconditioner::Jacobi);
+GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE(Sor, gko::preconditioner::Sor);
 
 
 }  // namespace config
diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp
index 04bf5f5fcd5..d13c20f901e 100644
--- a/core/config/solver_config.cpp
+++ b/core/config/solver_config.cpp
@@ -30,20 +30,19 @@ namespace gko {
 namespace config {
 
 
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Cg, gko::solver::Cg);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicg, gko::solver::Bicg);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicgstab, gko::solver::Bicgstab);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Cgs, gko::solver::Cgs);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Fcg, gko::solver::Fcg);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Ir, gko::solver::Ir);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Idr, gko::solver::Idr);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Gcr, gko::solver::Gcr);
-GKO_PARSE_VALUE_TYPE_WITH_HALF(Gmres, gko::solver::Gmres);
-GKO_PARSE_VALUE_TYPE(CbGmres, gko::solver::CbGmres);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Direct,
-                                         gko::experimental::solver::Direct);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(LowerTrs, gko::solver::LowerTrs);
-GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(UpperTrs, gko::solver::UpperTrs);
+GKO_PARSE_VALUE_TYPE(Cg, gko::solver::Cg);
+GKO_PARSE_VALUE_TYPE(Bicg, gko::solver::Bicg);
+GKO_PARSE_VALUE_TYPE(Bicgstab, gko::solver::Bicgstab);
+GKO_PARSE_VALUE_TYPE(Cgs, gko::solver::Cgs);
+GKO_PARSE_VALUE_TYPE(Fcg, gko::solver::Fcg);
+GKO_PARSE_VALUE_TYPE(Ir, gko::solver::Ir);
+GKO_PARSE_VALUE_TYPE(Idr, gko::solver::Idr);
+GKO_PARSE_VALUE_TYPE(Gcr, gko::solver::Gcr);
+GKO_PARSE_VALUE_TYPE(Gmres, gko::solver::Gmres);
+GKO_PARSE_VALUE_TYPE_BASE(CbGmres, gko::solver::CbGmres);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(Direct, gko::experimental::solver::Direct);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(LowerTrs, gko::solver::LowerTrs);
+GKO_PARSE_VALUE_AND_INDEX_TYPE(UpperTrs, gko::solver::UpperTrs);
 
 
 template <>
diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp
index ef4cdc692f9..fe11b785d6f 100644
--- a/core/config/type_descriptor.cpp
+++ b/core/config/type_descriptor.cpp
@@ -50,7 +50,7 @@ type_descriptor make_type_descriptor()
                                          GlobalIndexType)           \
     type_descriptor                                                 \
     make_type_descriptor<ValueType, LocalIndexType, GlobalIndexType>()
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_MAKE_TYPE_DESCRIPTOR);
 
 #define GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR(LocalIndexType, GlobalIndexType) \
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 3f6cc9ab1bc..0240dabc7e4 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -80,35 +80,35 @@
 #define GKO_STUB(_macro) _macro GKO_NOT_COMPILED(GKO_HOOK_MODULE)
 
 
+#define GKO_STUB_NON_COMPLEX_VALUE_TYPE_BASE(_macro)     \
+    template <typename ValueType>                        \
+    _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro)
+
 #define GKO_STUB_NON_COMPLEX_VALUE_TYPE(_macro)          \
     template <typename ValueType>                        \
     _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro)
 
-#define GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \
-    template <typename ValueType>                         \
-    _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE);  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro)
-
-#define GKO_STUB_VALUE_TYPE(_macro)                      \
+#define GKO_STUB_VALUE_TYPE_BASE(_macro)                 \
     template <typename ValueType>                        \
     _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(_macro)
 
-#define GKO_STUB_VALUE_TYPE_WITH_HALF(_macro)            \
+#define GKO_STUB_VALUE_TYPE(_macro)                      \
     template <typename ValueType>                        \
     _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)
 
-#define GKO_STUB_VALUE_AND_SCALAR_TYPE(_macro)                       \
+#define GKO_STUB_VALUE_AND_SCALAR_TYPE_BASE(_macro)                  \
     template <typename ValueType, typename ScalarType>               \
     _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_BASE(_macro)
 
-#define GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)             \
+#define GKO_STUB_VALUE_AND_SCALAR_TYPE(_macro)                       \
     template <typename ValueType, typename ScalarType>               \
     _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)
 
 #define GKO_STUB_INDEX_TYPE(_macro)                      \
     template <typename IndexType>                        \
@@ -120,101 +120,101 @@
     _macro(LocalIndexType, GlobalIndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(_macro)
 
+#define GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro)      \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro)
+
 #define GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro)           \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro)
 
-#define GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
+#define GKO_STUB_VALUE_AND_INDEX_TYPE_BASE(_macro)                  \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_BASE(_macro)
 
 #define GKO_STUB_VALUE_AND_INDEX_TYPE(_macro)                       \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
 
-#define GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)             \
+#define GKO_STUB_VALUE_AND_INT32_TYPE_BASE(_macro)                  \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_BASE(_macro)
 
 #define GKO_STUB_VALUE_AND_INT32_TYPE(_macro)                       \
     template <typename ValueType, typename IndexType>               \
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)
 
-#define GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(_macro)             \
-    template <typename ValueType, typename IndexType>               \
-    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro)
-
-#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro)                     \
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_BASE(_macro)                \
     template <typename InputValueType, typename MatrixValueType,        \
               typename OutputValueType, typename IndexType>             \
     _macro(InputValueType, MatrixValueType, OutputValueType, IndexType) \
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                              \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro)
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_BASE(_macro)
 
-#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)           \
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro)                     \
     template <typename InputValueType, typename MatrixValueType,        \
               typename OutputValueType, typename IndexType>             \
     _macro(InputValueType, MatrixValueType, OutputValueType, IndexType) \
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                              \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro)
 
-#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)            \
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_BASE(_macro)       \
     template <typename InputValueType, typename OutputValueType, \
               typename IndexType>                                \
     _macro(InputValueType, OutputValueType, IndexType)           \
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                       \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_BASE(_macro)
 
-#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro)  \
+#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)            \
     template <typename InputValueType, typename OutputValueType, \
               typename IndexType>                                \
     _macro(InputValueType, OutputValueType, IndexType)           \
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                       \
-    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro)
 
-#define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \
-    template <typename ValueType, typename LocalIndexType, \
-              typename GlobalIndexType>                    \
-    _macro(ValueType, LocalIndexType, GlobalIndexType)     \
-        GKO_NOT_COMPILED(GKO_HOOK_MODULE);                 \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro)
+#define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(_macro) \
+    template <typename ValueType, typename LocalIndexType,      \
+              typename GlobalIndexType>                         \
+    _macro(ValueType, LocalIndexType, GlobalIndexType)          \
+        GKO_NOT_COMPILED(GKO_HOOK_MODULE);                      \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(_macro)
 
-#define GKO_STUB_TEMPLATE_TYPE(_macro)                   \
+#define GKO_STUB_TEMPLATE_TYPE_BASE(_macro)              \
     template <typename IndexType>                        \
     _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(_macro)
+    GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_BASE(_macro)
 
-#define GKO_STUB_TEMPLATE_TYPE_WITH_HALF(_macro)         \
+#define GKO_STUB_TEMPLATE_TYPE(_macro)                   \
     template <typename IndexType>                        \
     _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(_macro)
+
+#define GKO_STUB_VALUE_CONVERSION_BASE(_macro)                        \
+    template <typename SourceType, typename TargetType>               \
+    _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro)
 
 #define GKO_STUB_VALUE_CONVERSION(_macro)                             \
     template <typename SourceType, typename TargetType>               \
     _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
 
-#define GKO_STUB_VALUE_CONVERSION_WITH_HALF(_macro)                   \
+#define GKO_STUB_VALUE_CONVERSION_OR_COPY_BASE(_macro)                \
     template <typename SourceType, typename TargetType>               \
     _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_BASE(_macro)
 
 #define GKO_STUB_VALUE_CONVERSION_OR_COPY(_macro)                     \
     template <typename SourceType, typename TargetType>               \
     _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro)
 
-#define GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro)           \
-    template <typename SourceType, typename TargetType>               \
-    _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro)
-
 #define GKO_STUB_CB_GMRES(_macro)                                              \
     template <typename ValueType, typename ValueTypeKrylovBases>               \
     _macro(ValueType, ValueTypeKrylovBases) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
@@ -225,11 +225,11 @@
     _macro(ValueType, ValueTypeKrylovBases) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(_macro)
 
-#define GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(_declare, _wrapper)         \
+#define GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(_declare, _wrapper)    \
     template <typename ValueType, typename BatchMatrixType, typename PrecType> \
     _declare(ValueType, BatchMatrixType, PrecType)                             \
         GKO_NOT_COMPILED(GKO_HOOK_MODULE);                                     \
-    GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(_wrapper)
+    GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(_wrapper)
 
 
 namespace gko {
@@ -238,29 +238,27 @@ namespace GKO_HOOK_MODULE {
 namespace components {
 
 
-GKO_STUB_VALUE_CONVERSION_WITH_HALF(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_STUB_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL);
 // explicitly instantiate for size_type, as this is
 // used in the SellP format
 template GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL(size_type);
 
-GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
-GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
-GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 template <typename IndexType, typename RowPtrType>
 GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType)
@@ -332,7 +330,7 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);
 namespace distributed_vector {
 
 
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
 
 
@@ -342,9 +340,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 namespace assembly {
 
 
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 }  // namespace assembly
@@ -353,7 +351,8 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
 namespace distributed_matrix {
 
 
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
 }  // namespace distributed_matrix
@@ -362,15 +361,12 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 namespace batch_multi_vector {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
@@ -379,13 +375,10 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 namespace batch_csr {
 
 
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_csr
@@ -394,12 +387,11 @@ GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
 namespace batch_dense {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_dense
@@ -408,13 +400,10 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(
 namespace batch_ell {
 
 
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
 }  // namespace batch_ell
@@ -423,93 +412,69 @@ GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
 namespace dense {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
-GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(GKO_DECLARE_DENSE_COPY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
-GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
+GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
-    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
 
 
 }  // namespace dense
@@ -518,17 +483,13 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
 namespace diagonal {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
 }  // namespace diagonal
@@ -537,7 +498,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace batch_bicgstab {
 
 
-GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL,
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
@@ -548,7 +509,7 @@ GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
 namespace batch_cg {
 
 
-GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL,
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
@@ -559,9 +520,9 @@ GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER(
 namespace cg {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
@@ -570,9 +531,9 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
 namespace bicg {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
@@ -582,8 +543,8 @@ namespace lower_trs {
 
 
 GKO_STUB(GKO_DECLARE_LOWER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
 }  // namespace lower_trs
@@ -593,8 +554,8 @@ namespace upper_trs {
 
 
 GKO_STUB(GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
 }  // namespace upper_trs
@@ -603,9 +564,9 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 namespace fcg {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
@@ -614,11 +575,11 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
 namespace bicgstab {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
@@ -627,11 +588,11 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 namespace idr {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
@@ -640,10 +601,10 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 namespace cgs {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
@@ -651,9 +612,9 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
 namespace gcr {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 
 }  // namespace gcr
@@ -661,9 +622,9 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
 namespace common_gmres {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
 }  // namespace common_gmres
@@ -672,9 +633,9 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 namespace gmres {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 
 }  // namespace gmres
@@ -683,7 +644,7 @@ GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 namespace cb_gmres {
 
 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+GKO_STUB_VALUE_TYPE_BASE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
 GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
 GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
 GKO_STUB_CB_GMRES_CONST(GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
@@ -704,10 +665,9 @@ GKO_STUB(GKO_DECLARE_IR_INITIALIZE_KERNEL);
 namespace multigrid {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
-GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
 }  // namespace multigrid
@@ -716,21 +676,17 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
 namespace sparsity_csr {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -740,54 +696,38 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace csr {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
@@ -796,14 +736,12 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BENCHMARK_LOOKUP_KERNEL);
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 }  // namespace csr
@@ -812,20 +750,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace fbcsr {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
 }  // namespace fbcsr
@@ -834,13 +768,12 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 namespace coo {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV2_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace coo
@@ -849,19 +782,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace ell {
 
 
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_SPMV_KERNEL);
-GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_COPY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace ell
@@ -873,17 +802,17 @@ namespace fft {
 template <typename ValueType>
 GKO_DECLARE_FFT_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 template <typename ValueType>
 GKO_DECLARE_FFT2_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 template <typename ValueType>
 GKO_DECLARE_FFT3_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
@@ -894,10 +823,8 @@ namespace hybrid {
 
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL);
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
 }  // namespace hybrid
@@ -916,10 +843,8 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL);
 namespace scaled_permutation {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
 }  // namespace scaled_permutation
@@ -928,18 +853,14 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace sellp {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
 }  // namespace sellp
@@ -951,10 +872,9 @@ namespace batch_jacobi {
 GKO_STUB_INDEX_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_CUMULATIVE_BLOCK_STORAGE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_FIND_ROW_BLOCK_MAP);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
-GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
 }  // namespace batch_jacobi
@@ -963,21 +883,18 @@ GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(
 namespace jacobi {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_GENERATE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_GENERATE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL);
 
 
@@ -987,9 +904,8 @@ GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL);
 namespace sor {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
 }  // namespace sor
@@ -998,16 +914,11 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace isai {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
 }  // namespace isai
@@ -1016,13 +927,11 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace cholesky {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_INITIALIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
@@ -1031,16 +940,14 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_FACTORIZE);
 namespace factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
 }  // namespace factorization
@@ -1049,7 +956,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace ic_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
 }  // namespace ic_factorization
@@ -1058,7 +965,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 namespace ilu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
 }  // namespace ilu_factorization
@@ -1067,8 +974,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 namespace lu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_INITIALIZE);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_FACTORIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
 
@@ -1079,9 +986,8 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
 namespace par_ic_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
 }  // namespace par_ic_factorization
@@ -1090,10 +996,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace par_ict_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
 }  // namespace par_ict_factorization
@@ -1102,8 +1006,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace par_ilu_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
 }  // namespace par_ilu_factorization
@@ -1112,15 +1015,11 @@ GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
 namespace par_ilut_factorization {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -1146,12 +1045,11 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_SORT_AGG_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_ROW_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_COL_KERNEL);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_COUNT_UNREPEATED_NNZ_KERNEL);
-GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
-GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
-GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
+GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_GATHER_INDEX);
 
 
@@ -1170,7 +1068,7 @@ GKO_STUB(GKO_DECLARE_SET_ALL_STATUSES_KERNEL);
 namespace residual_norm {
 
 
-GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
+GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace residual_norm
@@ -1179,7 +1077,7 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 namespace implicit_residual_norm {
 
 
-GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/core/distributed/assembly.cpp b/core/distributed/assembly.cpp
index 424e641f845..116cf83ee94 100644
--- a/core/distributed/assembly.cpp
+++ b/core/distributed/assembly.cpp
@@ -135,7 +135,7 @@ device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
         mpi::communicator comm,                                            \
         const device_matrix_data<_value_type, _global_type>& input,        \
         ptr_param<const Partition<_local_type, _global_type>> partition)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_ASSEMBLE_ROWS_FROM_NEIGHBORS);
 
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 191c3cc0add..7f5d446e24e 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -664,7 +664,7 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(Matrix&& other)
 #define GKO_DECLARE_DISTRIBUTED_MATRIX(ValueType, LocalIndexType, \
                                        GlobalIndexType)           \
     class Matrix<ValueType, LocalIndexType, GlobalIndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_MATRIX);
 
 
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 965414349d6..901d2ee1527 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -144,7 +144,8 @@ void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::generate(
 
 #define GKO_DECLARE_SCHWARZ(ValueType, LocalIndexType, GlobalIndexType) \
     class Schwarz<ValueType, LocalIndexType, GlobalIndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SCHWARZ);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_SCHWARZ);
 
 
 }  // namespace preconditioner
diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp
index 0f2aeda7b5e..6dc7773d777 100644
--- a/core/distributed/vector.cpp
+++ b/core/distributed/vector.cpp
@@ -724,7 +724,7 @@ std::unique_ptr<Vector<ValueType>> Vector<ValueType>::create_with_type_of_impl(
 
 
 #define GKO_DECLARE_DISTRIBUTED_VECTOR(ValueType) class Vector<ValueType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DISTRIBUTED_VECTOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_DISTRIBUTED_VECTOR);
 
 
 }  // namespace distributed
diff --git a/core/distributed/vector_cache.cpp b/core/distributed/vector_cache.cpp
index e6b1af5fc42..683d18dfd98 100644
--- a/core/distributed/vector_cache.cpp
+++ b/core/distributed/vector_cache.cpp
@@ -48,7 +48,7 @@ void VectorCache<ValueType>::init_from(
 
 
 #define GKO_DECLARE_VECTOR_CACHE(_type) class VectorCache<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_VECTOR_CACHE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_VECTOR_CACHE);
 
 
 }  // namespace detail
diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp
index a552ec37ec1..92d598f0bd7 100644
--- a/core/factorization/cholesky.cpp
+++ b/core/factorization/cholesky.cpp
@@ -146,7 +146,7 @@ std::unique_ptr<LinOp> Cholesky<ValueType, IndexType>::generate_impl(
 #define GKO_DECLARE_CHOLESKY(ValueType, IndexType) \
     class Cholesky<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY);
 
 
 }  // namespace factorization
diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp
index f8d6d861c2d..1dc8ff060a0 100644
--- a/core/factorization/elimination_forest.cpp
+++ b/core/factorization/elimination_forest.cpp
@@ -173,8 +173,7 @@ void compute_elim_forest(const matrix::Csr<ValueType, IndexType>* mtx,
         const matrix::Csr<ValueType, IndexType>* mtx,         \
         std::unique_ptr<elimination_forest<IndexType>>& forest)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COMPUTE_ELIM_FOREST);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST);
 
 
 }  // namespace factorization
diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index e0e4ccdc3c7..1df1f49aa13 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -362,8 +362,7 @@ void Factorization<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 #define GKO_DECLARE_FACTORIZATION(ValueType, IndexType) \
     class Factorization<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FACTORIZATION);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION);
 
 
 }  // namespace factorization
diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp
index d8706c8b8e3..bf9d5e7bbf4 100644
--- a/core/factorization/ic.cpp
+++ b/core/factorization/ic.cpp
@@ -203,7 +203,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
 
 
 #define GKO_DECLARE_IC(ValueType, IndexType) class Ic<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC);
 
 
 }  // namespace factorization
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
index 1c6079700e3..f7703f3d20b 100644
--- a/core/factorization/ilu.cpp
+++ b/core/factorization/ilu.cpp
@@ -188,7 +188,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
 
 
 #define GKO_DECLARE_ILU(ValueType, IndexType) class Ilu<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index dfdce26f140..4feb78083d2 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -166,7 +166,7 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
 
 #define GKO_DECLARE_LU(ValueType, IndexType) class Lu<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp
index b310025eb8d..f4a4afd23d6 100644
--- a/core/factorization/par_ic.cpp
+++ b/core/factorization/par_ic.cpp
@@ -146,7 +146,7 @@ std::unique_ptr<Composition<ValueType>> ParIc<ValueType, IndexType>::generate(
 
 #define GKO_DECLARE_PAR_IC(ValueType, IndexType) \
     class ParIc<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
index 696b185e969..a0e8a628ca8 100644
--- a/core/factorization/par_ict.cpp
+++ b/core/factorization/par_ict.cpp
@@ -300,7 +300,7 @@ void ParIctState<ValueType, IndexType>::iterate()
 
 #define GKO_DECLARE_PAR_ICT(ValueType, IndexType) \
     class ParIct<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ICT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index 177c150df1d..68c0c0c4fc6 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -161,7 +161,7 @@ ParIlu<ValueType, IndexType>::generate_l_u(
 
 #define GKO_DECLARE_PAR_ILU(ValueType, IndexType) \
     class ParIlu<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU);
 
 
 }  // namespace factorization
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
index e90dbb8140f..42e3cc03130 100644
--- a/core/factorization/par_ilut.cpp
+++ b/core/factorization/par_ilut.cpp
@@ -352,7 +352,7 @@ void ParIlutState<ValueType, IndexType>::iterate()
 
 #define GKO_DECLARE_PAR_ILUT(ValueType, IndexType) \
     class ParIlut<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILUT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT);
 
 
 }  // namespace factorization
diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp
index 495b830d7ea..23f6b94cc14 100644
--- a/core/factorization/symbolic.cpp
+++ b/core/factorization/symbolic.cpp
@@ -80,8 +80,7 @@ void symbolic_cholesky(
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors,   \
         std::unique_ptr<factorization::elimination_forest<IndexType>>& forest)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SYMBOLIC_CHOLESKY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY);
 
 
 template <typename ValueType, typename IndexType>
@@ -159,7 +158,7 @@ void symbolic_lu_near_symm(
         const matrix::Csr<ValueType, IndexType>* mtx,           \
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SYMBOLIC_LU_NEAR_SYMM);
 
 
@@ -246,8 +245,7 @@ void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
         const matrix::Csr<ValueType, IndexType>* mtx, \
         std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SYMBOLIC_LU);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_LU);
 
 
 }  // namespace factorization
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index 86c6ea647f2..f274019016f 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -65,7 +65,7 @@ log_data<ValueType>::log_data(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_LOG_DATA(_type) struct log_data<_type>
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_LOG_DATA);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_LOG_DATA);
 
 #undef GKO_DECLARE_LOG_DATA
 
@@ -92,7 +92,7 @@ void BatchConvergence<ValueType>::on_batch_solver_completed(
 
 
 #define GKO_DECLARE_BATCH_CONVERGENCE(_type) class BatchConvergence<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CONVERGENCE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CONVERGENCE);
 
 
 }  // namespace log
diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
index 78f004226cb..7cfa764dfd1 100644
--- a/core/log/convergence.cpp
+++ b/core/log/convergence.cpp
@@ -110,7 +110,7 @@ void Convergence<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_CONVERGENCE(_type) class Convergence<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CONVERGENCE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONVERGENCE);
 
 
 }  // namespace log
diff --git a/core/log/papi.cpp b/core/log/papi.cpp
index b5c56527687..5ced377ca38 100644
--- a/core/log/papi.cpp
+++ b/core/log/papi.cpp
@@ -279,7 +279,7 @@ void Papi<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_PAPI(_type) class Papi<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PAPI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PAPI);
 
 
 }  // namespace log
diff --git a/core/log/stream.cpp b/core/log/stream.cpp
index 69eef2e0949..5e510d409e2 100644
--- a/core/log/stream.cpp
+++ b/core/log/stream.cpp
@@ -482,7 +482,7 @@ void Stream<ValueType>::on_iteration_complete(
 
 
 #define GKO_DECLARE_STREAM(_type) class Stream<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_STREAM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_STREAM);
 
 
 }  // namespace log
diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp
index 3abd1856ce2..50ccc0a13d8 100644
--- a/core/matrix/batch_csr.cpp
+++ b/core/matrix/batch_csr.cpp
@@ -285,7 +285,7 @@ void Csr<ValueType, IndexType>::move_to(
 
 
 #define GKO_DECLARE_BATCH_CSR_MATRIX(ValueType) class Csr<ValueType, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index c256dad59cc..f56e512d41e 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -279,7 +279,7 @@ void Dense<ValueType>::move_to(
 
 
 #define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class Dense<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_DENSE_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 7d18cc1e0ea..288d053e219 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -305,7 +305,7 @@ void Ell<ValueType, IndexType>::move_to(
 
 
 #define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell<ValueType, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_ELL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp
index 6ee2d55f6fe..2220120d00b 100644
--- a/core/matrix/batch_identity.cpp
+++ b/core/matrix/batch_identity.cpp
@@ -113,8 +113,7 @@ void Identity<ValueType>::apply_impl(const MultiVector<ValueType>* alpha,
 
 
 #define GKO_DECLARE_BATCH_IDENTITY_MATRIX(ValueType) class Identity<ValueType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_IDENTITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_IDENTITY_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 38894bdb447..6316e8e948a 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -425,7 +425,7 @@ Coo<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_COO_MATRIX(ValueType, IndexType) \
     class Coo<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 4097fbed1a0..ca418241bf9 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -1068,7 +1068,7 @@ void Csr<ValueType, IndexType>::add_scaled_identity_impl(const LinOp* a,
 
 #define GKO_DECLARE_CSR_MATRIX(ValueType, IndexType) \
     class Csr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 1fd20a1db3d..308b5e8f11e 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -2056,7 +2056,7 @@ Dense<ValueType>::Dense(std::shared_ptr<const Executor> exec,
 
 
 #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp
index 6a0c4edc1b5..f29c7e036b2 100644
--- a/core/matrix/diagonal.cpp
+++ b/core/matrix/diagonal.cpp
@@ -392,7 +392,7 @@ std::unique_ptr<const Diagonal<ValueType>> Diagonal<ValueType>::create_const(
 
 
 #define GKO_DECLARE_DIAGONAL_MATRIX(value_type) class Diagonal<value_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_MATRIX);
 
 
 }  // namespace matrix
@@ -410,7 +410,7 @@ std::unique_ptr<LinOp> DiagonalExtractable<ValueType>::extract_diagonal_linop()
 #define GKO_DECLARE_DIAGONAL_EXTRACTABLE(value_type) \
     std::unique_ptr<LinOp>                           \
     DiagonalExtractable<value_type>::extract_diagonal_linop() const
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_EXTRACTABLE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_EXTRACTABLE);
 
 
 }  // namespace gko
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 98fbfc94c7d..da166ba541e 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -423,7 +423,7 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_ELL_MATRIX(ValueType, IndexType) \
     class Ell<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index b9e8c6b00b6..8778df25375 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -498,8 +498,7 @@ Fbcsr<ValueType, IndexType>::Fbcsr(std::shared_ptr<const Executor> exec,
 
 #define GKO_DECLARE_FBCSR_MATRIX(ValueType, IndexType) \
     class Fbcsr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index 95a95b3a619..4b36b7115ac 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -441,8 +441,7 @@ Hybrid<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_HYBRID_MATRIX(ValueType, IndexType) \
     class Hybrid<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_HYBRID_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp
index ecd93b6f959..7e035be82a3 100644
--- a/core/matrix/identity.cpp
+++ b/core/matrix/identity.cpp
@@ -83,9 +83,9 @@ std::unique_ptr<Identity<ValueType>> Identity<ValueType>::create(
 
 
 #define GKO_DECLARE_IDENTITY_MATRIX(_type) class Identity<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_MATRIX);
 #define GKO_DECLARE_IDENTITY_FACTORY(_type) class IdentityFactory<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_FACTORY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_FACTORY);
 
 
 }  // namespace matrix
diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp
index bbe353e543e..0f295d6b5be 100644
--- a/core/matrix/scaled_permutation.cpp
+++ b/core/matrix/scaled_permutation.cpp
@@ -174,7 +174,7 @@ void ScaledPermutation<ValueType, IndexType>::write(
 
 #define GKO_DECLARE_SCALED_PERMUTATION_MATRIX(ValueType, IndexType) \
     class ScaledPermutation<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SCALED_PERMUTATION_MATRIX);
 
 
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 3422d249b40..d4cff180295 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -387,8 +387,7 @@ Sellp<ValueType, IndexType>::compute_absolute() const
 
 #define GKO_DECLARE_SELLP_MATRIX(ValueType, IndexType) \
     class Sellp<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index a4d8b2fa281..9b8ea04da52 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -346,8 +346,7 @@ bool SparsityCsr<ValueType, IndexType>::is_sorted_by_column_index() const
 
 #define GKO_DECLARE_SPARSITY_MATRIX(ValueType, IndexType) \
     class SparsityCsr<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SPARSITY_MATRIX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp
index f62ce746d6b..1cbdd557fb4 100644
--- a/core/multigrid/fixed_coarsening.cpp
+++ b/core/multigrid/fixed_coarsening.cpp
@@ -90,8 +90,7 @@ void FixedCoarsening<ValueType, IndexType>::generate()
 
 #define GKO_DECLARE_FIXED_COARSENING(_vtype, _itype) \
     class FixedCoarsening<_vtype, _itype>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FIXED_COARSENING);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FIXED_COARSENING);
 
 
 }  // namespace multigrid
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index d11ebf32399..468a27e8ce4 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -551,7 +551,7 @@ void Pgm<ValueType, IndexType>::generate()
 
 
 #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM);
 
 
 }  // namespace multigrid
diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp
index 53809a82a5a..e4382de38ec 100644
--- a/core/preconditioner/batch_jacobi.cpp
+++ b/core/preconditioner/batch_jacobi.cpp
@@ -175,7 +175,7 @@ void Jacobi<ValueType, IndexType>::generate_precond(
 
 
 #define GKO_DECLARE_BATCH_JACOBI(_type) class Jacobi<_type, int32>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_JACOBI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_JACOBI);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/gauss_seidel.cpp b/core/preconditioner/gauss_seidel.cpp
index f4735cff5bc..aec7a4ff827 100644
--- a/core/preconditioner/gauss_seidel.cpp
+++ b/core/preconditioner/gauss_seidel.cpp
@@ -71,8 +71,7 @@ std::unique_ptr<LinOp> GaussSeidel<ValueType, IndexType>::generate_impl(
 #define GKO_DECLARE_GAUSS_SEIDEL(ValueType, IndexType) \
     class GaussSeidel<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_GAUSS_SEIDEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GAUSS_SEIDEL);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp
index 2e9833c21f7..691795ad60b 100644
--- a/core/preconditioner/ic.cpp
+++ b/core/preconditioner/ic.cpp
@@ -50,32 +50,28 @@ typename Ic::parameters_type ic_parse(
     ic_parse<Ic<solver::LowerTrs<ValueType, IndexType>, IndexType>>( \
         const config::pnode&, const config::registry&,               \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LOWERTRS_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERTRS_IC_PARSE);
 
 #define GKO_DECLARE_IR_IC_PARSE(ValueType, IndexType)              \
     typename Ic<solver::Ir<ValueType>, IndexType>::parameters_type \
     ic_parse<Ic<solver::Ir<ValueType>, IndexType>>(                \
         const config::pnode&, const config::registry&,             \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_IR_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_IC_PARSE);
 
 #define GKO_DECLARE_GMRES_IC_PARSE(ValueType, IndexType)              \
     typename Ic<solver::Gmres<ValueType>, IndexType>::parameters_type \
     ic_parse<Ic<solver::Gmres<ValueType>, IndexType>>(                \
         const config::pnode&, const config::registry&,                \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_IC_PARSE);
 
 #define GKO_DECLARE_LOWERISAI_IC_PARSE(ValueType, IndexType)                 \
     typename Ic<LowerIsai<ValueType, IndexType>, IndexType>::parameters_type \
     ic_parse<Ic<LowerIsai<ValueType, IndexType>, IndexType>>(                \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LOWERISAI_IC_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERISAI_IC_PARSE);
 
 }  // namespace detail
 }  // namespace preconditioner
diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp
index dae6cf97829..d6f49e49588 100644
--- a/core/preconditioner/ilu.cpp
+++ b/core/preconditioner/ilu.cpp
@@ -59,8 +59,7 @@ typename Ilu::parameters_type ilu_parse(
                   solver::UpperTrs<ValueType, IndexType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                        \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_TRS_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_TRS_ILU_TRUE_PARSE(ValueType, IndexType)                 \
     typename Ilu<solver::LowerTrs<ValueType, IndexType>,                     \
@@ -70,8 +69,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
                   solver::UpperTrs<ValueType, IndexType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_TRS_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_GMRES_ILU_FALSE_PARSE(ValueType, IndexType)              \
     typename Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, false,  \
@@ -79,7 +77,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     ilu_parse<Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, false, \
                   IndexType>>(const config::pnode&, const config::registry&, \
                               const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_GMRES_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_GMRES_ILU_TRUE_PARSE(ValueType, IndexType)               \
@@ -88,8 +86,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
     ilu_parse<Ilu<solver::Gmres<ValueType>, solver::Gmres<ValueType>, true,  \
                   IndexType>>(const config::pnode&, const config::registry&, \
                               const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_IR_ILU_FALSE_PARSE(ValueType, IndexType)                  \
     typename Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, false,         \
@@ -98,8 +95,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
         Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                        \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_IR_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_IR_ILU_TRUE_PARSE(ValueType, IndexType)                  \
     typename Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, true,         \
@@ -108,8 +104,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
         Ilu<solver::Ir<ValueType>, solver::Ir<ValueType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                       \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_IR_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_TRUE_PARSE);
 
 #define GKO_DECLARE_ISAI_ILU_FALSE_PARSE(ValueType, IndexType)         \
     typename Ilu<LowerIsai<ValueType, IndexType>,                      \
@@ -119,8 +114,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
                   UpperIsai<ValueType, IndexType>, false, IndexType>>( \
         const config::pnode&, const config::registry&,                 \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_ILU_FALSE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_FALSE_PARSE);
 
 #define GKO_DECLARE_ISAI_ILU_TRUE_PARSE(ValueType, IndexType)         \
     typename Ilu<LowerIsai<ValueType, IndexType>,                     \
@@ -130,8 +124,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
                   UpperIsai<ValueType, IndexType>, true, IndexType>>( \
         const config::pnode&, const config::registry&,                \
         const config::type_descriptor&)
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ISAI_ILU_TRUE_PARSE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_TRUE_PARSE);
 
 
 }  // namespace detail
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
index ec0ef365592..9684f1bdb27 100644
--- a/core/preconditioner/isai.cpp
+++ b/core/preconditioner/isai.cpp
@@ -358,20 +358,19 @@ std::unique_ptr<LinOp> Isai<IsaiType, ValueType, IndexType>::conj_transpose()
 
 #define GKO_DECLARE_LOWER_ISAI(ValueType, IndexType) \
     class Isai<isai_type::lower, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_ISAI);
 
 #define GKO_DECLARE_UPPER_ISAI(ValueType, IndexType) \
     class Isai<isai_type::upper, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_ISAI);
 
 #define GKO_DECLARE_GENERAL_ISAI(ValueType, IndexType) \
     class Isai<isai_type::general, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_GENERAL_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GENERAL_ISAI);
 
 #define GKO_DECLARE_SPD_ISAI(ValueType, IndexType) \
     class Isai<isai_type::spd, ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SPD_ISAI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPD_ISAI);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index 556fb9bc0df..1164f6c0cb8 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -376,7 +376,7 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
 
 #define GKO_DECLARE_JACOBI(ValueType, IndexType) \
     class Jacobi<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI);
 
 
 }  // namespace preconditioner
diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp
index b671a99c6fb..c9905c5f73c 100644
--- a/core/preconditioner/sor.cpp
+++ b/core/preconditioner/sor.cpp
@@ -161,7 +161,7 @@ std::unique_ptr<LinOp> Sor<ValueType, IndexType>::generate_impl(
 
 #define GKO_DECLARE_SOR(ValueType, IndexType) class Sor<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR);
 
 
 }  // namespace preconditioner
diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp
index 1319dea252a..26a1b5bb0ad 100644
--- a/core/reorder/mc64.cpp
+++ b/core/reorder/mc64.cpp
@@ -459,14 +459,13 @@ void compute_scaling(const matrix::Csr<ValueType, IndexType>* host_mtx,
 }
 
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_MC64_INITIALIZE_WEIGHTS);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_MC64_INITIAL_MATCHING);
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_MC64_COMPUTE_SCALING);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64_COMPUTE_SCALING);
 
 
 }  // namespace mc64
@@ -588,7 +587,7 @@ std::unique_ptr<LinOp> Mc64<ValueType, IndexType>::generate_impl(
 
 
 #define GKO_DECLARE_MC64(ValueType, IndexType) class Mc64<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_MC64);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64);
 
 
 }  // namespace reorder
diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp
index e14af9ffbfc..1527739d43d 100644
--- a/core/reorder/nested_dissection.cpp
+++ b/core/reorder/nested_dissection.cpp
@@ -159,7 +159,7 @@ std::unique_ptr<LinOp> NestedDissection<ValueType, IndexType>::generate_impl(
 
 #define GKO_DECLARE_ND(ValueType, IndexType) \
     class NestedDissection<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ND);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_BASE(GKO_DECLARE_ND);
 
 
 }  // namespace reorder
diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp
index 0d2bae4d7dc..1acf4d97f1f 100644
--- a/core/reorder/rcm.cpp
+++ b/core/reorder/rcm.cpp
@@ -114,7 +114,7 @@ Rcm<ValueType, IndexType>::Rcm(const Factory* factory,
 
 
 #define GKO_DECLARE_RCM(ValueType, IndexType) class Rcm<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_RCM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_RCM);
 
 
 }  // namespace reorder
diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp
index 210e513841b..264122c0b8f 100644
--- a/core/reorder/scaled_reordered.cpp
+++ b/core/reorder/scaled_reordered.cpp
@@ -84,8 +84,7 @@ void ScaledReordered<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 
 #define GKO_DECLARE_SCALED_REORDERED(ValueType, IndexType) \
     class ScaledReordered<ValueType, IndexType>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SCALED_REORDERED);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_REORDERED);
 
 
 }  // namespace reorder
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
index fa467c98976..73fc0a2c852 100644
--- a/core/solver/batch_bicgstab.cpp
+++ b/core/solver/batch_bicgstab.cpp
@@ -68,7 +68,7 @@ void Bicgstab<ValueType>::solver_apply(
 
 
 #define GKO_DECLARE_BATCH_BICGSTAB(_type) class Bicgstab<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_BICGSTAB);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB);
 
 
 }  // namespace solver
diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp
index c7c4da5085a..13a5afffcaa 100644
--- a/core/solver/batch_cg.cpp
+++ b/core/solver/batch_cg.cpp
@@ -69,7 +69,7 @@ void Cg<ValueType>::solver_apply(
 
 
 #define GKO_DECLARE_BATCH_CG(_type) class Cg<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BATCH_CG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG);
 
 
 }  // namespace solver
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 33d3c3938e1..53ed5b962b2 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -222,7 +222,7 @@ enum class log_type { simple_convergence_completion };
     GKO_CALL(GKO_BATCH_INSTANTIATE_MATRIX_BATCH, GKO_BATCH_INSTANTIATE_LOGGER, \
              GKO_BATCH_INSTANTIATE_DEVICE_PRECONDITIONER,                      \
              GKO_BATCH_INSTANTIATE_STOP,                                       \
-             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF, __VA_ARGS__)
+             GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS, __VA_ARGS__)
 
 
 /**
diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp
index 55d18f7f01d..0b39b3664cc 100644
--- a/core/solver/bicg.cpp
+++ b/core/solver/bicg.cpp
@@ -293,8 +293,8 @@ std::vector<int> workspace_traits<Bicg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_BICG(_type) class Bicg<_type>
 #define GKO_DECLARE_BICG_TRAITS(_type) struct workspace_traits<Bicg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp
index 1e27c046186..c254b417765 100644
--- a/core/solver/bicgstab.cpp
+++ b/core/solver/bicgstab.cpp
@@ -298,8 +298,8 @@ std::vector<int> workspace_traits<Bicgstab<ValueType>>::vectors(const Solver&)
 #define GKO_DECLARE_BICGSTAB(_type) class Bicgstab<_type>
 #define GKO_DECLARE_BICGSTAB_TRAITS(_type) \
     struct workspace_traits<Bicgstab<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp
index 274948531ab..1de3a79fb7c 100644
--- a/core/solver/cb_gmres.cpp
+++ b/core/solver/cb_gmres.cpp
@@ -518,8 +518,8 @@ void CbGmres<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
 #define GKO_DECLARE_CB_GMRES(_type1) class CbGmres<_type1>
 #define GKO_DECLARE_CB_GMRES_TRAITS(_type1) \
     struct workspace_traits<CbGmres<_type1>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_CB_GMRES);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_CB_GMRES_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp
index a7898577b8a..c512dc4313b 100644
--- a/core/solver/cg.cpp
+++ b/core/solver/cg.cpp
@@ -243,8 +243,8 @@ std::vector<int> workspace_traits<Cg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_CG(_type) class Cg<_type>
 #define GKO_DECLARE_CG_TRAITS(_type) struct workspace_traits<Cg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp
index 4ec702a8db5..19f625228a3 100644
--- a/core/solver/cgs.cpp
+++ b/core/solver/cgs.cpp
@@ -265,8 +265,8 @@ std::vector<int> workspace_traits<Cgs<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_CGS(_type) class Cgs<_type>
 #define GKO_DECLARE_CGS_TRAITS(_type) struct workspace_traits<Cgs<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp
index 69c2f9512dd..cf15bc4a9ae 100644
--- a/core/solver/direct.cpp
+++ b/core/solver/direct.cpp
@@ -221,7 +221,7 @@ void Direct<ValueType, IndexType>::apply_impl(const LinOp* alpha,
 #define GKO_DECLARE_DIRECT(ValueType, IndexType) \
     class Direct<ValueType, IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DIRECT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT);
 
 
 }  // namespace solver
@@ -283,8 +283,7 @@ std::vector<int> workspace_traits<gko::experimental::solver::Direct<
     struct workspace_traits<                            \
         gko::experimental::solver::Direct<ValueType, IndexType>>
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_DIRECT_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
index 569061626ff..6c65f63ccae 100644
--- a/core/solver/fcg.cpp
+++ b/core/solver/fcg.cpp
@@ -247,8 +247,8 @@ std::vector<int> workspace_traits<Fcg<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_FCG(_type) class Fcg<_type>
 #define GKO_DECLARE_FCG_TRAITS(_type) struct workspace_traits<Fcg<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp
index 8219de79ef4..d5131632dc3 100644
--- a/core/solver/gcr.cpp
+++ b/core/solver/gcr.cpp
@@ -371,8 +371,8 @@ std::vector<int> workspace_traits<Gcr<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_GCR(_type) class Gcr<_type>
 #define GKO_DECLARE_GCR_TRAITS(_type) struct workspace_traits<Gcr<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index 8a4fdf563c3..e066fc696a1 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -707,8 +707,8 @@ std::vector<int> workspace_traits<Gmres<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_GMRES(_type) class Gmres<_type>
 #define GKO_DECLARE_GMRES_TRAITS(_type) struct workspace_traits<Gmres<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp
index d090324fea1..b10c15136f4 100644
--- a/core/solver/idr.cpp
+++ b/core/solver/idr.cpp
@@ -406,8 +406,8 @@ std::vector<int> workspace_traits<Idr<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_IDR(_type) class Idr<_type>
 #define GKO_DECLARE_IDR_TRAITS(_type) struct workspace_traits<Idr<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp
index 3c2854dcf98..75efac351f9 100644
--- a/core/solver/ir.cpp
+++ b/core/solver/ir.cpp
@@ -370,8 +370,8 @@ std::vector<int> workspace_traits<Ir<ValueType>>::vectors(const Solver&)
 
 #define GKO_DECLARE_IR(_type) class Ir<_type>
 #define GKO_DECLARE_IR_TRAITS(_type) struct workspace_traits<Ir<_type>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp
index da16061db03..3048c877dbd 100644
--- a/core/solver/lower_trs.cpp
+++ b/core/solver/lower_trs.cpp
@@ -248,9 +248,8 @@ std::vector<int> workspace_traits<LowerTrs<ValueType, IndexType>>::vectors(
 #define GKO_DECLARE_LOWER_TRS(_vtype, _itype) class LowerTrs<_vtype, _itype>
 #define GKO_DECLARE_LOWER_TRS_TRAITS(_vtype, _itype) \
     struct workspace_traits<LowerTrs<_vtype, _itype>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LOWER_TRS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp
index 5e1dfb23df2..c759c119647 100644
--- a/core/solver/upper_trs.cpp
+++ b/core/solver/upper_trs.cpp
@@ -248,9 +248,8 @@ std::vector<int> workspace_traits<UpperTrs<ValueType, IndexType>>::vectors(
 #define GKO_DECLARE_UPPER_TRS(_vtype, _itype) class UpperTrs<_vtype, _itype>
 #define GKO_DECLARE_UPPER_TRS_TRAITS(_vtype, _itype) \
     struct workspace_traits<UpperTrs<_vtype, _itype>>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_UPPER_TRS_TRAITS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_TRAITS);
 
 
 }  // namespace solver
diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp
index 5f75efcec82..c962784033a 100644
--- a/core/stop/residual_norm.cpp
+++ b/core/stop/residual_norm.cpp
@@ -227,13 +227,12 @@ bool ImplicitResidualNorm<ValueType>::check_impl(
 
 
 #define GKO_DECLARE_RESIDUAL_NORM(_type) class ResidualNormBase<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM);
 
 
 #define GKO_DECLARE_IMPLICIT_RESIDUAL_NORM(_type) \
     class ImplicitResidualNorm<_type>
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM);
 
 
 }  // namespace stop
diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu
index 80e938fbbff..23105f3e7ae 100644
--- a/cuda/matrix/fft_kernels.cu
+++ b/cuda/matrix/fft_kernels.cu
@@ -120,7 +120,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -136,7 +136,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -152,7 +152,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 30bbc8fd2e7..2ac5717308a 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -99,7 +99,7 @@ void extract_common_blocks_pattern(
         blocks_pattern);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -156,7 +156,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 52398093ac2..b702f1d7bbd 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -152,7 +152,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index d3d93a0af6d..77068a10e94 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -133,7 +133,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 7832cf9e4c5..b37f6536b0f 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -50,7 +50,7 @@ void generate(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index b6828bc0c92..eb7d8386083 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -50,7 +50,7 @@ void generate(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 3dff550bc22..1db2b4acf84 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -104,7 +104,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -163,7 +163,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -232,7 +232,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -277,7 +277,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -336,7 +336,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -374,8 +374,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index ab9cf9ebc7a..2c26bfeeba2 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -52,7 +52,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -97,7 +97,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -115,7 +115,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 8258c6924e1..68a1232849f 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -173,7 +173,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
 
 #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE
 
-#define GKO_BIND_ATOMIC_HELPER_VALUETYPE(ValueType)                         \
+#define GKO_BIND_ATOMIC_HELPER_VALUETYPE_BASE(ValueType)                    \
     template <sycl::access::address_space addressSpace>                     \
     struct atomic_helper<addressSpace, ValueType, std::enable_if_t<true>> { \
         __dpct_inline__ static ValueType atomic_add(                        \
@@ -183,9 +183,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
         }                                                                   \
     };
 
-GKO_BIND_ATOMIC_HELPER_VALUETYPE(int);
-GKO_BIND_ATOMIC_HELPER_VALUETYPE(unsigned int);
-GKO_BIND_ATOMIC_HELPER_VALUETYPE(unsigned long long int);
+GKO_BIND_ATOMIC_HELPER_VALUETYPE_BASE(int);
+GKO_BIND_ATOMIC_HELPER_VALUETYPE_BASE(unsigned int);
+GKO_BIND_ATOMIC_HELPER_VALUETYPE_BASE(unsigned long long int);
 
 #undef GKO_BIND_ATOMIC_HELPER_VALUETYPE
 
@@ -241,7 +241,7 @@ GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int);
 
 #undef GKO_BIND_ATOMIC_MAX_STRUCTURE
 
-#define GKO_BIND_ATOMIC_MAX_VALUETYPE(ValueType)              \
+#define GKO_BIND_ATOMIC_MAX_VALUETYPE_BASE(ValueType)         \
     template <sycl::access::address_space addressSpace>       \
     struct atomic_max_helper<addressSpace, ValueType,         \
                              std::enable_if_t<true>> {        \
@@ -252,9 +252,9 @@ GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int);
         }                                                     \
     };
 
-GKO_BIND_ATOMIC_MAX_VALUETYPE(int);
-GKO_BIND_ATOMIC_MAX_VALUETYPE(unsigned int);
-GKO_BIND_ATOMIC_MAX_VALUETYPE(unsigned long long int);
+GKO_BIND_ATOMIC_MAX_VALUETYPE_BASE(int);
+GKO_BIND_ATOMIC_MAX_VALUETYPE_BASE(unsigned int);
+GKO_BIND_ATOMIC_MAX_VALUETYPE_BASE(unsigned long long int);
 
 #undef GKO_BIND_ATOMIC_MAX_VALUETYPE
 
diff --git a/dpcpp/distributed/assembly_kernels.dp.cpp b/dpcpp/distributed/assembly_kernels.dp.cpp
index e0cc872b783..3f89c45ff1f 100644
--- a/dpcpp/distributed/assembly_kernels.dp.cpp
+++ b/dpcpp/distributed/assembly_kernels.dp.cpp
@@ -23,7 +23,7 @@ void count_non_owning_entries(
     array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 47adaaeca59..ec9bc367e5a 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -27,7 +27,7 @@ void separate_local_nonlocal(
     array<GlobalIndexType>& non_local_col_idxs,
     array<ValueType>& non_local_values) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp
index fdc5dd2e52d..4f451e2f76b 100644
--- a/dpcpp/distributed/vector_kernels.dp.cpp
+++ b/dpcpp/distributed/vector_kernels.dp.cpp
@@ -22,7 +22,7 @@ void build_local(
     comm_index_type local_part,
     matrix::Dense<ValueType>* local_mtx) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
 
 
diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp
index dfef9becf19..5cd8396be17 100644
--- a/dpcpp/factorization/cholesky_kernels.dp.cpp
+++ b/dpcpp/factorization/cholesky_kernels.dp.cpp
@@ -89,7 +89,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -138,7 +138,7 @@ void symbolic_factorize(
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -148,7 +148,7 @@ void forest_from_factor(std::shared_ptr<const DefaultExecutor> exec,
                         gko::factorization::elimination_forest<IndexType>&
                             forest) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -161,8 +161,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 IndexType* transpose_idxs,
                 matrix::Csr<ValueType, IndexType>* factors) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -174,8 +173,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 75f7250f569..070bd8f86ee 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -484,7 +484,7 @@ void add_diagonal_elements(std::shared_ptr<const DpcppExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -512,7 +512,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -537,7 +537,7 @@ void initialize_l_u(std::shared_ptr<const DpcppExecutor> exec,
                            csr_u->get_col_idxs(), csr_u->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -563,7 +563,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -586,7 +586,7 @@ void initialize_l(std::shared_ptr<const DpcppExecutor> exec,
                          as_device_type(csr_l->get_values()), diag_sqrt);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/dpcpp/factorization/ic_kernels.dp.cpp b/dpcpp/factorization/ic_kernels.dp.cpp
index 4968e1da538..b2626e7876a 100644
--- a/dpcpp/factorization/ic_kernels.dp.cpp
+++ b/dpcpp/factorization/ic_kernels.dp.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/dpcpp/factorization/ilu_kernels.dp.cpp b/dpcpp/factorization/ilu_kernels.dp.cpp
index d9ea7776f3c..847547f7706 100644
--- a/dpcpp/factorization/ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/ilu_kernels.dp.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index e651944bb1f..bd26b1f79ca 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -32,8 +32,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
                 const int32* factor_lookup_storage, IndexType* diag_idxs,
                 matrix::Csr<ValueType, IndexType>* factors) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
 template <typename ValueType, typename IndexType>
@@ -43,8 +42,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
                matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
                array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp
index 7c60faabc38..b4c6ae03942 100644
--- a/dpcpp/factorization/par_ic_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ic_kernels.dp.cpp
@@ -132,7 +132,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
                     l_row_ptrs, l_vals, num_rows);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -155,7 +155,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp
index 8e505053aaf..284da9aca43 100644
--- a/dpcpp/factorization/par_ict_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ict_kernels.dp.cpp
@@ -486,7 +486,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
@@ -508,7 +508,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp
index b8df4266672..91d2049e361 100644
--- a/dpcpp/factorization/par_ilu_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp
@@ -129,7 +129,7 @@ void compute_l_u_factors(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
index 417a2d6f3be..f18566d475b 100644
--- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp
@@ -167,7 +167,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         &threshold, m_out, m_out_coo);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
index 9e6d8909227..8fdb729f405 100644
--- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp
@@ -123,7 +123,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         m_out_coo, lower);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_kernels.dp.cpp b/dpcpp/factorization/par_ilut_kernels.dp.cpp
index 9e9b951dd42..c491243f610 100644
--- a/dpcpp/factorization/par_ilut_kernels.dp.cpp
+++ b/dpcpp/factorization/par_ilut_kernels.dp.cpp
@@ -40,7 +40,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
                       array<remove_complex<ValueType>>&,
                       remove_complex<ValueType>& threshold) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -66,7 +66,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
                       matrix::Coo<ValueType, IndexType>* m_out_coo,
                       bool) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -82,7 +82,7 @@ void threshold_filter_approx(
     matrix::Csr<ValueType, IndexType>* m_out,
     matrix::Coo<ValueType, IndexType>* m_out_coo) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
                          matrix::Csr<ValueType, IndexType>* u_csc)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -110,7 +110,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* u_new)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_select_common.dp.cpp b/dpcpp/factorization/par_ilut_select_common.dp.cpp
index f20ae4e280b..1073751d2bb 100644
--- a/dpcpp/factorization/par_ilut_select_common.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_common.dp.cpp
@@ -67,7 +67,7 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
                             unsigned char* oracles, IndexType* partial_counts, \
                             IndexType* total_counts)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
 
 
 template <typename IndexType>
diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
index f545063ff26..c2d4b962a13 100644
--- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp
@@ -150,7 +150,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
index 9add72baff8..e2312e2c3e8 100644
--- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp
@@ -432,7 +432,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         u_new);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
index 7b18458532b..5decf02ff76 100644
--- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
+++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp
@@ -219,7 +219,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
         u_csc);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 736025075fd..b214f285a4f 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -75,7 +75,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -129,7 +129,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -175,7 +175,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -217,7 +217,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index bb8272a457c..62b81bbb2a8 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -78,7 +78,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -131,7 +131,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -175,8 +175,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -218,8 +217,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -260,7 +258,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index f598d273205..10393057ee7 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -75,7 +75,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -129,7 +129,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -172,7 +172,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -214,7 +214,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp
index 3882a714104..32cd1f3ed7f 100644
--- a/dpcpp/matrix/coo_kernels.dp.cpp
+++ b/dpcpp/matrix/coo_kernels.dp.cpp
@@ -261,8 +261,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -277,7 +276,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -321,8 +320,7 @@ void spmv2(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -369,7 +367,7 @@ void advanced_spmv2(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index d54df253a9b..f970d62679b 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1558,7 +1558,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -1631,7 +1631,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -1711,7 +1711,7 @@ void calculate_nonzeros_per_row_in_span(
                              row_nnz->get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -1723,7 +1723,7 @@ void calculate_nonzeros_per_row_in_index_set(
     const gko::index_set<IndexType>& col_index_set,
     IndexType* row_nnz) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -1751,7 +1751,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(result->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -1763,7 +1763,7 @@ void compute_submatrix_from_index_set(
     const gko::index_set<IndexType>& col_index_set,
     matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -2028,8 +2028,7 @@ void spgemm(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2165,7 +2164,7 @@ void advanced_spgemm(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -2254,8 +2253,7 @@ void spgeam(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2276,7 +2274,7 @@ void fill_in_dense(std::shared_ptr<const DpcppExecutor> exec,
                           as_device_type(result->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -2286,7 +2284,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       array<IndexType>& row_ptrs, array<IndexType>& col_idxs,
                       array<ValueType>& values) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -2349,8 +2347,7 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
     generic_transpose<false>(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2361,7 +2358,7 @@ void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
     generic_transpose<true>(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -2387,7 +2384,7 @@ void inv_symm_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -2414,7 +2411,7 @@ void inv_nonsymm_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -2441,7 +2438,7 @@ void row_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(row_permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -2468,7 +2465,7 @@ void inv_row_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(row_permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -2495,7 +2492,7 @@ void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -2525,7 +2522,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         permuted->get_col_idxs(), as_device_type(permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -2552,7 +2549,7 @@ void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(row_permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -2579,7 +2576,7 @@ void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(row_permuted->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -2637,7 +2634,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -2669,7 +2666,7 @@ void is_sorted_by_column_index(
     *is_sorted = get_element(is_sorted_device_array, 0);
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -2693,8 +2690,7 @@ void extract_diagonal(std::shared_ptr<const DpcppExecutor> exec,
                              orig_row_ptrs, orig_col_idxs, diag_values);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2718,7 +2714,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const DpcppExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -2742,7 +2738,7 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
         as_device_type(mtx->get_values()));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index a36d30cf8e7..f4654e7ca06 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -181,7 +181,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -196,7 +196,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -210,7 +210,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -241,8 +241,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -273,7 +272,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -308,7 +307,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -342,7 +341,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -381,7 +380,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -391,7 +390,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       matrix::Fbcsr<ValueType, IndexType>* result)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -401,7 +400,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
                                   int bs,
                                   IndexType* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -457,7 +456,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -500,7 +499,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -532,7 +531,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -554,8 +553,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
         queue, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -583,8 +581,7 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(trans->get_values()), trans->get_stride());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace dense
diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp
index 7bc42413330..b63489a50ba 100644
--- a/dpcpp/matrix/diagonal_kernels.dp.cpp
+++ b/dpcpp/matrix/diagonal_kernels.dp.cpp
@@ -84,7 +84,7 @@ void apply_to_csr(std::shared_ptr<const DpcppExecutor> exec,
                          inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index cd3cbdd190e..b9c1e6d3933 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -433,7 +433,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
         exec, num_worker_per_row, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -469,7 +469,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         exec, num_worker_per_row, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp
index 7d53b862d67..e9eb02f5fb2 100644
--- a/dpcpp/matrix/fbcsr_kernels.dp.cpp
+++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp
@@ -32,8 +32,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
           const matrix::Dense<ValueType>* b,
           matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -44,7 +43,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
                    const matrix::Dense<ValueType>* beta,
                    matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -55,7 +54,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
                          array<IndexType>& col_idxs,
                          array<ValueType>& values) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -64,7 +63,7 @@ void fill_in_dense(std::shared_ptr<const DpcppExecutor> exec,
                    const matrix::Fbcsr<ValueType, IndexType>* source,
                    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -74,7 +73,7 @@ void convert_to_csr(const std::shared_ptr<const DpcppExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* result)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -83,7 +82,7 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
                const matrix::Fbcsr<ValueType, IndexType>* orig,
                matrix::Fbcsr<ValueType, IndexType>* trans) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -93,7 +92,7 @@ void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
                     matrix::Fbcsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -103,7 +102,7 @@ void is_sorted_by_column_index(
     const matrix::Fbcsr<ValueType, IndexType>* to_check,
     bool* is_sorted) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -112,7 +111,7 @@ void sort_by_column_index(const std::shared_ptr<const DpcppExecutor> exec,
                           matrix::Fbcsr<ValueType, IndexType>* to_sort)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -121,7 +120,7 @@ void extract_diagonal(std::shared_ptr<const DpcppExecutor> exec,
                       const matrix::Fbcsr<ValueType, IndexType>* orig,
                       matrix::Diagonal<ValueType>* diag) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/dpcpp/matrix/fft_kernels.dp.cpp b/dpcpp/matrix/fft_kernels.dp.cpp
index 83c085e8d15..7ce02692d83 100644
--- a/dpcpp/matrix/fft_kernels.dp.cpp
+++ b/dpcpp/matrix/fft_kernels.dp.cpp
@@ -26,7 +26,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
          matrix::Dense<std::complex<ValueType>>* x, bool inverse,
          array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -36,7 +36,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
           size_type size2, bool inverse,
           array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -46,7 +46,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
           size_type size2, size_type size3, bool inverse,
           array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp
index c8b80e77ea9..5dee98d38a6 100644
--- a/dpcpp/matrix/sellp_kernels.dp.cpp
+++ b/dpcpp/matrix/sellp_kernels.dp.cpp
@@ -119,8 +119,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
                 b->get_const_values(), c->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -143,7 +142,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         beta->get_const_values(), c->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 10744ac3b59..d2f89964e75 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -243,7 +243,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -261,7 +261,7 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -271,7 +271,7 @@ void transpose(std::shared_ptr<const DpcppExecutor> exec,
                matrix::SparsityCsr<ValueType, IndexType>* trans)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -296,7 +296,7 @@ void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -330,7 +330,7 @@ void is_sorted_by_column_index(
     cpu_array = gpu_array;
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index e645ba3bc6e..a9148c54ff4 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -56,8 +56,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -90,7 +89,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
         [](auto a, auto b) { return a + b; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
index 3a63466ef5d..7721359716c 100644
--- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
@@ -104,7 +104,7 @@ void extract_common_blocks_pattern(
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -173,7 +173,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks, exec);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 47ff2938c6c..cbf9647b92c 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -648,7 +648,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -675,7 +675,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -706,7 +706,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -725,7 +725,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -749,7 +749,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
index 01a244f34af..1126421fde7 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
@@ -201,7 +201,7 @@ void advanced_apply(
         const preconditioner::block_interleaved_storage_scheme<IndexType>&, \
         const ValueType*, const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
index 537e2b9cfa5..6b28e8d866b 100644
--- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp
@@ -66,8 +66,7 @@ void apply(std::shared_ptr<const DpcppExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 }  // namespace jacobi
diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
index 4fe0d9c5031..7f2832f006b 100644
--- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp
@@ -392,7 +392,7 @@ void generate(syn::value_list<int, max_block_size>,
         remove_complex<ValueType>*, precision_reduction*, const IndexType*,  \
         size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_GENERATE_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
index 826509be1df..62ff7fdbb51 100644
--- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp
@@ -61,7 +61,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
         block_pointers.get_const_data(), num_blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
index 3fa743e2cc8..f7735238d7e 100644
--- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
@@ -389,7 +389,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
         exec, max_block_size, num_natural_blocks, block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -452,7 +452,7 @@ void transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -476,7 +476,7 @@ void conj_transpose_jacobi(
         storage_scheme, out_blocks.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -489,7 +489,7 @@ void convert_to_dense(
         storage_scheme,
     ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
index 5bf5f06cf29..e3a71690d6e 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
@@ -193,7 +193,7 @@ void apply(syn::value_list<int, max_block_size>,
         const preconditioner::block_interleaved_storage_scheme<IndexType>&,   \
         const ValueType*, size_type, ValueType*, size_type)
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION);
 
 
diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
index 610dbe1c8a6..b3012b4ff40 100644
--- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp
@@ -61,7 +61,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
index aed20ab8c8a..4af676288bd 100644
--- a/dpcpp/preconditioner/sor_kernels.dp.cpp
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -50,7 +50,7 @@ void initialize_weighted_l(
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -100,7 +100,7 @@ void initialize_weighted_l_u(
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 578446c1cc9..4cf3dacae48 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -192,7 +192,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index ae1018b9e80..fee88cf7c92 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -163,7 +163,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
index e5bd911390b..80886af39cc 100644
--- a/dpcpp/solver/cb_gmres_kernels.dp.cpp
+++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -947,7 +947,8 @@ void initialize(std::shared_ptr<const DpcppExecutor> exec,
         givens_cos->get_stride(), stop_status->get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
+    GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType, typename Accessor3d>
diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp
index b34b7123d45..e334e341529 100644
--- a/dpcpp/solver/idr_kernels.dp.cpp
+++ b/dpcpp/solver/idr_kernels.dp.cpp
@@ -789,8 +789,7 @@ void initialize(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     orthonormalize_subspace_vectors(exec, subspace_vectors);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -817,7 +816,7 @@ void step_1(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
                   v->get_stride(), stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -844,7 +843,7 @@ void step_2(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
                   stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -861,7 +860,7 @@ void step_3(std::shared_ptr<const DpcppExecutor> exec, const size_type nrhs,
     update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -879,8 +878,7 @@ void compute_omega(
         as_device_type(omega->get_values()), stop_status->get_const_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp
index f38b74ae240..f0257d12892 100644
--- a/dpcpp/solver/lower_trs_kernels.dp.cpp
+++ b/dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -42,7 +42,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
               bool unit_diag, const solver::trisolve_algorithm algorithm,
               const size_type num_rhs) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void solve(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType>* b,
            matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/dpcpp/solver/multigrid_kernels.dp.cpp b/dpcpp/solver/multigrid_kernels.dp.cpp
index cdbcb39d043..aaf0ab63354 100644
--- a/dpcpp/solver/multigrid_kernels.dp.cpp
+++ b/dpcpp/solver/multigrid_kernels.dp.cpp
@@ -31,8 +31,7 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* d,
                    matrix::Dense<ValueType>* e) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -45,8 +44,7 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* d,
                    matrix::Dense<ValueType>* e) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -56,7 +54,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
                        const ValueType rel_tol,
                        bool& is_stop) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp
index fe5381bc12b..a4878726b05 100644
--- a/dpcpp/solver/upper_trs_kernels.dp.cpp
+++ b/dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -42,7 +42,7 @@ void generate(std::shared_ptr<const DpcppExecutor> exec,
               bool unit_diag, const solver::trisolve_algorithm algorithm,
               const size_type num_rhs) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void solve(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType>* b,
            matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 8f055f693a9..129c26e08f4 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -71,7 +71,7 @@ void residual_norm(std::shared_ptr<const DpcppExecutor> exec,
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -128,8 +128,7 @@ void implicit_residual_norm(
     *one_changed = get_element(*device_storage, 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index 6b14aaf067d..4db4fbbcd32 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -163,7 +163,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -179,7 +179,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -195,7 +195,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
     handle.execute(b->get_const_values(), x->get_values(), inverse);
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/hip/matrix/fft_kernels_stub.hip.cpp b/hip/matrix/fft_kernels_stub.hip.cpp
index 210349e58e4..753642b555e 100644
--- a/hip/matrix/fft_kernels_stub.hip.cpp
+++ b/hip/matrix/fft_kernels_stub.hip.cpp
@@ -34,7 +34,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
          matrix::Dense<std::complex<ValueType>>* x, bool inverse,
          array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -44,7 +44,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
           size_type size2, bool inverse,
           array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -54,7 +54,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
           size_type size2, size_type size3, bool inverse,
           array<char>& buffer) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index 2424a035cf4..fdd57a95127 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -101,7 +101,7 @@ void extract_common_blocks_pattern(
         blocks_pattern);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -159,7 +159,7 @@ void compute_block_jacobi(
         cumulative_block_storage, block_pointers, blocks_pattern, blocks);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 2aede809427..b421da91338 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -176,7 +176,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index b6d3580585e..ef9137dcf5a 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -158,7 +158,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 6858f1eddc0..5eab76ed5fa 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -54,7 +54,7 @@ void generate(std::shared_ptr<const HipExecutor> exec,
                                           false, unit_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const HipExecutor> exec,
                                        trans_x, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index f1398faeea4..fb480d9b22d 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -54,7 +54,7 @@ void generate(std::shared_ptr<const HipExecutor> exec,
                                           true, unit_diag);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -70,7 +70,7 @@ void solve(std::shared_ptr<const HipExecutor> exec,
                                        trans_x, b, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 4f1166de223..7611bc54bb9 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -420,19 +420,19 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value type.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \
-    template _macro(float);                                     \
-    template <>                                                 \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro) \
+    template _macro(float);                                          \
+    template <>                                                      \
     _macro(double) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \
-    template _macro(float);                                     \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro) \
+    template _macro(float);                                          \
     template _macro(double)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \
-    GKO_ADAPT_HF(template _macro(half));                                  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \
+    GKO_ADAPT_HF(template _macro(half));                        \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro)
 
 
 /**
@@ -444,22 +444,22 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value type.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \
-    template _macro(std::complex<float>);                    \
-    template <>                                              \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro); \
+    template _macro(std::complex<float>);                         \
+    template <>                                                   \
     _macro(std::complex<double>) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \
-    template _macro(std::complex<float>);                    \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(_macro); \
+    template _macro(std::complex<float>);                         \
     template _macro(std::complex<double>)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro) \
-    GKO_ADAPT_HF(template _macro(half));                      \
-    GKO_ADAPT_HF(template _macro(std::complex<half>));        \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro)    \
+    GKO_ADAPT_HF(template _macro(half));               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>)); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(_macro)
 
 
 // Helper macro to make Windows builds work
@@ -479,21 +479,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  * @note This won't be necessary after upgrading to C++20
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_BASE(_macro, \
+                                                                   ...)    \
     template GKO_INDIRECT(_macro(float, __VA_ARGS__));                     \
     template <>                                                            \
     GKO_INDIRECT(_macro(double, __VA_ARGS__))                              \
     GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_BASE(_macro, \
+                                                                   ...)    \
     template GKO_INDIRECT(_macro(float, __VA_ARGS__));                     \
     template GKO_INDIRECT(_macro(double, __VA_ARGS__))
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_WITH_HALF( \
-    _macro, ...)                                                         \
-    GKO_INDIRECT(GKO_ADAPT_HF(template _macro(half, __VA_ARGS__)));      \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, __VA_ARGS__)
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \
+    GKO_INDIRECT(GKO_ADAPT_HF(template _macro(half, __VA_ARGS__)));        \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_BASE(_macro,     \
+                                                               __VA_ARGS__)
 
 
 /**
@@ -507,26 +509,26 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  * @note This won't be necessary after upgrading to C++20
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
-                                                          __VA_ARGS__); \
-    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));    \
-    template <>                                                         \
-    GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))             \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_BASE(_macro, ...)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_BASE(_macro,       \
+                                                               __VA_ARGS__); \
+    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));         \
+    template <>                                                              \
+    GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))                  \
     GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro,       \
-                                                          __VA_ARGS__); \
-    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));    \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_BASE(_macro, ...)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS_BASE(_macro,       \
+                                                               __VA_ARGS__); \
+    template GKO_INDIRECT(_macro(std::complex<float>, __VA_ARGS__));         \
     template GKO_INDIRECT(_macro(std::complex<double>, __VA_ARGS__))
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF(_macro, ...) \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...)           \
     GKO_INDIRECT(GKO_ADAPT_HF(template _macro(half, __VA_ARGS__)));      \
     GKO_INDIRECT(                                                        \
         GKO_ADAPT_HF(template _macro(std::complex<half>, __VA_ARGS__))); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, __VA_ARGS__)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_BASE(_macro, __VA_ARGS__)
 
 
 /**
@@ -540,7 +542,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value and scalar type, respectively.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)              \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_BASE(_macro)         \
     template _macro(float, float);                                          \
     template <>                                                             \
     _macro(double, double) GKO_NOT_IMPLEMENTED;                             \
@@ -551,17 +553,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template <>                                                             \
     _macro(std::complex<double>, double) GKO_NOT_IMPLEMENTED;
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)   \
-    template _macro(float, float);                               \
-    template _macro(double, double);                             \
-    template _macro(std::complex<float>, std::complex<float>);   \
-    template _macro(std::complex<double>, std::complex<double>); \
-    template _macro(std::complex<float>, float);                 \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_BASE(_macro) \
+    template _macro(float, float);                                  \
+    template _macro(double, double);                                \
+    template _macro(std::complex<float>, std::complex<float>);      \
+    template _macro(std::complex<double>, std::complex<double>);    \
+    template _macro(std::complex<float>, float);                    \
     template _macro(std::complex<double>, double)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro)   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro);                \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro)             \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_BASE(_macro);           \
     GKO_ADAPT_HF(template _macro(half, half));                             \
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
     GKO_ADAPT_HF(template _macro(std::complex<half>, half))
@@ -590,46 +592,45 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value and index types.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(float, int32);                                        \
-    template <>                                                           \
-    _macro(double, int32) GKO_NOT_IMPLEMENTED;                            \
-    template _macro(float, int64);                                        \
-    template <>                                                           \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro) \
+    template _macro(float, int32);                                             \
+    template <>                                                                \
+    _macro(double, int32) GKO_NOT_IMPLEMENTED;                                 \
+    template _macro(float, int64);                                             \
+    template <>                                                                \
     _macro(double, int64) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(float, int32);                                        \
-    template _macro(double, int32);                                       \
-    template _macro(float, int64);                                        \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro) \
+    template _macro(float, int32);                                             \
+    template _macro(double, int32);                                            \
+    template _macro(float, int64);                                             \
     template _macro(double, int64)
 #endif
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( \
-    _macro)                                                                  \
-    GKO_ADAPT_HF(template _macro(half, int32));                              \
-    GKO_ADAPT_HF(template _macro(half, int64));                              \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \
+    GKO_ADAPT_HF(template _macro(half, int32));                           \
+    GKO_ADAPT_HF(template _macro(half, int64));                           \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro)
 
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
-    template _macro(float, int32);                            \
-    template <>                                               \
-    _macro(double, int32) GKO_NOT_IMPLEMENTED;                \
-    template _macro(std::complex<float>, int32);              \
-    template <>                                               \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_BASE(_macro) \
+    template _macro(float, int32);                                 \
+    template <>                                                    \
+    _macro(double, int32) GKO_NOT_IMPLEMENTED;                     \
+    template _macro(std::complex<float>, int32);                   \
+    template <>                                                    \
     _macro(std::complex<double>, int32) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
-    template _macro(float, int32);                            \
-    template _macro(double, int32);                           \
-    template _macro(std::complex<float>, int32);              \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_BASE(_macro) \
+    template _macro(float, int32);                                 \
+    template _macro(double, int32);                                \
+    template _macro(std::complex<float>, int32);                   \
     template _macro(std::complex<double>, int32)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro) \
-    GKO_ADAPT_HF(template _macro(half, int32));                         \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, int32));           \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
+    GKO_ADAPT_HF(template _macro(half, int32));               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32)); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_BASE(_macro)
 
 
 /**
@@ -641,29 +642,29 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value and index types.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \
-    template _macro(std::complex<float>, int32);                       \
-    template <>                                                        \
-    _macro(std::complex<double>, int32) GKO_NOT_IMPLEMENTED;           \
-    template _macro(std::complex<float>, int64);                       \
-    template <>                                                        \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_BASE(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro); \
+    template _macro(std::complex<float>, int32);                            \
+    template <>                                                             \
+    _macro(std::complex<double>, int32) GKO_NOT_IMPLEMENTED;                \
+    template _macro(std::complex<float>, int64);                            \
+    template <>                                                             \
     _macro(std::complex<double>, int64) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)          \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \
-    template _macro(std::complex<float>, int32);                       \
-    template _macro(std::complex<double>, int32);                      \
-    template _macro(std::complex<float>, int64);                       \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_BASE(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(_macro); \
+    template _macro(std::complex<float>, int32);                            \
+    template _macro(std::complex<double>, int32);                           \
+    template _macro(std::complex<float>, int64);                            \
     template _macro(std::complex<double>, int64)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
-    GKO_ADAPT_HF(template _macro(half, int32));                         \
-    GKO_ADAPT_HF(template _macro(half, int64));                         \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, int32));           \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, int64));           \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \
+    GKO_ADAPT_HF(template _macro(half, int32));               \
+    GKO_ADAPT_HF(template _macro(half, int64));               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32)); \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int64)); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_BASE(_macro)
 
 
 /**
@@ -676,34 +677,34 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value, the local and the global index types.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \
-    _macro)                                                                     \
-    template _macro(float, int32, int32);                                       \
-    template _macro(float, int32, int64);                                       \
-    template _macro(float, int64, int64);                                       \
-    template <>                                                                 \
-    _macro(double, int32, int32) GKO_NOT_IMPLEMENTED;                           \
-    template <>                                                                 \
-    _macro(double, int32, int64) GKO_NOT_IMPLEMENTED;                           \
-    template <>                                                                 \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( \
+    _macro)                                                                          \
+    template _macro(float, int32, int32);                                            \
+    template _macro(float, int32, int64);                                            \
+    template _macro(float, int64, int64);                                            \
+    template <>                                                                      \
+    _macro(double, int32, int32) GKO_NOT_IMPLEMENTED;                                \
+    template <>                                                                      \
+    _macro(double, int32, int64) GKO_NOT_IMPLEMENTED;                                \
+    template <>                                                                      \
     _macro(double, int64, int64) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \
-    _macro)                                                                     \
-    template _macro(float, int32, int32);                                       \
-    template _macro(float, int32, int64);                                       \
-    template _macro(float, int64, int64);                                       \
-    template _macro(double, int32, int32);                                      \
-    template _macro(double, int32, int64);                                      \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( \
+    _macro)                                                                          \
+    template _macro(float, int32, int32);                                            \
+    template _macro(float, int32, int64);                                            \
+    template _macro(float, int64, int64);                                            \
+    template _macro(double, int32, int32);                                           \
+    template _macro(double, int32, int64);                                           \
     template _macro(double, int64, int64)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \
-    _macro)                                                                               \
-    GKO_ADAPT_HF(template _macro(half, int32, int32));                                    \
-    GKO_ADAPT_HF(template _macro(half, int32, int64));                                    \
-    GKO_ADAPT_HF(template _macro(half, int64, int64));                                    \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(               \
+#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(  \
+    _macro)                                                                      \
+    GKO_ADAPT_HF(template _macro(half, int32, int32));                           \
+    GKO_ADAPT_HF(template _macro(half, int32, int64));                           \
+    GKO_ADAPT_HF(template _macro(half, int64, int64));                           \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( \
         _macro)
 
 
@@ -716,43 +717,44 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                value and index types.
  */
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro)  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \
-        _macro);                                                            \
-    template _macro(std::complex<float>, int32, int32);                     \
-    template _macro(std::complex<float>, int32, int64);                     \
-    template _macro(std::complex<float>, int64, int64);                     \
-    template <>                                                             \
-    _macro(std::complex<double>, int32, int32) GKO_NOT_IMPLEMENTED;         \
-    template <>                                                             \
-    _macro(std::complex<double>, int32, int64) GKO_NOT_IMPLEMENTED;         \
-    template <>                                                             \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(         \
+    _macro)                                                                      \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( \
+        _macro);                                                                 \
+    template _macro(std::complex<float>, int32, int32);                          \
+    template _macro(std::complex<float>, int32, int64);                          \
+    template _macro(std::complex<float>, int64, int64);                          \
+    template <>                                                                  \
+    _macro(std::complex<double>, int32, int32) GKO_NOT_IMPLEMENTED;              \
+    template <>                                                                  \
+    _macro(std::complex<double>, int32, int64) GKO_NOT_IMPLEMENTED;              \
+    template <>                                                                  \
     _macro(std::complex<double>, int64, int64) GKO_NOT_IMPLEMENTED
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro)  \
-    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \
-        _macro);                                                            \
-    template _macro(std::complex<float>, int32, int32);                     \
-    template _macro(std::complex<float>, int32, int64);                     \
-    template _macro(std::complex<float>, int64, int64);                     \
-    template _macro(std::complex<double>, int32, int32);                    \
-    template _macro(std::complex<double>, int32, int64);                    \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(         \
+    _macro)                                                                      \
+    GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( \
+        _macro);                                                                 \
+    template _macro(std::complex<float>, int32, int32);                          \
+    template _macro(std::complex<float>, int32, int64);                          \
+    template _macro(std::complex<float>, int64, int64);                          \
+    template _macro(std::complex<double>, int32, int32);                         \
+    template _macro(std::complex<double>, int32, int64);                         \
     template _macro(std::complex<double>, int64, int64)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \
-    _macro)                                                                   \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro);       \
-    GKO_ADAPT_HF(template _macro(half, int32, int32));                        \
-    GKO_ADAPT_HF(template _macro(half, int32, int64));                        \
-    GKO_ADAPT_HF(template _macro(half, int64, int64));                        \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int32));          \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int64));          \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro)   \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(_macro); \
+    GKO_ADAPT_HF(template _macro(half, int32, int32));                       \
+    GKO_ADAPT_HF(template _macro(half, int32, int64));                       \
+    GKO_ADAPT_HF(template _macro(half, int64, int64));                       \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int32));         \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, int32, int64));         \
     GKO_ADAPT_HF(template _macro(std::complex<half>, int64, int64))
 
 
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)                  \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro)             \
     template <>                                                            \
     _macro(float, double) GKO_NOT_IMPLEMENTED;                             \
     template <>                                                            \
@@ -763,13 +765,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     _macro(std::complex<double>, std::complex<float>) GKO_NOT_IMPLEMENTED
 
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro);            \
-    template _macro(float, float);                                \
-    template <>                                                   \
-    _macro(double, double) GKO_NOT_IMPLEMENTED;                   \
-    template _macro(std::complex<float>, std::complex<float>);    \
-    template <>                                                   \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_BASE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro);            \
+    template _macro(float, float);                                     \
+    template <>                                                        \
+    _macro(double, double) GKO_NOT_IMPLEMENTED;                        \
+    template _macro(std::complex<float>, std::complex<float>);         \
+    template <>                                                        \
     _macro(std::complex<double>, std::complex<double>) GKO_NOT_IMPLEMENTED
 #else
 /**
@@ -781,7 +783,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take two arguments `src` and `dst`, which
  *                are replaced by the source and destination value type.
  */
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)       \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro)  \
     template _macro(float, double);                             \
     template _macro(double, float);                             \
     template _macro(std::complex<float>, std::complex<double>); \
@@ -797,16 +799,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take two arguments `src` and `dst`, which
  *                are replaced by the source and destination value type.
  */
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro);            \
-    template _macro(float, float);                                \
-    template _macro(double, double);                              \
-    template _macro(std::complex<float>, std::complex<float>);    \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_BASE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro);            \
+    template _macro(float, float);                                     \
+    template _macro(double, double);                                   \
+    template _macro(std::complex<float>, std::complex<float>);         \
     template _macro(std::complex<double>, std::complex<double>)
 #endif
 
 #if GINKGO_DPCPP_SINGLE_MODE
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)           \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)                     \
     GKO_ADAPT_HF(template <> _macro(half, double) GKO_NOT_IMPLEMENTED);       \
     GKO_ADAPT_HF(template <> _macro(double, half) GKO_NOT_IMPLEMENTED);       \
     GKO_ADAPT_HF(template _macro(float, half));                               \
@@ -817,9 +819,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>));   \
     GKO_ADAPT_HF(template <> _macro(std::complex<double>, std::complex<half>) \
                      GKO_NOT_IMPLEMENTED);                                    \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro)
 #else
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro)          \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)                    \
     GKO_ADAPT_HF(template _macro(half, double));                             \
     GKO_ADAPT_HF(template _macro(double, half));                             \
     GKO_ADAPT_HF(template _macro(float, half));                              \
@@ -828,16 +830,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<double>)); \
     GKO_ADAPT_HF(template _macro(std::complex<float>, std::complex<half>));  \
     GKO_ADAPT_HF(template _macro(std::complex<double>, std::complex<half>)); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_BASE(_macro)
 #endif
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro);            \
-    GKO_ADAPT_HF(template _macro(half, half));                              \
-    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>));  \
-    template _macro(float, float);                                          \
-    template _macro(double, double);                                        \
-    template _macro(std::complex<float>, std::complex<float>);              \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro)          \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro);                     \
+    GKO_ADAPT_HF(template _macro(half, half));                             \
+    GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
+    template _macro(float, float);                                         \
+    template _macro(double, double);                                       \
+    template _macro(std::complex<float>, std::complex<float>);             \
     template _macro(std::complex<double>, std::complex<double>)
 
 /**
@@ -848,7 +850,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take two arguments, which are replaced by the
  *                value and index types.
  */
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro)       \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR_BASE(_macro)  \
     template _macro(float, float);                             \
     template _macro(double, double);                           \
     template _macro(std::complex<float>, float);               \
@@ -856,11 +858,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(std::complex<float>, std::complex<float>); \
     template _macro(std::complex<double>, std::complex<double>)
 
-#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR_WITH_HALF(_macro)         \
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro)                   \
     GKO_ADAPT_HF(template _macro(half, half));                             \
     GKO_ADAPT_HF(template _macro(std::complex<half>, half));               \
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro)
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR_BASE(_macro)
 
 /**
  * Instantiates a template for each combined value and index type compiled by
@@ -871,23 +873,22 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take two arguments, which are replaced by the
  *                value and index types.
  */
-#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(char, char);                                       \
-    template _macro(int32, int32);                                     \
-    template _macro(int64, int64);                                     \
-    template _macro(unsigned int, unsigned int);                       \
-    template _macro(unsigned long, unsigned long);                     \
-    template _macro(float, float);                                     \
-    template _macro(double, double);                                   \
-    template _macro(long double, long double);                         \
-    template _macro(std::complex<float>, std::complex<float>);         \
+#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE_BASE(_macro) \
+    template _macro(char, char);                                            \
+    template _macro(int32, int32);                                          \
+    template _macro(int64, int64);                                          \
+    template _macro(unsigned int, unsigned int);                            \
+    template _macro(unsigned long, unsigned long);                          \
+    template _macro(float, float);                                          \
+    template _macro(double, double);                                        \
+    template _macro(long double, long double);                              \
+    template _macro(std::complex<float>, std::complex<float>);              \
     template _macro(std::complex<double>, std::complex<double>)
 
-#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE_WITH_HALF(  \
-    _macro)                                                                \
+#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro)     \
     GKO_ADAPT_HF(template _macro(half, half));                             \
     GKO_ADAPT_HF(template _macro(std::complex<half>, std::complex<half>)); \
-    GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro)
+    GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE_BASE(_macro)
 
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
@@ -897,20 +898,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take two arguments, which are replaced by the
  *                value and index types.
  */
-#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \
-    template _macro(float);                       \
-    template _macro(double);                      \
-    template _macro(std::complex<float>);         \
-    template _macro(std::complex<double>);        \
-    template _macro(size_type);                   \
-    template _macro(bool);                        \
-    template _macro(int32);                       \
+#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE_BASE(_macro) \
+    template _macro(float);                            \
+    template _macro(double);                           \
+    template _macro(std::complex<float>);              \
+    template _macro(std::complex<double>);             \
+    template _macro(size_type);                        \
+    template _macro(bool);                             \
+    template _macro(int32);                            \
     template _macro(int64)
 
-#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(_macro) \
-    GKO_ADAPT_HF(template _macro(half));                    \
-    GKO_ADAPT_HF(template _macro(std::complex<half>));      \
-    GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro)
+#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro)      \
+    GKO_ADAPT_HF(template _macro(half));               \
+    GKO_ADAPT_HF(template _macro(std::complex<half>)); \
+    GKO_INSTANTIATE_FOR_EACH_POD_TYPE_BASE(_macro)
 
 /**
  * Instantiates a template for each normal type
@@ -920,16 +921,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
  *                Should take one argument, which is replaced by the
  *                value type.
  */
+#define GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_BASE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(_macro);       \
+    GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro);            \
+    template _macro(gko::size_type)
+
 #define GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(_macro) \
     GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro);       \
     GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro);       \
     template _macro(gko::size_type)
 
-#define GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro) \
-    GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro);       \
-    GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro);                 \
-    template _macro(gko::size_type)
-
 
 /**
  * Instantiates a template for int32 type.
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index bbae1b0b85d..5b57921ab8f 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -37,7 +37,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -59,7 +59,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -81,7 +81,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -103,7 +103,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -122,7 +122,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -141,8 +141,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/omp/base/device_matrix_data_kernels.cpp b/omp/base/device_matrix_data_kernels.cpp
index cb2dabd3010..bce89e2f409 100644
--- a/omp/base/device_matrix_data_kernels.cpp
+++ b/omp/base/device_matrix_data_kernels.cpp
@@ -69,7 +69,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
     aos_to_soa(exec, tmp, data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/omp/distributed/assembly_kernels.cpp b/omp/distributed/assembly_kernels.cpp
index 9fa9976e607..44c9c908c52 100644
--- a/omp/distributed/assembly_kernels.cpp
+++ b/omp/distributed/assembly_kernels.cpp
@@ -73,7 +73,7 @@ void count_non_owning_entries(
                                        num_input_elements);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 2f36ec4a778..d60b31ac6a8 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -149,7 +149,7 @@ void separate_local_nonlocal(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/omp/distributed/vector_kernels.cpp b/omp/distributed/vector_kernels.cpp
index 1ae60ed108e..007509f50fd 100644
--- a/omp/distributed/vector_kernels.cpp
+++ b/omp/distributed/vector_kernels.cpp
@@ -42,7 +42,7 @@ void build_local(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
 
 
diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp
index 0eb30441405..aa4aabfc731 100644
--- a/omp/factorization/cholesky_kernels.cpp
+++ b/omp/factorization/cholesky_kernels.cpp
@@ -78,7 +78,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -126,7 +126,7 @@ void symbolic_factorize(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -169,7 +169,7 @@ void forest_from_factor(
                                      num_rows, num_rows + 1, child_ptrs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -201,8 +201,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 namespace {
@@ -283,8 +282,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
index 47cd38d89c3..e7b66f6f887 100644
--- a/omp/factorization/factorization_kernels.cpp
+++ b/omp/factorization/factorization_kernels.cpp
@@ -180,7 +180,7 @@ void add_diagonal_elements(std::shared_ptr<const OmpExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -215,7 +215,7 @@ void initialize_row_ptrs_l_u(
     components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -233,7 +233,7 @@ void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
                                         helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -264,7 +264,7 @@ void initialize_row_ptrs_l(
     components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -287,7 +287,7 @@ void initialize_l(std::shared_ptr<const OmpExecutor> exec,
                               helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/omp/factorization/ic_kernels.cpp b/omp/factorization/ic_kernels.cpp
index 313bf8c7982..c071ba2ca87 100644
--- a/omp/factorization/ic_kernels.cpp
+++ b/omp/factorization/ic_kernels.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp
index db3fd5ef7a8..b88e6a77900 100644
--- a/omp/factorization/ilu_kernels.cpp
+++ b/omp/factorization/ilu_kernels.cpp
@@ -20,7 +20,7 @@ template <typename ValueType, typename IndexType>
 void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
                    matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index 5f766a7208a..8bcfd155715 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -59,8 +59,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
 namespace {
@@ -127,8 +126,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp
index 9488c448519..93093783acc 100644
--- a/omp/factorization/par_ic_kernels.cpp
+++ b/omp/factorization/par_ic_kernels.cpp
@@ -42,7 +42,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp
index a67ad860965..b5546e1a644 100644
--- a/omp/factorization/par_ict_kernels.cpp
+++ b/omp/factorization/par_ict_kernels.cpp
@@ -91,7 +91,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
@@ -166,7 +166,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index 0504bca8b1d..da42a631b81 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -88,7 +88,7 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
index af9229f3509..8b251e88bf4 100644
--- a/omp/factorization/par_ilut_kernels.cpp
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -54,7 +54,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = abs(*target);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -144,7 +144,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -233,7 +233,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -317,7 +317,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -433,7 +433,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp
index b55253e9d4e..d4ea6cbd642 100644
--- a/omp/matrix/batch_csr_kernels.cpp
+++ b/omp/matrix/batch_csr_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,7 +98,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -122,7 +122,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index ea7da295bb4..cd4a7f05b4a 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,8 +98,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -122,8 +121,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -146,7 +144,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 74b8d94cfc8..8b1239565a1 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -41,7 +41,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -71,7 +71,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -98,7 +98,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -122,7 +122,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 6d4a46b7ed3..021795d8e9c 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -42,8 +42,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -58,7 +57,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -307,8 +306,7 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
     generic_spmv2(exec, a, b, c, one<ValueType>());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -321,7 +319,7 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
     generic_spmv2(exec, a, b, c, alpha->at(0, 0));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index d9c7b9840c1..1adcc0df186 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -77,7 +77,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -118,7 +118,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -374,8 +374,7 @@ void spgemm(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -491,7 +490,7 @@ void advanced_spgemm(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -541,8 +540,7 @@ void spgeam(std::shared_ptr<const OmpExecutor> exec,
         [](IndexType, IndexType) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -565,7 +563,7 @@ void fill_in_dense(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -635,7 +633,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -694,8 +692,7 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -707,7 +704,7 @@ void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -731,7 +728,7 @@ void calculate_nonzeros_per_row_in_span(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -778,7 +775,7 @@ void calculate_nonzeros_per_row_in_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -811,7 +808,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -871,7 +868,7 @@ void compute_submatrix_from_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -884,7 +881,7 @@ void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
     inv_nonsymm_permute(exec, perm, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -924,7 +921,7 @@ void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -962,7 +959,7 @@ void row_permute(std::shared_ptr<const OmpExecutor> exec, const IndexType* perm,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -1001,7 +998,7 @@ void inv_row_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -1014,7 +1011,7 @@ void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
     inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1058,7 +1055,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1099,7 +1096,7 @@ void row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1140,7 +1137,7 @@ void inv_row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1163,7 +1160,7 @@ void sort_by_column_index(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -1191,7 +1188,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -1217,8 +1214,7 @@ void extract_diagonal(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1245,7 +1241,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const OmpExecutor> exec,
     has_all_diags = l_has_all_diags;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -1274,7 +1270,7 @@ void add_scaled_identity(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 4ca5aa0c075..d1c0f2f8949 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -46,7 +46,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -60,7 +60,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -73,7 +73,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -100,8 +100,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -137,7 +136,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -169,7 +168,7 @@ void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -200,7 +199,7 @@ void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -233,7 +232,7 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -281,7 +280,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -327,7 +326,7 @@ void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -369,7 +368,7 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -399,7 +398,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -416,8 +415,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -433,8 +431,7 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -464,7 +461,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
diff --git a/omp/matrix/diagonal_kernels.cpp b/omp/matrix/diagonal_kernels.cpp
index c16e740dc45..71363c7bc6e 100644
--- a/omp/matrix/diagonal_kernels.cpp
+++ b/omp/matrix/diagonal_kernels.cpp
@@ -43,7 +43,7 @@ void apply_to_csr(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index dc200ae0f93..c35a3654b86 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -185,7 +185,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -228,7 +228,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index 14dcb1db77a..d17d47a7467 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -74,8 +74,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -119,7 +118,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -177,7 +176,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -210,7 +209,7 @@ void fill_in_dense(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -256,7 +255,7 @@ void convert_to_csr(const std::shared_ptr<const OmpExecutor> exec,
     row_ptrs[result->get_size()[0]] = source->get_num_stored_elements();
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -331,7 +330,7 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -344,7 +343,7 @@ void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -372,7 +371,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -427,7 +426,7 @@ void sort_by_column_index(const std::shared_ptr<const OmpExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), to_sort);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -464,7 +463,7 @@ void extract_diagonal(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp
index 0301b9093ff..c4858b32334 100644
--- a/omp/matrix/fft_kernels.cpp
+++ b/omp/matrix/fft_kernels.cpp
@@ -119,7 +119,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -190,7 +190,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -295,7 +295,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp
index 6306093b36d..7f8b16264ce 100644
--- a/omp/matrix/sellp_kernels.cpp
+++ b/omp/matrix/sellp_kernels.cpp
@@ -155,8 +155,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -195,7 +194,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     spmv_blocked<4>(exec, a, b, c, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 560ee6d4890..35bb42c70a6 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -58,7 +58,7 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -95,7 +95,7 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -149,7 +149,7 @@ void transpose(std::shared_ptr<const OmpExecutor> exec,
     transpose_and_transform(exec, trans, orig);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -168,7 +168,7 @@ void sort_by_column_index(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -197,7 +197,7 @@ void is_sorted_by_column_index(
     *is_sorted = local_is_sorted;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp
index bfe95291f2e..4c824a0140b 100644
--- a/omp/multigrid/pgm_kernels.cpp
+++ b/omp/multigrid/pgm_kernels.cpp
@@ -47,8 +47,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -84,7 +83,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
     coarse_val[coarse_idxs] = temp_val;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp
index 99036fd628f..58fb2602075 100644
--- a/omp/preconditioner/batch_jacobi_kernels.cpp
+++ b/omp/preconditioner/batch_jacobi_kernels.cpp
@@ -74,7 +74,7 @@ void extract_common_blocks_pattern(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -102,7 +102,7 @@ void compute_block_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp
index 61a2193a2b3..6f2fe4838d9 100644
--- a/omp/preconditioner/isai_kernels.cpp
+++ b/omp/preconditioner/isai_kernels.cpp
@@ -230,7 +230,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      trs_solve, true);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -324,7 +324,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      general_solve, false);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -388,7 +388,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
     e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -415,7 +415,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -441,7 +441,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index ee51f7adb40..76224f97a2f 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -132,7 +132,7 @@ void find_blocks(std::shared_ptr<const OmpExecutor> exec,
                                             block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -436,7 +436,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
@@ -514,8 +514,7 @@ void apply(std::shared_ptr<const OmpExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -549,7 +548,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
@@ -586,7 +585,7 @@ void transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -623,7 +622,7 @@ void conj_transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -662,7 +661,7 @@ void convert_to_dense(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp
index 670277b6ebd..509946ac15a 100644
--- a/omp/preconditioner/sor_kernels.cpp
+++ b/omp/preconditioner/sor_kernels.cpp
@@ -29,7 +29,7 @@ void initialize_weighted_l(
             [](auto val) { return val; }));
 };
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -57,7 +57,7 @@ void initialize_weighted_l_u(
             }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index f8a4dbb8172..11f412f1835 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -91,7 +91,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 26a7046a176..d8c05c39f74 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -97,7 +97,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp
index a53294b9fbe..aeffc3202d6 100644
--- a/omp/solver/cb_gmres_kernels.cpp
+++ b/omp/solver/cb_gmres_kernels.cpp
@@ -330,7 +330,8 @@ void initialize(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
+    GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType, typename Accessor3d>
diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp
index eb0eb1074e5..9218f48c859 100644
--- a/omp/solver/idr_kernels.cpp
+++ b/omp/solver/idr_kernels.cpp
@@ -184,8 +184,7 @@ void initialize(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -219,7 +218,7 @@ void step_1(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -245,7 +244,7 @@ void step_2(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -291,7 +290,7 @@ void step_3(std::shared_ptr<const OmpExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -322,8 +321,7 @@ void compute_omega(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp
index c873e5e8958..6dac6b46078 100644
--- a/omp/solver/lower_trs_kernels.cpp
+++ b/omp/solver/lower_trs_kernels.cpp
@@ -47,7 +47,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -88,7 +88,7 @@ void solve(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp
index 509ecf51828..12e5bad8577 100644
--- a/omp/solver/multigrid_kernels.cpp
+++ b/omp/solver/multigrid_kernels.cpp
@@ -44,8 +44,7 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -75,8 +74,7 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -94,7 +92,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp
index 5014f823d35..ea05cabeb63 100644
--- a/omp/solver/upper_trs_kernels.cpp
+++ b/omp/solver/upper_trs_kernels.cpp
@@ -47,7 +47,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -90,7 +90,7 @@ void solve(std::shared_ptr<const OmpExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp
index ff259477d03..0ec4395a16b 100644
--- a/omp/stop/residual_norm_kernels.cpp
+++ b/omp/stop/residual_norm_kernels.cpp
@@ -53,7 +53,7 @@ void residual_norm(std::shared_ptr<const OmpExecutor> exec,
     *all_converged = local_all_converged;
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -98,8 +98,7 @@ void implicit_residual_norm(
     *all_converged = local_all_converged;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index 4f48a0b6f94..d7fbf3ce214 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -35,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
 
 
@@ -56,7 +56,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
 
 
@@ -77,7 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
 
 
@@ -98,7 +98,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
 
 
@@ -116,7 +116,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
 
 
@@ -134,8 +134,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
diff --git a/reference/base/device_matrix_data_kernels.cpp b/reference/base/device_matrix_data_kernels.cpp
index 78a2e25a712..f9a23b35e69 100644
--- a/reference/base/device_matrix_data_kernels.cpp
+++ b/reference/base/device_matrix_data_kernels.cpp
@@ -29,7 +29,7 @@ void soa_to_aos(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL);
 
 
@@ -46,7 +46,7 @@ void aos_to_soa(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL);
 
 
@@ -78,7 +78,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);
 
 
@@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);
 
 
@@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
     aos_to_soa(exec, tmp, data);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
 
 
diff --git a/reference/components/absolute_array_kernels.cpp b/reference/components/absolute_array_kernels.cpp
index 759caae894c..964e1f80d6a 100644
--- a/reference/components/absolute_array_kernels.cpp
+++ b/reference/components/absolute_array_kernels.cpp
@@ -20,8 +20,7 @@ void inplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 template <typename ValueType>
@@ -34,8 +33,7 @@ void outplace_absolute_array(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp
index 663ad8f5b6b..1649aa87982 100644
--- a/reference/components/fill_array_kernels.cpp
+++ b/reference/components/fill_array_kernels.cpp
@@ -20,7 +20,7 @@ void fill_array(std::shared_ptr<const DefaultExecutor> exec, ValueType* array,
     std::fill_n(array, n, val);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL);
 template GKO_DECLARE_FILL_ARRAY_KERNEL(bool);
 
 
@@ -31,8 +31,7 @@ void fill_seq_array(std::shared_ptr<const DefaultExecutor> exec,
     std::iota(array, array + n, 0);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
-    GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/precision_conversion_kernels.cpp b/reference/components/precision_conversion_kernels.cpp
index 5ec37a1cd72..db12d9316ee 100644
--- a/reference/components/precision_conversion_kernels.cpp
+++ b/reference/components/precision_conversion_kernels.cpp
@@ -20,8 +20,7 @@ void convert_precision(std::shared_ptr<const DefaultExecutor> exec,
     std::copy_n(in, size, out);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(
-    GKO_DECLARE_CONVERT_PRECISION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/components/reduce_array_kernels.cpp b/reference/components/reduce_array_kernels.cpp
index 3c3c6f620ec..a70ef95a878 100644
--- a/reference/components/reduce_array_kernels.cpp
+++ b/reference/components/reduce_array_kernels.cpp
@@ -22,8 +22,7 @@ void reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
                                         val.get_const_data()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(
-    GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL);
 
 
 }  // namespace components
diff --git a/reference/distributed/assembly_kernels.cpp b/reference/distributed/assembly_kernels.cpp
index e38680243a0..36c44ca4022 100644
--- a/reference/distributed/assembly_kernels.cpp
+++ b/reference/distributed/assembly_kernels.cpp
@@ -67,7 +67,7 @@ void count_non_owning_entries(
                                        num_input_elements);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
@@ -97,7 +97,7 @@ void fill_send_buffers(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 95176b34656..ab0e07070ff 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -86,7 +86,7 @@ void separate_local_nonlocal(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/reference/distributed/vector_kernels.cpp b/reference/distributed/vector_kernels.cpp
index 76a8be06a0f..1425f1dc9ab 100644
--- a/reference/distributed/vector_kernels.cpp
+++ b/reference/distributed/vector_kernels.cpp
@@ -40,7 +40,7 @@ void build_local(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
 
 
diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp
index 199cae4c8fa..e4d7112a15f 100644
--- a/reference/factorization/cholesky_kernels.cpp
+++ b/reference/factorization/cholesky_kernels.cpp
@@ -63,7 +63,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
 
 
@@ -102,7 +102,7 @@ void symbolic_factorize(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);
 
 
@@ -140,7 +140,7 @@ void forest_from_factor(
                                      num_rows + 1, child_ptrs);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);
 
 
@@ -172,8 +172,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
               });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
 
 
 namespace {
@@ -255,8 +254,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CHOLESKY_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
 
 
 }  // namespace cholesky
diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp
index 15d778c2235..99b522ffba9 100644
--- a/reference/factorization/factorization_kernels.cpp
+++ b/reference/factorization/factorization_kernels.cpp
@@ -127,7 +127,7 @@ void add_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
     mtx_builder.get_col_idx_array() = std::move(new_col_idxs_array);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -159,7 +159,7 @@ void initialize_row_ptrs_l_u(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);
 
 
@@ -177,7 +177,7 @@ void initialize_l_u(std::shared_ptr<const ReferenceExecutor> exec,
                                         helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);
 
 
@@ -204,7 +204,7 @@ void initialize_row_ptrs_l(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);
 
 
@@ -227,7 +227,7 @@ void initialize_l(std::shared_ptr<const ReferenceExecutor> exec,
                               helpers::identity{}));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
 
 
diff --git a/reference/factorization/ic_kernels.cpp b/reference/factorization/ic_kernels.cpp
index 3557ee0b978..93945c2da14 100644
--- a/reference/factorization/ic_kernels.cpp
+++ b/reference/factorization/ic_kernels.cpp
@@ -69,7 +69,7 @@ void sparselib_ic(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_IC_SPARSELIB_IC_KERNEL);
 
 
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
index 2eedd988929..3323e0b6cef 100644
--- a/reference/factorization/ilu_kernels.cpp
+++ b/reference/factorization/ilu_kernels.cpp
@@ -65,7 +65,7 @@ void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index d8bb8c427ef..9866eb5d207 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -58,8 +58,7 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_INITIALIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 
 
 namespace {
@@ -125,8 +124,7 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_LU_FACTORIZE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
 template <typename IndexType>
diff --git a/reference/factorization/par_ic_kernels.cpp b/reference/factorization/par_ic_kernels.cpp
index e8f3a9273f4..4da317cf201 100644
--- a/reference/factorization/par_ic_kernels.cpp
+++ b/reference/factorization/par_ic_kernels.cpp
@@ -46,7 +46,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
 
 
diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp
index c6b192b328b..684158d380c 100644
--- a/reference/factorization/par_ict_kernels.cpp
+++ b/reference/factorization/par_ict_kernels.cpp
@@ -89,7 +89,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
 
 
@@ -167,7 +167,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index ddcc41d1070..44c2e5f66bc 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -86,7 +86,7 @@ void compute_l_u_factors(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
 
 
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp
index c22c6924d6c..ce78ea451f6 100644
--- a/reference/factorization/par_ilut_kernels.cpp
+++ b/reference/factorization/par_ilut_kernels.cpp
@@ -58,7 +58,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     threshold = abs(*target);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
 
 
@@ -150,7 +150,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
 
 
@@ -226,7 +226,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
 
 
@@ -314,7 +314,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
 
 
@@ -437,7 +437,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
         [](IndexType, row_state) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
 
 
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index c277d4f0738..d3304ab9795 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_SCALE_KERNEL);
 
 
@@ -117,7 +117,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 9c92fb54056..599af30ecfb 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,8 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType>
@@ -117,8 +116,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_ADD_KERNEL);
 
 
 template <typename ValueType>
@@ -140,7 +138,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index bc0eb61e30d..1a4855f389f 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -39,7 +39,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -68,7 +68,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
@@ -94,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SCALE_KERNEL);
 
 
@@ -117,7 +117,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp
index ebb8c1dfce6..f9bf9f5f33d 100644
--- a/reference/matrix/coo_kernels.cpp
+++ b/reference/matrix/coo_kernels.cpp
@@ -38,8 +38,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     spmv2(exec, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -54,7 +53,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     advanced_spmv2(exec, alpha, a, b, c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);
 
 
@@ -74,8 +73,7 @@ void spmv2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_COO_SPMV2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -98,7 +96,7 @@ void advanced_spmv2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
 
 
@@ -115,7 +113,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL);
 
 
@@ -138,7 +136,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 679844084d2..8296ac2b582 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -76,7 +76,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SPMV_KERNEL);
 
 
@@ -116,7 +116,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -240,8 +240,7 @@ void spgemm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -296,7 +295,7 @@ void advanced_spgemm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
@@ -346,8 +345,7 @@ void spgeam(std::shared_ptr<const ReferenceExecutor> exec,
         [](IndexType, IndexType) {});
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -369,7 +367,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -416,7 +414,7 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -447,7 +445,7 @@ void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
 
 
@@ -517,7 +515,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -576,8 +574,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -589,7 +586,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -613,7 +610,7 @@ void calculate_nonzeros_per_row_in_span(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
@@ -660,7 +657,7 @@ void calculate_nonzeros_per_row_in_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
 
 
@@ -694,7 +691,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 
@@ -752,7 +749,7 @@ void compute_submatrix_from_index_set(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
 
 
@@ -803,7 +800,7 @@ void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -816,7 +813,7 @@ void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     inv_nonsymm_permute(exec, perm, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -854,7 +851,7 @@ void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -889,7 +886,7 @@ void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
@@ -924,7 +921,7 @@ void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -954,7 +951,7 @@ void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     cp_row_ptrs[num_rows] = in_row_ptrs[num_rows];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
 
 
@@ -967,7 +964,7 @@ void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1009,7 +1006,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1046,7 +1043,7 @@ void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1083,7 +1080,7 @@ void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1114,7 +1111,7 @@ void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     cp_row_ptrs[num_rows] = in_row_ptrs[num_rows];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1136,7 +1133,7 @@ void sort_by_column_index(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -1160,7 +1157,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -1185,8 +1182,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1202,8 +1198,7 @@ void scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1219,8 +1214,7 @@ void inv_scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_CSR_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1246,7 +1240,7 @@ void check_diagonal_entries_exist(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
 
 
@@ -1269,7 +1263,7 @@ void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 561073c8c2d..921a49998b7 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -56,8 +56,7 @@ void simple_apply(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -90,7 +89,7 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename InValueType, typename OutValueType>
@@ -106,7 +105,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
     GKO_DECLARE_DENSE_COPY_KERNEL);
 
 
@@ -121,7 +120,7 @@ void fill(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -143,8 +142,7 @@ void scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -167,7 +165,7 @@ void inv_scale(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
 
 
@@ -191,7 +189,7 @@ void add_scaled(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
 
 
@@ -215,7 +213,7 @@ void sub_scaled(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
 
 
@@ -231,8 +229,7 @@ void add_scaled_diag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 
 
 template <typename ValueType>
@@ -247,8 +244,7 @@ void sub_scaled_diag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 
 
 template <typename ValueType>
@@ -267,8 +263,7 @@ void compute_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 
 
 template <typename ValueType>
@@ -280,7 +275,7 @@ void compute_dot_dispatch(std::shared_ptr<const ReferenceExecutor> exec,
     compute_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
@@ -300,8 +295,7 @@ void compute_conj_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 
 
 template <typename ValueType>
@@ -314,7 +308,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_conj_dot(exec, x, y, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
@@ -337,8 +331,7 @@ void compute_norm2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 
 template <typename ValueType>
@@ -350,7 +343,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     compute_norm2(exec, x, result, tmp);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
@@ -370,8 +363,7 @@ void compute_norm1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 
 
 template <typename ValueType>
@@ -394,8 +386,7 @@ void compute_mean(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -409,7 +400,7 @@ void fill_in_matrix_data(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -429,7 +420,7 @@ void compute_squared_norm2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
 
 
@@ -444,7 +435,7 @@ void compute_sqrt(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
 
 
@@ -475,7 +466,7 @@ void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 
 
@@ -507,7 +498,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
 
 
@@ -539,7 +530,7 @@ void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL);
 
 
@@ -586,7 +577,7 @@ void convert_to_fbcsr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
 
 
@@ -635,7 +626,7 @@ void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);
 
 
@@ -671,7 +662,7 @@ void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL);
 
 
@@ -701,7 +692,7 @@ void convert_to_sparsity_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
 
 
@@ -722,7 +713,7 @@ void compute_max_nnz_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
 
 
@@ -754,7 +745,7 @@ void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
 
 
@@ -774,9 +765,9 @@ void count_nonzeros_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
 
 
@@ -806,7 +797,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
 
 
@@ -822,8 +813,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
@@ -838,8 +828,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -855,7 +844,7 @@ void symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
 
 
@@ -873,7 +862,7 @@ void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
 
 
@@ -890,7 +879,7 @@ void nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -907,7 +896,7 @@ void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
 
 
@@ -923,7 +912,7 @@ void row_gather(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
 
 
@@ -948,7 +937,7 @@ void advanced_row_gather(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
 
 
@@ -964,7 +953,7 @@ void col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
 
 
@@ -981,7 +970,7 @@ void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
 
 
@@ -998,7 +987,7 @@ void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
 
 
@@ -1017,7 +1006,7 @@ void symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1036,7 +1025,7 @@ void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1059,7 +1048,7 @@ void nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1082,7 +1071,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 
 
@@ -1100,7 +1089,7 @@ void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1118,7 +1107,7 @@ void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
@@ -1136,7 +1125,7 @@ void col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1154,7 +1143,7 @@ void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
@@ -1169,8 +1158,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType>
@@ -1185,8 +1173,7 @@ void inplace_absolute_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -1202,8 +1189,7 @@ void outplace_absolute_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -1219,7 +1205,7 @@ void make_complex(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
 
 
 template <typename ValueType>
@@ -1235,7 +1221,7 @@ void get_real(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
 
 
 template <typename ValueType>
@@ -1251,7 +1237,7 @@ void get_imag(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
 
 
 template <typename ValueType, typename ScalarType>
@@ -1271,7 +1257,7 @@ void add_scaled_identity(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
     GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 
 
diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp
index 47d59728ab0..028b7685c2b 100644
--- a/reference/matrix/diagonal_kernels.cpp
+++ b/reference/matrix/diagonal_kernels.cpp
@@ -35,8 +35,7 @@ void apply_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL);
 
 
 template <typename ValueType>
@@ -53,7 +52,7 @@ void right_apply_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL);
 
 
@@ -78,7 +77,7 @@ void apply_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
 
 
@@ -102,7 +101,7 @@ void right_apply_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL);
 
 
@@ -119,7 +118,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -142,7 +141,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     row_ptrs[size] = size;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -160,8 +159,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL);
 
 
 }  // namespace diagonal
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index ece95b38a39..5aae164aa41 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -68,7 +68,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
@@ -121,7 +121,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
@@ -161,7 +161,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -185,7 +185,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL);
 
 
@@ -203,8 +203,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_ELL_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -235,7 +234,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL);
 
 
@@ -259,7 +258,7 @@ void count_nonzeros_per_row(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -284,7 +283,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 048158136be..4c170a973a7 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -74,8 +74,7 @@ void spmv(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -119,7 +118,7 @@ void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -177,7 +176,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -213,7 +212,7 @@ void fill_in_dense(const std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -272,7 +271,7 @@ void convert_to_csr(const std::shared_ptr<const ReferenceExecutor>,
         static_cast<IndexType>(source->get_num_stored_elements());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
 
 
@@ -354,7 +353,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return x; });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
 
 
@@ -367,7 +366,7 @@ void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
                             [](const ValueType x) { return conj(x); });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -392,7 +391,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
@@ -449,7 +448,7 @@ void sort_by_column_index(const std::shared_ptr<const ReferenceExecutor> exec,
         syn::value_list<int>(), syn::type_list<>(), to_sort);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -488,7 +487,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
 
 
diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp
index 00af068803c..2fc27cdbe3a 100644
--- a/reference/matrix/fft_kernels.cpp
+++ b/reference/matrix/fft_kernels.cpp
@@ -116,7 +116,7 @@ void fft(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT_KERNEL);
 
 
 template <typename ValueType>
@@ -183,7 +183,7 @@ void fft2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT2_KERNEL);
 
 
 template <typename ValueType>
@@ -283,7 +283,7 @@ void fft3(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_BASE(GKO_DECLARE_FFT3_KERNEL);
 
 
 }  // namespace fft
diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp
index 5fe013297f3..f2a06c321f2 100644
--- a/reference/matrix/hybrid_kernels.cpp
+++ b/reference/matrix/hybrid_kernels.cpp
@@ -86,7 +86,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -130,7 +130,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 
diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp
index a352c0f777d..b00e06f72f2 100644
--- a/reference/matrix/scaled_permutation_kernels.cpp
+++ b/reference/matrix/scaled_permutation_kernels.cpp
@@ -26,7 +26,7 @@ void invert(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
 
 
@@ -51,7 +51,7 @@ void compose(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
 
 
diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp
index 70cfc3cac3a..120194d6952 100644
--- a/reference/matrix/sellp_kernels.cpp
+++ b/reference/matrix/sellp_kernels.cpp
@@ -55,8 +55,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_SELLP_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -97,7 +96,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
 
 
@@ -164,7 +163,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL);
 
 
@@ -199,7 +198,7 @@ void fill_in_dense(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL);
 
 
@@ -235,7 +234,7 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL);
 
 
@@ -281,7 +280,7 @@ void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
     result_row_ptrs[num_rows] = cur_ptr;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL);
 
 
@@ -318,7 +317,7 @@ void extract_diagonal(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL);
 
 
diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp
index b773d3b9a50..c511a16a292 100644
--- a/reference/matrix/sparsity_csr_kernels.cpp
+++ b/reference/matrix/sparsity_csr_kernels.cpp
@@ -55,7 +55,7 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
 
 
@@ -92,7 +92,7 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
@@ -113,7 +113,7 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 
 
@@ -138,7 +138,7 @@ void diagonal_element_prefix_sum(
     prefix_sum[num_rows] = num_diag;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL);
 
 
@@ -173,7 +173,7 @@ void remove_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
 
 
@@ -227,7 +227,7 @@ void transpose(std::shared_ptr<const ReferenceExecutor> exec,
     transpose_and_transform(exec, orig, trans);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
 
 
@@ -245,7 +245,7 @@ void sort_by_column_index(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
 
 
@@ -269,7 +269,7 @@ void is_sorted_by_column_index(
     return;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
 
 
diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp
index 2b4298377cb..bff2a776c6b 100644
--- a/reference/multigrid/pgm_kernels.cpp
+++ b/reference/multigrid/pgm_kernels.cpp
@@ -208,7 +208,7 @@ void find_strongest_neighbor(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR);
 
 
@@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG);
 
 
@@ -274,8 +274,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
     });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_PGM_SORT_ROW_MAJOR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR);
 
 
 template <typename ValueType, typename IndexType>
@@ -312,7 +311,7 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
     coarse_val[coarse_idxs] = temp_val;
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
 
 
diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp
index 3f6d75cca29..f994c8c448b 100644
--- a/reference/preconditioner/batch_jacobi_kernels.cpp
+++ b/reference/preconditioner/batch_jacobi_kernels.cpp
@@ -70,7 +70,7 @@ void extract_common_blocks_pattern(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);
 
 
@@ -96,7 +96,7 @@ void compute_block_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
 
 
diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp
index 6114d3d8e3c..55f56b5705e 100644
--- a/reference/preconditioner/isai_kernels.cpp
+++ b/reference/preconditioner/isai_kernels.cpp
@@ -219,7 +219,7 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      trs_solve, true);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL);
 
 
@@ -314,7 +314,7 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
                      general_solve, false);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL);
 
 
@@ -377,7 +377,7 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor>,
     e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start];
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL);
 
 
@@ -405,7 +405,7 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL);
 
 
@@ -430,7 +430,7 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor>,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
 
 
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index 52e3666ca30..4eaf0988a00 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -116,7 +116,7 @@ void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
                                             block_pointers.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);
 
 
@@ -406,7 +406,7 @@ void generate(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_GENERATE_KERNEL);
 
 
@@ -494,8 +494,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -528,7 +527,7 @@ void simple_apply(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
@@ -548,8 +547,7 @@ void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
 
 
 template <typename ValueType>
@@ -565,7 +563,7 @@ void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
 
 
@@ -578,8 +576,7 @@ void scalar_conj(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL);
 
 
 template <typename ValueType>
@@ -594,8 +591,7 @@ void invert_diagonal(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -630,7 +626,7 @@ void transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);
 
 
@@ -666,7 +662,7 @@ void conj_transpose_jacobi(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);
 
 
@@ -686,7 +682,7 @@ void scalar_convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL);
 
 
@@ -724,7 +720,7 @@ void convert_to_dense(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
diff --git a/reference/preconditioner/sor_kernels.cpp b/reference/preconditioner/sor_kernels.cpp
index b5ada476f13..88ac422dd02 100644
--- a/reference/preconditioner/sor_kernels.cpp
+++ b/reference/preconditioner/sor_kernels.cpp
@@ -32,7 +32,7 @@ void initialize_weighted_l(
             [](auto val) { return val; }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
 
 
@@ -60,7 +60,7 @@ void initialize_weighted_l_u(
             }));
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
 
 
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 3f105f27c48..f7652c0d183 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -87,7 +87,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 3acc49fc524..0d53115e1f1 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -86,7 +86,7 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
     dispatcher.apply(b, x, logdata);
 }
 
-GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER(
+GKO_INSTANTIATE_FOR_BATCH_VALUE_MATRIX_PRECONDITIONER_BASE(
     GKO_DECLARE_BATCH_CG_APPLY_KERNEL_WRAPPER);
 
 
diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp
index 511d4375ae5..dee2d30b8dc 100644
--- a/reference/solver/bicg_kernels.cpp
+++ b/reference/solver/bicg_kernels.cpp
@@ -46,8 +46,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -75,7 +74,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -103,7 +102,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL);
 
 
 }  // namespace bicg
diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp
index e762dc88533..31955a59c53 100644
--- a/reference/solver/bicgstab_kernels.cpp
+++ b/reference/solver/bicgstab_kernels.cpp
@@ -57,8 +57,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -88,8 +87,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -117,8 +115,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -152,8 +149,7 @@ void step_3(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -173,8 +169,7 @@ void finalize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
 
 
 }  // namespace bicgstab
diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp
index 5d41a0d0e00..c5acb41e45b 100644
--- a/reference/solver/cb_gmres_kernels.cpp
+++ b/reference/solver/cb_gmres_kernels.cpp
@@ -294,7 +294,8 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
+    GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType, typename Accessor3d>
diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp
index fe548b9a03a..5af15692414 100644
--- a/reference/solver/cg_kernels.cpp
+++ b/reference/solver/cg_kernels.cpp
@@ -42,7 +42,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -67,7 +67,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -93,7 +93,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL);
 
 
 }  // namespace cg
diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp
index f2f2200b996..a5a5f8c5862 100644
--- a/reference/solver/cgs_kernels.cpp
+++ b/reference/solver/cgs_kernels.cpp
@@ -51,8 +51,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_CGS_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -84,7 +83,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -115,7 +114,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -136,7 +135,7 @@ void step_3(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL);
 
 
 }  // namespace cgs
diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp
index 24c6135f0b1..4ba091e03ae 100644
--- a/reference/solver/common_gmres_kernels.cpp
+++ b/reference/solver/common_gmres_kernels.cpp
@@ -132,8 +132,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -157,7 +156,7 @@ void hessenberg_qr(std::shared_ptr<const ReferenceExecutor> exec,
                                  residual_norm_collection, iter, stop_status);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL);
 
 
@@ -187,7 +186,7 @@ void solve_krylov(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL);
 
 
diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp
index 5ba997da941..65b6bf27698 100644
--- a/reference/solver/fcg_kernels.cpp
+++ b/reference/solver/fcg_kernels.cpp
@@ -43,8 +43,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_FCG_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -69,7 +68,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -97,7 +96,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL);
 
 
 }  // namespace fcg
diff --git a/reference/solver/gcr_kernels.cpp b/reference/solver/gcr_kernels.cpp
index d51728b15cf..531814c641e 100644
--- a/reference/solver/gcr_kernels.cpp
+++ b/reference/solver/gcr_kernels.cpp
@@ -37,8 +37,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GCR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -57,7 +56,7 @@ void restart(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -83,7 +82,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL);
 
 
 }  // namespace gcr
diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp
index 6d5eaae1490..a7f5a751a3b 100644
--- a/reference/solver/gmres_kernels.cpp
+++ b/reference/solver/gmres_kernels.cpp
@@ -40,7 +40,7 @@ void restart(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL);
 
 
 template <typename ValueType>
@@ -69,8 +69,7 @@ void multi_axpy(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL);
 
 template <typename ValueType>
 void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
@@ -92,8 +91,7 @@ void multi_dot(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL);
 
 }  // namespace gmres
 }  // namespace reference
diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp
index 27315da3565..ef13cd08325 100644
--- a/reference/solver/idr_kernels.cpp
+++ b/reference/solver/idr_kernels.cpp
@@ -160,8 +160,7 @@ void initialize(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_INITIALIZE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
 
 
 template <typename ValueType>
@@ -191,7 +190,7 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -216,7 +215,7 @@ void step_2(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -259,7 +258,7 @@ void step_3(std::shared_ptr<const ReferenceExecutor> exec, const size_type nrhs,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
 
 
 template <typename ValueType>
@@ -287,8 +286,7 @@ void compute_omega(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp
index 49e3829d9af..ba02c9c838c 100644
--- a/reference/solver/lower_trs_kernels.cpp
+++ b/reference/solver/lower_trs_kernels.cpp
@@ -44,7 +44,7 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
 
 
@@ -88,7 +88,7 @@ void solve(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL);
 
 
diff --git a/reference/solver/multigrid_kernels.cpp b/reference/solver/multigrid_kernels.cpp
index 4ce4491c990..b08c9857d3a 100644
--- a/reference/solver/multigrid_kernels.cpp
+++ b/reference/solver/multigrid_kernels.cpp
@@ -43,8 +43,7 @@ void kcycle_step_1(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL);
 
 
 template <typename ValueType>
@@ -73,8 +72,7 @@ void kcycle_step_2(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL);
 
 
 template <typename ValueType>
@@ -91,7 +89,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 
 
diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp
index b1d045eeadb..f0c23a9c4cc 100644
--- a/reference/solver/upper_trs_kernels.cpp
+++ b/reference/solver/upper_trs_kernels.cpp
@@ -44,7 +44,7 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
     // "analysis" phase for the triangular matrix.
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
 
 
@@ -90,7 +90,7 @@ void solve(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL);
 
 
diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp
index ed91ff390b6..ba2672edc28 100644
--- a/reference/stop/residual_norm_kernels.cpp
+++ b/reference/stop/residual_norm_kernels.cpp
@@ -50,7 +50,7 @@ void residual_norm(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_RESIDUAL_NORM_KERNEL);
 
 
@@ -90,8 +90,7 @@ void implicit_residual_norm(
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(
-    GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL);
 
 
 }  // namespace implicit_residual_norm

From e410e9bd417dffd9cfc9b54dcb3aa12b616e1d19 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Dec 2024 12:33:24 +0100
Subject: [PATCH 426/448] rename testing type

---
 core/test/base/array.cpp                      |  3 +-
 core/test/base/batch_multi_vector.cpp         |  3 +-
 core/test/base/combination.cpp                |  3 +-
 core/test/base/composition.cpp                |  3 +-
 core/test/base/dense_cache.cpp                |  3 +-
 core/test/base/iterator_factory.cpp           |  4 +-
 core/test/base/mtx_io.cpp                     |  7 ++-
 core/test/base/segmented_array.cpp            |  3 +-
 core/test/components/addressable_pq.cpp       |  4 +-
 .../test/factorization/elimination_forest.cpp |  2 +-
 core/test/factorization/par_ic.cpp            |  3 +-
 core/test/factorization/par_ict.cpp           |  3 +-
 core/test/factorization/par_ilu.cpp           |  3 +-
 core/test/factorization/par_ilut.cpp          |  2 +-
 core/test/log/convergence.cpp                 |  3 +-
 core/test/log/papi.cpp                        |  2 +-
 core/test/log/solver_progress.cpp             |  3 +-
 core/test/log/stream.cpp                      |  2 +-
 core/test/matrix/batch_csr.cpp                |  2 +-
 core/test/matrix/batch_dense.cpp              |  2 +-
 core/test/matrix/batch_ell.cpp                |  2 +-
 core/test/matrix/batch_identity.cpp           |  3 +-
 core/test/matrix/coo.cpp                      |  3 +-
 core/test/matrix/coo_builder.cpp              |  2 +-
 core/test/matrix/csr.cpp                      |  3 +-
 core/test/matrix/csr_builder.cpp              |  2 +-
 core/test/matrix/dense.cpp                    |  2 +-
 core/test/matrix/diagonal.cpp                 |  3 +-
 core/test/matrix/ell.cpp                      |  3 +-
 core/test/matrix/fbcsr.cpp                    |  7 ++-
 core/test/matrix/fbcsr_builder.cpp            |  2 +-
 core/test/matrix/hybrid.cpp                   |  3 +-
 core/test/matrix/identity.cpp                 |  6 +--
 core/test/matrix/permutation.cpp              |  2 +-
 core/test/matrix/row_gatherer.cpp             |  2 +-
 core/test/matrix/sellp.cpp                    |  3 +-
 core/test/matrix/sparsity_csr.cpp             |  2 +-
 core/test/mpi/base/bindings.cpp               |  2 +-
 core/test/mpi/distributed/matrix.cpp          |  2 +-
 .../distributed/preconditioner/schwarz.cpp    |  2 +-
 core/test/mpi/distributed/vector_cache.cpp    |  2 +-
 core/test/multigrid/fixed_coarsening.cpp      |  2 +-
 core/test/multigrid/pgm.cpp                   |  2 +-
 core/test/preconditioner/isai.cpp             |  2 +-
 core/test/preconditioner/jacobi.cpp           |  2 +-
 core/test/reorder/amd.cpp                     |  3 +-
 core/test/solver/batch_bicgstab.cpp           |  3 +-
 core/test/solver/batch_cg.cpp                 |  2 +-
 core/test/solver/bicg.cpp                     |  2 +-
 core/test/solver/bicgstab.cpp                 |  3 +-
 core/test/solver/cb_gmres.cpp                 |  4 +-
 core/test/solver/cg.cpp                       |  2 +-
 core/test/solver/cgs.cpp                      |  2 +-
 core/test/solver/direct.cpp                   |  3 +-
 core/test/solver/fcg.cpp                      |  2 +-
 core/test/solver/gcr.cpp                      |  2 +-
 core/test/solver/gmres.cpp                    |  2 +-
 core/test/solver/idr.cpp                      |  2 +-
 core/test/solver/ir.cpp                       |  2 +-
 core/test/solver/lower_trs.cpp                |  2 +-
 core/test/solver/multigrid.cpp                |  3 +-
 core/test/solver/upper_trs.cpp                |  2 +-
 core/test/utils.hpp                           | 54 +++++++++----------
 core/test/utils/array_generator_test.cpp      |  3 +-
 core/test/utils/matrix_generator_test.cpp     |  3 +-
 core/test/utils/matrix_utils_test.cpp         |  3 +-
 core/test/utils/unsort_matrix_test.cpp        |  2 +-
 core/test/utils/value_generator_test.cpp      |  3 +-
 cuda/test/base/array.cpp                      |  3 +-
 extensions/test/kokkos/types.cpp              |  4 +-
 hip/test/matrix/fbcsr_kernels.cpp             |  3 +-
 reference/test/base/array.cpp                 |  3 +-
 .../test/base/batch_multi_vector_kernels.cpp  |  3 +-
 reference/test/base/combination.cpp           |  3 +-
 reference/test/base/composition.cpp           |  3 +-
 reference/test/base/perturbation.cpp          |  3 +-
 .../components/absolute_array_kernels.cpp     |  3 +-
 .../test/components/fill_array_kernels.cpp    |  2 +-
 .../test/components/reduce_array_kernels.cpp  |  2 +-
 .../test/distributed/assembly_kernels.cpp     |  2 +-
 reference/test/distributed/matrix_kernels.cpp |  2 +-
 reference/test/distributed/vector_kernels.cpp |  2 +-
 .../test/factorization/cholesky_kernels.cpp   |  2 +-
 .../test/factorization/factorization.cpp      |  2 +-
 reference/test/factorization/ic_kernels.cpp   |  3 +-
 reference/test/factorization/ilu_kernels.cpp  |  3 +-
 reference/test/factorization/lu_kernels.cpp   |  3 +-
 .../test/factorization/par_ic_kernels.cpp     |  3 +-
 .../test/factorization/par_ict_kernels.cpp    |  3 +-
 .../test/factorization/par_ilu_kernels.cpp    |  3 +-
 .../test/factorization/par_ilut_kernels.cpp   |  2 +-
 reference/test/log/convergence.cpp            |  3 +-
 reference/test/log/papi.cpp                   |  2 +-
 reference/test/matrix/batch_csr_kernels.cpp   |  2 +-
 reference/test/matrix/batch_dense_kernels.cpp |  2 +-
 reference/test/matrix/batch_ell_kernels.cpp   |  2 +-
 reference/test/matrix/coo_kernels.cpp         |  5 +-
 reference/test/matrix/csr_kernels.cpp         |  7 ++-
 reference/test/matrix/dense_kernels.cpp       |  6 +--
 reference/test/matrix/diagonal_kernels.cpp    |  5 +-
 reference/test/matrix/ell_kernels.cpp         |  5 +-
 reference/test/matrix/fbcsr_kernels.cpp       |  5 +-
 reference/test/matrix/fft_kernels.cpp         |  2 +-
 reference/test/matrix/hybrid_kernels.cpp      |  5 +-
 reference/test/matrix/identity.cpp            |  3 +-
 reference/test/matrix/permutation.cpp         |  2 +-
 reference/test/matrix/scaled_permutation.cpp  |  2 +-
 reference/test/matrix/sellp_kernels.cpp       |  5 +-
 reference/test/matrix/sparsity_csr.cpp        |  2 +-
 .../test/matrix/sparsity_csr_kernels.cpp      |  2 +-
 .../multigrid/fixed_coarsening_kernels.cpp    |  2 +-
 reference/test/multigrid/pgm_kernels.cpp      |  3 +-
 .../test/preconditioner/gauss_seidel.cpp      |  2 +-
 reference/test/preconditioner/ic.cpp          |  3 +-
 reference/test/preconditioner/ilu.cpp         |  2 +-
 .../test/preconditioner/isai_kernels.cpp      |  6 +--
 reference/test/preconditioner/jacobi.cpp      |  3 +-
 .../test/preconditioner/jacobi_kernels.cpp    |  3 +-
 reference/test/preconditioner/sor_kernels.cpp |  3 +-
 reference/test/reorder/mc64.cpp               |  3 +-
 reference/test/reorder/mc64_kernels.cpp       |  3 +-
 reference/test/reorder/rcm.cpp                |  3 +-
 reference/test/reorder/scaled_reordered.cpp   |  2 +-
 .../test/solver/batch_bicgstab_kernels.cpp    |  2 +-
 reference/test/solver/batch_cg_kernels.cpp    |  3 +-
 reference/test/solver/bicg_kernels.cpp        |  2 +-
 reference/test/solver/bicgstab_kernels.cpp    |  3 +-
 reference/test/solver/cb_gmres_kernels.cpp    |  4 +-
 reference/test/solver/cg_kernels.cpp          |  2 +-
 reference/test/solver/cgs_kernels.cpp         |  2 +-
 reference/test/solver/direct.cpp              |  3 +-
 reference/test/solver/fcg_kernels.cpp         |  2 +-
 reference/test/solver/gcr_kernels.cpp         |  2 +-
 reference/test/solver/gmres_kernels.cpp       |  2 +-
 reference/test/solver/idr_kernels.cpp         |  2 +-
 reference/test/solver/ir_kernels.cpp          |  2 +-
 reference/test/solver/lower_trs.cpp           |  2 +-
 reference/test/solver/lower_trs_kernels.cpp   |  2 +-
 reference/test/solver/multigrid_kernels.cpp   |  2 +-
 reference/test/solver/upper_trs.cpp           |  2 +-
 reference/test/solver/upper_trs_kernels.cpp   |  2 +-
 reference/test/stop/residual_norm_kernels.cpp | 11 ++--
 reference/test/utils/assertions_test.cpp      |  3 +-
 test/base/device_matrix_data_kernels.cpp      |  2 +-
 test/components/fill_array_kernels.cpp        |  2 +-
 test/components/reduce_array_kernels.cpp      |  2 +-
 test/distributed/assembly_kernels.cpp         |  2 +-
 test/distributed/matrix_kernels.cpp           |  2 +-
 test/distributed/vector_kernels.cpp           |  2 +-
 test/factorization/cholesky_kernels.cpp       |  6 +--
 test/factorization/lu_kernels.cpp             |  6 +--
 test/factorization/par_ic_kernels.cpp         |  3 +-
 test/factorization/par_ict_kernels.cpp        |  3 +-
 test/factorization/par_ilu_kernels.cpp        |  3 +-
 test/factorization/par_ilut_kernels.cpp       |  2 +-
 test/matrix/fbcsr_kernels.cpp                 |  5 +-
 test/matrix/fft_kernels.cpp                   |  2 +-
 test/mpi/assembly.cpp                         |  2 +-
 test/mpi/matrix.cpp                           |  4 +-
 test/mpi/multigrid/pgm.cpp                    |  2 +-
 test/mpi/preconditioner/schwarz.cpp           |  3 +-
 test/mpi/vector.cpp                           |  9 ++--
 test/reorder/amd.cpp                          |  3 +-
 test/solver/direct.cpp                        |  6 +--
 test/stop/residual_norm_kernels.cpp           | 11 ++--
 165 files changed, 233 insertions(+), 306 deletions(-)

diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 23515d70fc4..f7e03855d06 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -40,8 +40,7 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor)
diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp
index 7a9606bc710..3798f30ce65 100644
--- a/core/test/base/batch_multi_vector.cpp
+++ b/core/test/base/batch_multi_vector.cpp
@@ -64,8 +64,7 @@ class MultiVector : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
 };
 
-TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MultiVector, CanBeEmpty)
diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp
index 63c73cfa168..73c30ffe11c 100644
--- a/core/test/base/combination.cpp
+++ b/core/test/base/combination.cpp
@@ -43,8 +43,7 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> coefficients;
 };
 
-TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Combination, CanBeEmpty)
diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp
index 58c86894fc8..122755b8f92 100644
--- a/core/test/base/composition.cpp
+++ b/core/test/base/composition.cpp
@@ -41,8 +41,7 @@ class Composition : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
-TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Composition, CanBeEmpty)
diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp
index 54d904617db..526187610a4 100644
--- a/core/test/base/dense_cache.cpp
+++ b/core/test/base/dense_cache.cpp
@@ -31,8 +31,7 @@ class DenseCache : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(DenseCache, CanDefaultConstruct)
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index 3685242f78a..bbc3bbfd04f 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -78,7 +78,7 @@ class ZipIterator : public ::testing::Test {
     const std::vector<value_type> ordered_value;
 };
 
-TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -366,7 +366,7 @@ class PermuteIterator : public ::testing::Test {
     using value_type = ValueType;
 };
 
-TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypesWithHalf,
+TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index 14d44335b85..f800f4e40f2 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -973,7 +973,7 @@ class RealDummyLinOpTest : public ::testing::Test {
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
 };
 
-TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -1178,7 +1178,7 @@ class DenseTest : public ::testing::Test {
     using index_type = typename std::tuple_element<1, ValueIndexType>::type;
 };
 
-TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -1222,8 +1222,7 @@ class ComplexDummyLinOpTest : public ::testing::Test {
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
 };
 
-TYPED_TEST_SUITE(ComplexDummyLinOpTest,
-                 gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ComplexDummyLinOpTest, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp
index 31444d71d18..2741990036f 100644
--- a/core/test/base/segmented_array.cpp
+++ b/core/test/base/segmented_array.cpp
@@ -27,8 +27,7 @@ class SegmentedArray : public ::testing::Test {
     std::shared_ptr<gko::Executor> exec = gko::ReferenceExecutor::create();
 };
 
-TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(SegmentedArray, CanConstructFromExecutor)
diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp
index 87fcb289a77..6301cd44fb4 100644
--- a/core/test/components/addressable_pq.cpp
+++ b/core/test/components/addressable_pq.cpp
@@ -91,8 +91,8 @@ class AddressablePriorityQueue : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(AddressablePriorityQueue,
-                 gko::test::RealValueIndexTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(AddressablePriorityQueue, gko::test::RealValueIndexTypes,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(AddressablePriorityQueue, InitializesCorrectly)
diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp
index cf9ddb7536e..292b366f50e 100644
--- a/core/test/factorization/elimination_forest.cpp
+++ b/core/test/factorization/elimination_forest.cpp
@@ -33,7 +33,7 @@ class EliminationForest : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/factorization/par_ic.cpp b/core/test/factorization/par_ic.cpp
index efd4c1e3ebd..d6de0f9fc98 100644
--- a/core/test/factorization/par_ic.cpp
+++ b/core/test/factorization/par_ic.cpp
@@ -29,8 +29,7 @@ class ParIc : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, SetIterations)
diff --git a/core/test/factorization/par_ict.cpp b/core/test/factorization/par_ict.cpp
index 5d5ac8bc815..07eec8db549 100644
--- a/core/test/factorization/par_ict.cpp
+++ b/core/test/factorization/par_ict.cpp
@@ -29,8 +29,7 @@ class ParIct : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, SetIterations)
diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp
index e06a90741af..a0b8f37e3d4 100644
--- a/core/test/factorization/par_ilu.cpp
+++ b/core/test/factorization/par_ilu.cpp
@@ -29,8 +29,7 @@ class ParIlu : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, SetIterations)
diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp
index a2d0287d22a..ad466e62407 100644
--- a/core/test/factorization/par_ilut.cpp
+++ b/core/test/factorization/par_ilut.cpp
@@ -30,7 +30,7 @@ class ParIlut : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp
index 64ec37e8942..8fff0c17b8e 100644
--- a/core/test/log/convergence.cpp
+++ b/core/test/log/convergence.cpp
@@ -45,8 +45,7 @@ class Convergence : public ::testing::Test {
     gko::array<gko::stopping_status> status = {exec, 1};
 };
 
-TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Convergence, CanGetEmptyData)
diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp
index e0404b04d90..8278120cc49 100644
--- a/core/test/log/papi.cpp
+++ b/core/test/log/papi.cpp
@@ -91,7 +91,7 @@ class Papi : public ::testing::Test {
     int eventset;
 };
 
-TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Papi, CatchesAllocationStarted)
diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp
index 2b4a6ac599c..e00044a908d 100644
--- a/core/test/log/solver_progress.cpp
+++ b/core/test/log/solver_progress.cpp
@@ -68,8 +68,7 @@ class SolverProgress : public ::testing::Test {
     std::unique_ptr<Cg> solver;
 };
 
-TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(SolverProgress, TableWorks)
diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp
index 7f4b41e5cc3..1ad02f7daf1 100644
--- a/core/test/log/stream.cpp
+++ b/core/test/log/stream.cpp
@@ -26,7 +26,7 @@ constexpr int num_iters = 10;
 template <typename T>
 class Stream : public ::testing::Test {};
 
-TYPED_TEST_SUITE(Stream, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Stream, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Stream, CatchesAllocationStarted)
diff --git a/core/test/matrix/batch_csr.cpp b/core/test/matrix/batch_csr.cpp
index 3a1871ba583..57cae53d646 100644
--- a/core/test/matrix/batch_csr.cpp
+++ b/core/test/matrix/batch_csr.cpp
@@ -114,7 +114,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<CsrMtx> sp_csr_mtx;
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Csr, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp
index 23542114746..334df5c0e93 100644
--- a/core/test/matrix/batch_dense.cpp
+++ b/core/test/matrix/batch_dense.cpp
@@ -68,7 +68,7 @@ class Dense : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
 };
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index ae047ecfa90..11f6381a43d 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -92,7 +92,7 @@ class Ell : public ::testing::Test {
     std::unique_ptr<EllMtx> sp_ell_mtx;
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Ell, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/batch_identity.cpp b/core/test/matrix/batch_identity.cpp
index 765f9f30938..dd7a3675110 100644
--- a/core/test/matrix/batch_identity.cpp
+++ b/core/test/matrix/batch_identity.cpp
@@ -49,8 +49,7 @@ class Identity : public ::testing::Test {
     std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
 };
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, KnowsItsSizeAndValues)
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp
index 56735e792d5..ffb8d5aee9f 100644
--- a/core/test/matrix/coo.cpp
+++ b/core/test/matrix/coo.cpp
@@ -77,8 +77,7 @@ class Coo : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Coo, KnowsItsSize)
diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp
index b1b22c5848a..9bfae5cf3af 100644
--- a/core/test/matrix/coo_builder.cpp
+++ b/core/test/matrix/coo_builder.cpp
@@ -32,7 +32,7 @@ class CooBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index f199de423e8..4bbdc63851a 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -82,8 +82,7 @@ class Csr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Csr, KnowsItsSize)
diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp
index 2accb57770c..24cbe4718c5 100644
--- a/core/test/matrix/csr_builder.cpp
+++ b/core/test/matrix/csr_builder.cpp
@@ -33,7 +33,7 @@ class CsrBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp
index f1a673840ea..e7158a15aed 100644
--- a/core/test/matrix/dense.cpp
+++ b/core/test/matrix/dense.cpp
@@ -48,7 +48,7 @@ class Dense : public ::testing::Test {
     std::unique_ptr<gko::matrix::Dense<value_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, CanBeEmpty)
diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp
index 7e598d67a5e..de03a9350bb 100644
--- a/core/test/matrix/diagonal.cpp
+++ b/core/test/matrix/diagonal.cpp
@@ -47,8 +47,7 @@ class Diagonal : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Diagonal, KnowsItsSize)
diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp
index 93fc73dde18..bcc2b591a50 100644
--- a/core/test/matrix/ell.cpp
+++ b/core/test/matrix/ell.cpp
@@ -79,8 +79,7 @@ class Ell : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ell, KnowsItsSize)
diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp
index fd024532a14..3d3d4ee738d 100644
--- a/core/test/matrix/fbcsr.cpp
+++ b/core/test/matrix/fbcsr.cpp
@@ -131,7 +131,7 @@ class FbcsrSample : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -183,7 +183,7 @@ template <typename ValueIndexType>
 class FbcsrSampleComplex : public FbcsrSample<ValueIndexType> {};
 
 
-TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -282,8 +282,7 @@ class Fbcsr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Fbcsr, GetNumBlocksCorrectlyThrows)
diff --git a/core/test/matrix/fbcsr_builder.cpp b/core/test/matrix/fbcsr_builder.cpp
index 241c7ccc6eb..d91a0c7b70a 100644
--- a/core/test/matrix/fbcsr_builder.cpp
+++ b/core/test/matrix/fbcsr_builder.cpp
@@ -33,7 +33,7 @@ class FbcsrBuilder : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp
index 6b1e2a4a747..d1a69312755 100644
--- a/core/test/matrix/hybrid.cpp
+++ b/core/test/matrix/hybrid.cpp
@@ -96,8 +96,7 @@ class Hybrid : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Hybrid, KnowsItsSize)
diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp
index 80defae4441..bcf9c036992 100644
--- a/core/test/matrix/identity.cpp
+++ b/core/test/matrix/identity.cpp
@@ -31,8 +31,7 @@ class Identity : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, CanBeEmpty)
@@ -82,8 +81,7 @@ class IdentityFactory : public ::testing::Test {
     using value_type = T;
 };
 
-TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(IdentityFactory, CanGenerateIdentityMatrix)
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
index fcd5aad789c..edb1532696b 100644
--- a/core/test/matrix/permutation.cpp
+++ b/core/test/matrix/permutation.cpp
@@ -52,7 +52,7 @@ class Permutation : public ::testing::Test {
     std::unique_ptr<gko::matrix::Permutation<index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp
index b808828cc08..46a09b808fc 100644
--- a/core/test/matrix/row_gatherer.cpp
+++ b/core/test/matrix/row_gatherer.cpp
@@ -65,7 +65,7 @@ class RowGatherer : public ::testing::Test {
     std::unique_ptr<OutVec> out;
 };
 
-TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexTypeWithHalf,
+TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexTypes,
                  TupleTypenameNameGenerator);
 
 
diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp
index a79fcf2bbd3..123d7bae773 100644
--- a/core/test/matrix/sellp.cpp
+++ b/core/test/matrix/sellp.cpp
@@ -107,8 +107,7 @@ class Sellp : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sellp, KnowsItsSize)
diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp
index 67f8237adb6..e929f960f1e 100644
--- a/core/test/matrix/sparsity_csr.cpp
+++ b/core/test/matrix/sparsity_csr.cpp
@@ -74,7 +74,7 @@ class SparsityCsr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp
index ddcbb1777df..9feebb76ca9 100644
--- a/core/test/mpi/base/bindings.cpp
+++ b/core/test/mpi/base/bindings.cpp
@@ -24,7 +24,7 @@ class MpiBindings : public ::testing::Test {
     std::shared_ptr<gko::Executor> ref;
 };
 
-using TestTypes = gko::test::merge_type_list_t<gko::test::RealValueTypes,
+using TestTypes = gko::test::merge_type_list_t<gko::test::RealValueTypesBase,
                                                gko::test::IndexTypes>;
 
 TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator);
diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp
index 2b027face11..26a551b5758 100644
--- a/core/test/mpi/distributed/matrix.cpp
+++ b/core/test/mpi/distributed/matrix.cpp
@@ -177,7 +177,7 @@ class MatrixBuilder : public ::testing::Test {
     gko::experimental::mpi::communicator comm;
 };
 
-TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp
index fb6676cc011..b55ec6a80ce 100644
--- a/core/test/mpi/distributed/preconditioner/schwarz.cpp
+++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp
@@ -65,7 +65,7 @@ class SchwarzFactory : public ::testing::Test {
     std::shared_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/core/test/mpi/distributed/vector_cache.cpp b/core/test/mpi/distributed/vector_cache.cpp
index f64c5fe9038..320d37c3c33 100644
--- a/core/test/mpi/distributed/vector_cache.cpp
+++ b/core/test/mpi/distributed/vector_cache.cpp
@@ -42,7 +42,7 @@ class VectorCache : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(VectorCache, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(VectorCache, gko::test::ValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(VectorCache, CanDefaultConstruct)
diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp
index 35bd04bb067..5cab7282b5d 100644
--- a/core/test/multigrid/fixed_coarsening.cpp
+++ b/core/test/multigrid/fixed_coarsening.cpp
@@ -38,7 +38,7 @@ class FixedCoarseningFactory : public ::testing::Test {
     std::unique_ptr<typename MgLevel::Factory> fixed_coarsening_factory;
 };
 
-TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/multigrid/pgm.cpp b/core/test/multigrid/pgm.cpp
index c06edda60a0..7798e97f5d6 100644
--- a/core/test/multigrid/pgm.cpp
+++ b/core/test/multigrid/pgm.cpp
@@ -40,7 +40,7 @@ class PgmFactory : public ::testing::Test {
     std::unique_ptr<typename MgLevel::Factory> pgm_factory;
 };
 
-TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp
index b2ee8175d49..b5e7400d0e8 100644
--- a/core/test/preconditioner/isai.cpp
+++ b/core/test/preconditioner/isai.cpp
@@ -64,7 +64,7 @@ class IsaiFactory : public ::testing::Test {
     std::unique_ptr<typename UpperIsai::Factory> upper_isai_factory;
 };
 
-TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp
index 40bc9e8d494..8813b4c3c4d 100644
--- a/core/test/preconditioner/jacobi.cpp
+++ b/core/test/preconditioner/jacobi.cpp
@@ -43,7 +43,7 @@ class JacobiFactory : public ::testing::Test {
     std::shared_ptr<gko::matrix::Csr<value_type, index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/reorder/amd.cpp b/core/test/reorder/amd.cpp
index b97201e929e..9eecf3777e1 100644
--- a/core/test/reorder/amd.cpp
+++ b/core/test/reorder/amd.cpp
@@ -177,8 +177,7 @@ class Amd : public ::testing::Test {
     std::shared_ptr<gko::experimental::reorder::Amd<index_type>> amd;
 };
 
-TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Amd, WorksAndReducesFillIn)
diff --git a/core/test/solver/batch_bicgstab.cpp b/core/test/solver/batch_bicgstab.cpp
index 0b50f7f6e92..cd9446d07b2 100644
--- a/core/test/solver/batch_bicgstab.cpp
+++ b/core/test/solver/batch_bicgstab.cpp
@@ -50,8 +50,7 @@ class BatchBicgstab : public ::testing::Test {
     std::unique_ptr<gko::batch::BatchLinOp> solver;
 };
 
-TYPED_TEST_SUITE(BatchBicgstab, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(BatchBicgstab, FactoryKnowsItsExecutor)
diff --git a/core/test/solver/batch_cg.cpp b/core/test/solver/batch_cg.cpp
index b517c931adf..1e97c765f8a 100644
--- a/core/test/solver/batch_cg.cpp
+++ b/core/test/solver/batch_cg.cpp
@@ -50,7 +50,7 @@ class BatchCg : public ::testing::Test {
     std::unique_ptr<gko::batch::BatchLinOp> solver;
 };
 
-TYPED_TEST_SUITE(BatchCg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchCg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(BatchCg, FactoryKnowsItsExecutor)
diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp
index a229bd85ed9..e5a40e0c4f8 100644
--- a/core/test/solver/bicg.cpp
+++ b/core/test/solver/bicg.cpp
@@ -46,7 +46,7 @@ class Bicg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicg, BicgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp
index 23695fe1355..f8b8d3c7b05 100644
--- a/core/test/solver/bicgstab.cpp
+++ b/core/test/solver/bicgstab.cpp
@@ -45,8 +45,7 @@ class Bicgstab : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicgstab, BicgstabFactoryKnowsItsExecutor)
diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp
index e014e5f8acc..70eb34a8440 100644
--- a/core/test/solver/cb_gmres.cpp
+++ b/core/test/solver/cb_gmres.cpp
@@ -87,9 +87,9 @@ using st_ir2 = st_helper_type<st_enum::ireduce2>;
 
 using TestTypes = gko::test::merge_type_list_t<
     gko::test::cartesian_type_product_t<
-        gko::test::ValueTypes, ::testing::Types<st_keep, st_r1, st_r2>>,
+        gko::test::ValueTypesBase, ::testing::Types<st_keep, st_r1, st_r2>>,
     gko::test::cartesian_type_product_t<
-        gko::test::RealValueTypes, ::testing::Types<st_i, st_ir1, st_ir2>>>;
+        gko::test::RealValueTypesBase, ::testing::Types<st_i, st_ir1, st_ir2>>>;
 
 TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator);
 
diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp
index 95552d841ac..cbf637de302 100644
--- a/core/test/solver/cg.cpp
+++ b/core/test/solver/cg.cpp
@@ -46,7 +46,7 @@ class Cg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Cg, CgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp
index cc355b58270..5dc80892a1b 100644
--- a/core/test/solver/cgs.cpp
+++ b/core/test/solver/cgs.cpp
@@ -46,7 +46,7 @@ class Cgs : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Cgs, CgsFactoryKnowsItsExecutor)
diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp
index 43acdd0bdf1..d895892a8be 100644
--- a/core/test/solver/direct.cpp
+++ b/core/test/solver/direct.cpp
@@ -35,8 +35,7 @@ class Direct : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> factory;
 };
 
-TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Direct, FactoryKnowsItsExecutor)
diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp
index c92fa4bb7f1..2898a5f5c46 100644
--- a/core/test/solver/fcg.cpp
+++ b/core/test/solver/fcg.cpp
@@ -44,7 +44,7 @@ class Fcg : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Fcg, FcgFactoryKnowsItsExecutor)
diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp
index 58194f6e92a..add10957c7d 100644
--- a/core/test/solver/gcr.cpp
+++ b/core/test/solver/gcr.cpp
@@ -70,7 +70,7 @@ class Gcr : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Gcr, GcrFactoryKnowsItsExecutor)
diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp
index 50f505f6321..9d4514f66e0 100644
--- a/core/test/solver/gmres.cpp
+++ b/core/test/solver/gmres.cpp
@@ -60,7 +60,7 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<gko::LinOp> big_solver;
 };
 
-TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Gmres, GmresFactoryKnowsItsExecutor)
diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp
index 823327e337e..9eb79356046 100644
--- a/core/test/solver/idr.cpp
+++ b/core/test/solver/idr.cpp
@@ -45,7 +45,7 @@ class Idr : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Idr, IdrFactoryKnowsItsExecutor)
diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp
index 59f85f42321..1137862a395 100644
--- a/core/test/solver/ir.cpp
+++ b/core/test/solver/ir.cpp
@@ -46,7 +46,7 @@ class Ir : public ::testing::Test {
     std::unique_ptr<gko::LinOp> solver;
 };
 
-TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Ir, IrFactoryKnowsItsExecutor)
diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp
index ae07e08c3f7..dfcb564ca12 100644
--- a/core/test/solver/lower_trs.cpp
+++ b/core/test/solver/lower_trs.cpp
@@ -33,7 +33,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> lower_trs_factory;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp
index 54c4a18b8d3..85be8402243 100644
--- a/core/test/solver/multigrid.cpp
+++ b/core/test/solver/multigrid.cpp
@@ -164,8 +164,7 @@ class Multigrid : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor)
diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp
index bc53d1a193c..2e84cb81e10 100644
--- a/core/test/solver/upper_trs.cpp
+++ b/core/test/solver/upper_trs.cpp
@@ -33,7 +33,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> upper_trs_factory;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index ab9326400e0..e9c4c5e0c99 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -321,14 +321,14 @@ using add_inner_wrapper_t =
     typename detail::add_inner_wrapper<NewInnerWrapper, ListType>::type;
 
 
-using RealValueTypes =
+using RealValueTypesBase =
 #if GINKGO_DPCPP_SINGLE_MODE
     ::testing::Types<float>;
 #else
     ::testing::Types<float, double>;
 #endif
 
-using RealValueTypesWithHalf = ::testing::Types<
+using RealValueTypes = ::testing::Types<
 #if GINKGO_ENABLE_HALF
     gko::half,
 #endif
@@ -337,16 +337,16 @@ using RealValueTypesWithHalf = ::testing::Types<
 #endif
     float>;
 
+using ComplexValueTypesBase =
+    add_inner_wrapper_t<std::complex, RealValueTypesBase>;
+
 using ComplexValueTypes = add_inner_wrapper_t<std::complex, RealValueTypes>;
 
-using ComplexValueTypesWithHalf =
-    add_inner_wrapper_t<std::complex, RealValueTypesWithHalf>;
+using ValueTypesBase =
+    merge_type_list_t<RealValueTypesBase, ComplexValueTypesBase>;
 
 using ValueTypes = merge_type_list_t<RealValueTypes, ComplexValueTypes>;
 
-using ValueTypesWithHalf =
-    merge_type_list_t<RealValueTypesWithHalf, ComplexValueTypesWithHalf>;
-
 using IndexTypes = ::testing::Types<int32, int64>;
 
 using IntegerTypes = merge_type_list_t<IndexTypes, ::testing::Types<size_type>>;
@@ -355,48 +355,46 @@ using LocalGlobalIndexTypes =
     ::testing::Types<std::tuple<int32, int32>, std::tuple<int32, int64>,
                      std::tuple<int64, int64>>;
 
+using PODTypesBase = merge_type_list_t<RealValueTypesBase, IntegerTypes>;
+
 using PODTypes = merge_type_list_t<RealValueTypes, IntegerTypes>;
 
-using PODTypesWithHalf =
-    merge_type_list_t<RealValueTypesWithHalf, IntegerTypes>;
+using ComplexAndPODTypesBase =
+    merge_type_list_t<ComplexValueTypesBase, PODTypesBase>;
 
-using ComplexAndPODTypes = merge_type_list_t<ComplexValueTypes, PODTypes>;
+using ComplexAndPODTypes = merge_type_list_t<ComplexValueTypes, PODTypesBase>;
 
-using ComplexAndPODTypesWithHalf =
-    merge_type_list_t<ComplexValueTypesWithHalf, PODTypes>;
+using ValueIndexTypesBase =
+    cartesian_type_product_t<ValueTypesBase, IndexTypes>;
 
 using ValueIndexTypes = cartesian_type_product_t<ValueTypes, IndexTypes>;
 
-using ValueIndexTypesWithHalf =
-    cartesian_type_product_t<ValueTypesWithHalf, IndexTypes>;
+using RealValueIndexTypesBase =
+    cartesian_type_product_t<RealValueTypesBase, IndexTypes>;
 
 using RealValueIndexTypes =
     cartesian_type_product_t<RealValueTypes, IndexTypes>;
 
-using RealValueIndexTypesWithHalf =
-    cartesian_type_product_t<RealValueTypesWithHalf, IndexTypes>;
+using ComplexValueIndexTypesBase =
+    cartesian_type_product_t<ComplexValueTypesBase, IndexTypes>;
 
 using ComplexValueIndexTypes =
     cartesian_type_product_t<ComplexValueTypes, IndexTypes>;
 
-using ComplexValueIndexTypesWithHalf =
-    cartesian_type_product_t<ComplexValueTypesWithHalf, IndexTypes>;
+using TwoValueIndexTypesBase = add_to_cartesian_type_product_t<
+    merge_type_list_t<
+        cartesian_type_product_t<RealValueTypesBase, RealValueTypesBase>,
+        cartesian_type_product_t<ComplexValueTypesBase, ComplexValueTypesBase>>,
+    IndexTypes>;
 
-using TwoValueIndexType = add_to_cartesian_type_product_t<
+using TwoValueIndexTypes = add_to_cartesian_type_product_t<
     merge_type_list_t<
         cartesian_type_product_t<RealValueTypes, RealValueTypes>,
         cartesian_type_product_t<ComplexValueTypes, ComplexValueTypes>>,
     IndexTypes>;
 
-using TwoValueIndexTypeWithHalf = add_to_cartesian_type_product_t<
-    merge_type_list_t<cartesian_type_product_t<RealValueTypesWithHalf,
-                                               RealValueTypesWithHalf>,
-                      cartesian_type_product_t<ComplexValueTypesWithHalf,
-                                               ComplexValueTypesWithHalf>>,
-    IndexTypes>;
-
-using ValueLocalGlobalIndexTypes =
-    add_to_cartesian_type_product_left_t<ValueTypes, LocalGlobalIndexTypes>;
+using ValueLocalGlobalIndexTypesBase =
+    add_to_cartesian_type_product_left_t<ValueTypesBase, LocalGlobalIndexTypes>;
 
 
 template <typename Precision, typename OutputType>
diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp
index ca96761ea4e..3933dd7db7a 100644
--- a/core/test/utils/array_generator_test.cpp
+++ b/core/test/utils/array_generator_test.cpp
@@ -65,8 +65,7 @@ class ArrayGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(ArrayGenerator, OutputHasCorrectSize)
diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp
index 61710540e24..c41a20b0efb 100644
--- a/core/test/utils/matrix_generator_test.cpp
+++ b/core/test/utils/matrix_generator_test.cpp
@@ -131,8 +131,7 @@ class MatrixGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MatrixGenerator, OutputHasCorrectSize)
diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp
index f742d4561a2..3f692edeed6 100644
--- a/core/test/utils/matrix_utils_test.cpp
+++ b/core/test/utils/matrix_utils_test.cpp
@@ -41,8 +41,7 @@ class MatrixUtils : public ::testing::Test {
     mtx_data rectangular_data;
 };
 
-TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MatrixUtils, MakeSymmetricThrowsError)
diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp
index 40ec65b08db..5d2f88f982a 100644
--- a/core/test/utils/unsort_matrix_test.cpp
+++ b/core/test/utils/unsort_matrix_test.cpp
@@ -119,7 +119,7 @@ class UnsortMatrix : public ::testing::Test {
     std::unique_ptr<Coo> coo_empty;
 };
 
-TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp
index 57473c41b6e..829c9b54be2 100644
--- a/core/test/utils/value_generator_test.cpp
+++ b/core/test/utils/value_generator_test.cpp
@@ -59,8 +59,7 @@ class ValueGenerator : public ::testing::Test {
     }
 };
 
-TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(ValueGenerator, OutputHasCorrectAverageAndDeviation)
diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp
index 7294cbff29f..db7d4c54536 100644
--- a/cuda/test/base/array.cpp
+++ b/cuda/test/base/array.cpp
@@ -32,8 +32,7 @@ class Array : public CudaTestFixture {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor)
diff --git a/extensions/test/kokkos/types.cpp b/extensions/test/kokkos/types.cpp
index bb3252b149c..ec59e3fb04c 100644
--- a/extensions/test/kokkos/types.cpp
+++ b/extensions/test/kokkos/types.cpp
@@ -29,7 +29,7 @@ class ArrayMapper : public ::testing::Test {
     gko::array<value_type> array = {exec, I<value_type>{1, 2, 3, 4}};
 };
 
-TYPED_TEST_SUITE(ArrayMapper, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(ArrayMapper, gko::test::ValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(ArrayMapper, CanMapDefault)
@@ -89,7 +89,7 @@ class DenseMapper : public ::testing::Test {
         gko::initialize<mtx_type>({1, 2, 3, 4}, exec);
 };
 
-TYPED_TEST_SUITE(DenseMapper, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(DenseMapper, gko::test::ValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(DenseMapper, CanMapDefault)
diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp
index 536ff3dc01c..0bed7e7c13e 100644
--- a/hip/test/matrix/fbcsr_kernels.cpp
+++ b/hip/test/matrix/fbcsr_kernels.cpp
@@ -61,8 +61,7 @@ class Fbcsr : public HipTestFixture {
     }
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice)
diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp
index 2c69f1afc8e..666ab13063c 100644
--- a/reference/test/base/array.cpp
+++ b/reference/test/base/array.cpp
@@ -28,8 +28,7 @@ class Array : public ::testing::Test {
     gko::array<T> x;
 };
 
-TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Array, CanBeFilledWithValue)
diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
index 7ef9b77601d..694ae491ef4 100644
--- a/reference/test/base/batch_multi_vector_kernels.cpp
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -96,8 +96,7 @@ class MultiVector : public ::testing::Test {
     std::default_random_engine rand_engine;
 };
 
-TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MultiVector, ScalesData)
diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp
index cfb5abcdba0..aea578f4e7e 100644
--- a/reference/test/base/combination.cpp
+++ b/reference/test/base/combination.cpp
@@ -34,8 +34,7 @@ class Combination : public ::testing::Test {
     std::vector<std::shared_ptr<gko::LinOp>> operators;
 };
 
-TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Combination, CopiesOnSameExecutor)
diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp
index 9767fab466c..f736edb53f9 100644
--- a/reference/test/base/composition.cpp
+++ b/reference/test/base/composition.cpp
@@ -75,8 +75,7 @@ class Composition : public ::testing::Test {
     std::shared_ptr<Mtx> product;
 };
 
-TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Composition, CopiesOnSameExecutor)
diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp
index 1563f22e175..b6be9ab1563 100644
--- a/reference/test/base/perturbation.cpp
+++ b/reference/test/base/perturbation.cpp
@@ -33,8 +33,7 @@ class Perturbation : public ::testing::Test {
     std::shared_ptr<gko::LinOp> scalar;
 };
 
-TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Perturbation, CopiesOnSameExecutor)
diff --git a/reference/test/components/absolute_array_kernels.cpp b/reference/test/components/absolute_array_kernels.cpp
index 5ad75440c88..c192d540032 100644
--- a/reference/test/components/absolute_array_kernels.cpp
+++ b/reference/test/components/absolute_array_kernels.cpp
@@ -43,8 +43,7 @@ class AbsoluteArray : public ::testing::Test {
     gko::array<value_type> vals;
 };
 
-TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(AbsoluteArray, InplaceEqualsExpected)
diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp
index 0a9239ce1bd..3c7520c6847 100644
--- a/reference/test/components/fill_array_kernels.cpp
+++ b/reference/test/components/fill_array_kernels.cpp
@@ -40,7 +40,7 @@ class FillArray : public ::testing::Test {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp
index c8839bc178d..8286817c853 100644
--- a/reference/test/components/reduce_array_kernels.cpp
+++ b/reference/test/components/reduce_array_kernels.cpp
@@ -31,7 +31,7 @@ class ReduceArray : public ::testing::Test {
     gko::array<value_type> vals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/distributed/assembly_kernels.cpp b/reference/test/distributed/assembly_kernels.cpp
index 89662b4efef..4823f465a31 100644
--- a/reference/test/distributed/assembly_kernels.cpp
+++ b/reference/test/distributed/assembly_kernels.cpp
@@ -48,7 +48,7 @@ class AssemblyHelpers : public ::testing::Test {
     gko::array<comm_index_type> mapping;
 };
 
-TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index a34844cbde9..b5e2e3f5fb9 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -182,7 +182,7 @@ class Matrix : public ::testing::Test {
     gko::array<value_type> non_local_values;
 };
 
-TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/reference/test/distributed/vector_kernels.cpp b/reference/test/distributed/vector_kernels.cpp
index 7de3104b7fb..43f11967488 100644
--- a/reference/test/distributed/vector_kernels.cpp
+++ b/reference/test/distributed/vector_kernels.cpp
@@ -69,7 +69,7 @@ class Vector : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> ref;
 };
 
-TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index 671630c99eb..b4c33d76ab9 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -245,7 +245,7 @@ class Cholesky : public ::testing::Test {
     std::shared_ptr<matrix_type> combined_ref;
 };
 
-TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index 73bf8cdc321..2ded81d4867 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -70,7 +70,7 @@ class Factorization : public ::testing::Test {
     std::shared_ptr<vector_type> beta;
 };
 
-TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp
index 84faa3c3b45..1593da136a4 100644
--- a/reference/test/factorization/ic_kernels.cpp
+++ b/reference/test/factorization/ic_kernels.cpp
@@ -80,8 +80,7 @@ class Ic : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ic, ThrowNotSupportedForWrongLinOp)
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
index 1ba1fedf13f..aaeb44382f1 100644
--- a/reference/test/factorization/ilu_kernels.cpp
+++ b/reference/test/factorization/ilu_kernels.cpp
@@ -170,8 +170,7 @@ class Ilu : public ::testing::Test {
     std::unique_ptr<typename ilu_type::Factory> ilu_factory_sort;
 };
 
-TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ilu, ThrowNotSupportedForWrongLinOp1)
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 7b4a860b0d5..1ea77665f69 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -98,8 +98,7 @@ class Lu : public ::testing::Test {
     gko::array<gko::int64> row_descs;
 };
 
-TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Lu, SymbolicCholeskyWorks)
diff --git a/reference/test/factorization/par_ic_kernels.cpp b/reference/test/factorization/par_ic_kernels.cpp
index 481e89bb744..b9caf8c9e5e 100644
--- a/reference/test/factorization/par_ic_kernels.cpp
+++ b/reference/test/factorization/par_ic_kernels.cpp
@@ -104,8 +104,7 @@ class ParIc : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, KernelCompute)
diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp
index d3b6df59f42..55ac5771732 100644
--- a/reference/test/factorization/par_ict_kernels.cpp
+++ b/reference/test/factorization/par_ict_kernels.cpp
@@ -137,8 +137,7 @@ class ParIct : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, KernelInitializeRowPtrsL)
diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp
index 3d590c1a6d6..bf4e422f640 100644
--- a/reference/test/factorization/par_ilu_kernels.cpp
+++ b/reference/test/factorization/par_ilu_kernels.cpp
@@ -180,8 +180,7 @@ class ParIlu : public ::testing::Test {
     std::unique_ptr<typename par_ilu_type::Factory> ilu_factory_sort;
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, KernelAddDiagonalElementsEmpty)
diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp
index 3a6ba9232da..a605ed678ae 100644
--- a/reference/test/factorization/par_ilut_kernels.cpp
+++ b/reference/test/factorization/par_ilut_kernels.cpp
@@ -277,7 +277,7 @@ class ParIlut : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };  // namespace
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp
index 70fc004c030..50db0db49c4 100644
--- a/reference/test/log/convergence.cpp
+++ b/reference/test/log/convergence.cpp
@@ -19,8 +19,7 @@ namespace {
 template <typename T>
 class Convergence : public ::testing::Test {};
 
-TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Convergence, CatchesCriterionCheckCompleted)
diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp
index 647a14af9b2..4f1d9e469f1 100644
--- a/reference/test/log/papi.cpp
+++ b/reference/test/log/papi.cpp
@@ -83,7 +83,7 @@ class Papi : public ::testing::Test {
     int eventset;
 };
 
-TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Papi, CatchesCriterionCheckCompleted)
diff --git a/reference/test/matrix/batch_csr_kernels.cpp b/reference/test/matrix/batch_csr_kernels.cpp
index 85e461b933e..920bb67696b 100644
--- a/reference/test/matrix/batch_csr_kernels.cpp
+++ b/reference/test/matrix/batch_csr_kernels.cpp
@@ -78,7 +78,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Csr, AppliesToBatchMultiVector)
diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp
index 23f747c24cb..50c1909959f 100644
--- a/reference/test/matrix/batch_dense_kernels.cpp
+++ b/reference/test/matrix/batch_dense_kernels.cpp
@@ -77,7 +77,7 @@ class Dense : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, AppliesToBatchMultiVector)
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
index 5e2b377eda0..a2c9ef4e83c 100644
--- a/reference/test/matrix/batch_ell_kernels.cpp
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -79,7 +79,7 @@ class Ell : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Ell, AppliesToBatchMultiVector)
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index b3f58ec3083..f7063317a73 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -72,8 +72,7 @@ class Coo : public ::testing::Test {
     std::unique_ptr<Mtx> uns_mtx;
 };
 
-TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Coo, ConvertsToPrecision)
@@ -911,7 +910,7 @@ class CooComplex : public ::testing::Test {
     using Mtx = gko::matrix::Coo<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 61018563769..b84ac958f02 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -347,8 +347,7 @@ class Csr : public ::testing::Test {
     index_type invalid_index = gko::invalid_index<index_type>();
 };
 
-TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Csr, AppliesToDenseVector)
@@ -2246,7 +2245,7 @@ class CsrComplex : public ::testing::Test {
     using Mtx = gko::matrix::Csr<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -2591,7 +2590,7 @@ class CsrLookup : public ::testing::Test {
     index_type invalid_index = gko::invalid_index<index_type>();
 };
 
-TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets)
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 9ab59b0b4b8..9fe0e91a670 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -80,7 +80,7 @@ class Dense : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Dense, CopyRespectsStride)
@@ -1359,7 +1359,7 @@ class DenseWithIndexType
     std::unique_ptr<ScaledPermutation> scale_perm0;
 };
 
-TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
@@ -3552,7 +3552,7 @@ class DenseComplex : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypesWithHalf,
+TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp
index b25c3593595..4517ba43c86 100644
--- a/reference/test/matrix/diagonal_kernels.cpp
+++ b/reference/test/matrix/diagonal_kernels.cpp
@@ -79,8 +79,7 @@ class Diagonal : public ::testing::Test {
     std::unique_ptr<Dense> dense3;
 };
 
-TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Diagonal, ConvertsToPrecision)
@@ -674,7 +673,7 @@ class DiagonalComplex : public ::testing::Test {
     using Diag = gko::matrix::Diagonal<value_type>;
 };
 
-TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypesWithHalf,
+TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 4f96742d3f7..6214db82d1c 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -72,8 +72,7 @@ class Ell : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
-TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ell, AppliesToDenseVector)
@@ -993,7 +992,7 @@ class EllComplex : public ::testing::Test {
     using Mtx = gko::matrix::Ell<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(EllComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(EllComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp
index decae21b986..043195e7b50 100644
--- a/reference/test/matrix/fbcsr_kernels.cpp
+++ b/reference/test/matrix/fbcsr_kernels.cpp
@@ -104,8 +104,7 @@ class Fbcsr : public ::testing::Test {
     const std::unique_ptr<const Mtx> mtxsq;
 };
 
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 template <typename T>
@@ -620,7 +619,7 @@ class FbcsrComplex : public ::testing::Test {
     using Csr = gko::matrix::Csr<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp
index 12c2521b71c..a1f82ea9311 100644
--- a/reference/test/matrix/fft_kernels.cpp
+++ b/reference/test/matrix/fft_kernels.cpp
@@ -148,7 +148,7 @@ class Fft : public ::testing::Test {
     std::unique_ptr<Vec> dense_ifft3;
 };
 
-TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(Fft, ThrowsOnNonPowerOfTwo1D)
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index bdc2724e1de..87fd4c02811 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -96,8 +96,7 @@ class Hybrid : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3;
 };
 
-TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Hybrid, AppliesToDenseVector)
@@ -796,7 +795,7 @@ class HybridComplex : public ::testing::Test {
     using Mtx = gko::matrix::Hybrid<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp
index dc0f799ba70..11953de338a 100644
--- a/reference/test/matrix/identity.cpp
+++ b/reference/test/matrix/identity.cpp
@@ -29,8 +29,7 @@ class Identity : public ::testing::Test {
 };
 
 
-TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Identity, AppliesToVector)
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
index b646a6fc67f..5418f97353b 100644
--- a/reference/test/matrix/permutation.cpp
+++ b/reference/test/matrix/permutation.cpp
@@ -51,7 +51,7 @@ class Permutation : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec;
 };
 
-TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp
index f2b3e66b4cd..6d8d49f5662 100644
--- a/reference/test/matrix/scaled_permutation.cpp
+++ b/reference/test/matrix/scaled_permutation.cpp
@@ -58,7 +58,7 @@ class ScaledPermutation : public ::testing::Test {
     std::unique_ptr<Mtx> perm2;
 };
 
-TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 5e056997163..3208b8c42be 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -50,8 +50,7 @@ class Sellp : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
 };
 
-TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sellp, AppliesToDenseVector)
@@ -752,7 +751,7 @@ class SellpComplex : public ::testing::Test {
     using Mtx = gko::matrix::Sellp<value_type, index_type>;
 };
 
-TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp
index 8db0dee144f..d8ed6147e30 100644
--- a/reference/test/matrix/sparsity_csr.cpp
+++ b/reference/test/matrix/sparsity_csr.cpp
@@ -47,7 +47,7 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx;
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index 03adbece035..f08d6c352ca 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -125,7 +125,7 @@ class SparsityCsr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx3_unsorted;
 };
 
-TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp
index 001e23d6124..582950b4e17 100644
--- a/reference/test/multigrid/fixed_coarsening_kernels.cpp
+++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp
@@ -143,7 +143,7 @@ class FixedCoarsening : public ::testing::Test {
     std::unique_ptr<MgLevel> mg_level;
 };
 
-TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/multigrid/pgm_kernels.cpp b/reference/test/multigrid/pgm_kernels.cpp
index e715b2175d3..2fc754f23b3 100644
--- a/reference/test/multigrid/pgm_kernels.cpp
+++ b/reference/test/multigrid/pgm_kernels.cpp
@@ -187,8 +187,7 @@ class Pgm : public ::testing::Test {
     std::unique_ptr<MgLevel> mg_level;
 };
 
-TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Pgm, CanBeCopied)
diff --git a/reference/test/preconditioner/gauss_seidel.cpp b/reference/test/preconditioner/gauss_seidel.cpp
index 53db7f0781e..2b67b665d77 100644
--- a/reference/test/preconditioner/gauss_seidel.cpp
+++ b/reference/test/preconditioner/gauss_seidel.cpp
@@ -47,7 +47,7 @@ class GaussSeidel : public ::testing::Test {
     std::shared_ptr<csr_type> mtx = csr_type::create(exec);
 };
 
-TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp
index 3084f60ca1c..16ffc8d7b3c 100644
--- a/reference/test/preconditioner/ic.cpp
+++ b/reference/test/preconditioner/ic.cpp
@@ -67,8 +67,7 @@ class Ic : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Ic, BuildsTwoFactorComposition)
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index 1c7deddf56c..180b92be9ec 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -84,7 +84,7 @@ class Ilu : public ::testing::Test {
     std::shared_ptr<typename ilu_rev_prec_type::Factory> ilu_rev_pre_factory;
 };
 
-TYPED_TEST_SUITE(Ilu, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ilu, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Ilu, BuildsDefaultWithoutThrowing)
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
index 0164f5d8e58..a92a8c1fc5d 100644
--- a/reference/test/preconditioner/isai_kernels.cpp
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -332,15 +332,15 @@ using HalfIndexTypes = gko::test::cartesian_type_product_t<
     gko::test::IndexTypes>;
 TYPED_TEST_SUITE(Isai, HalfIndexTypes, PairTypenameNameGenerator);
 #else
-TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypesBase,
+                 PairTypenameNameGenerator);
 #endif
 
 
 #else
 
 
-TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 #endif
diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp
index 2697efc3cd5..79c276579ad 100644
--- a/reference/test/preconditioner/jacobi.cpp
+++ b/reference/test/preconditioner/jacobi.cpp
@@ -144,8 +144,7 @@ class Jacobi : public ::testing::Test {
     std::unique_ptr<Bj> adaptive_bj;
 };
 
-TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Jacobi, GeneratesCorrectStorageScheme)
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index d5f9346eab6..cade7b3822a 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -86,8 +86,7 @@ class Jacobi : public ::testing::Test {
     std::shared_ptr<gko::matrix::Csr<value_type, index_type>> mtx;
 };
 
-TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Jacobi, CanBeGenerated)
diff --git a/reference/test/preconditioner/sor_kernels.cpp b/reference/test/preconditioner/sor_kernels.cpp
index cd2fa9af364..18c055aa6d9 100644
--- a/reference/test/preconditioner/sor_kernels.cpp
+++ b/reference/test/preconditioner/sor_kernels.cpp
@@ -55,8 +55,7 @@ class Sor : public ::testing::Test {
                                   exec);
 };
 
-TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Sor, CanInitializeLFactor)
diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp
index 0670c77f6e2..f32e02aac07 100644
--- a/reference/test/reorder/mc64.cpp
+++ b/reference/test/reorder/mc64.cpp
@@ -71,8 +71,7 @@ class Mc64 : public ::testing::Test {
     std::unique_ptr<reorder_type> mc64_factory;
 };
 
-TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Mc64, HasSensibleDefaults)
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
index f31bf7ba658..808b74df2d8 100644
--- a/reference/test/reorder/mc64_kernels.cpp
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -181,8 +181,7 @@ class Mc64 : public ::testing::Test {
     const real_type zero_tol;
 };
 
-TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Mc64, InitializeWeightsSum)
diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp
index ae63ca504bb..ec547c141e3 100644
--- a/reference/test/reorder/rcm.cpp
+++ b/reference/test/reorder/rcm.cpp
@@ -54,8 +54,7 @@ class Rcm : public ::testing::Test {
     std::unique_ptr<reorder_type> reorder_op;
 };
 
-TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Rcm, CanBeCleared)
diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp
index 8fb3c885cf3..4320ec35873 100644
--- a/reference/test/reorder/scaled_reordered.cpp
+++ b/reference/test/reorder/scaled_reordered.cpp
@@ -132,7 +132,7 @@ class ScaledReordered : public ::testing::Test {
     gko::remove_complex<value_type> tol;
 };
 
-TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index 468b38a561b..16d2f7a6653 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -75,7 +75,7 @@ class BatchBicgstab : public ::testing::Test {
         solve_lambda;
 };
 
-TYPED_TEST_SUITE(BatchBicgstab, gko::test::RealValueTypesWithHalf,
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::RealValueTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
index 2619614278e..86f8c1e84b0 100644
--- a/reference/test/solver/batch_cg_kernels.cpp
+++ b/reference/test/solver/batch_cg_kernels.cpp
@@ -75,8 +75,7 @@ class BatchCg : public ::testing::Test {
         solve_lambda;
 };
 
-TYPED_TEST_SUITE(BatchCg, gko::test::RealValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(BatchCg, gko::test::RealValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(BatchCg, SolvesStencilSystem)
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index a8f397aed1e..22b37683e0d 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -119,7 +119,7 @@ class Bicg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> bicg_factory_non_symmetric;
 };
 
-TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicg, KernelInitialize)
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index 5bbff5b1a5e..7f508ae750a 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -121,8 +121,7 @@ class Bicgstab : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> bicgstab_factory_precision;
 };
 
-TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Bicgstab, KernelInitialize)
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index cee760f7840..2c6edc4abb1 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -137,9 +137,9 @@ using st_ir2 = st_helper_type<st_enum::ireduce2>;
 
 using TestTypes = gko::test::merge_type_list_t<
     gko::test::cartesian_type_product_t<
-        gko::test::ValueTypes, ::testing::Types<st_keep, st_r1, st_r2>>,
+        gko::test::ValueTypesBase, ::testing::Types<st_keep, st_r1, st_r2>>,
     gko::test::cartesian_type_product_t<
-        gko::test::RealValueTypes, ::testing::Types<st_i, st_ir1, st_ir2>>>;
+        gko::test::RealValueTypesBase, ::testing::Types<st_i, st_ir1, st_ir2>>>;
 
 TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator);
 
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index 8cbc9a09c18..c4987bb5b17 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -107,7 +107,7 @@ class Cg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> cg_factory_big2;
 };
 
-TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Cg, KernelInitialize)
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index c51a3252540..a631c0c4944 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -121,7 +121,7 @@ class Cgs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> cgs_factory_big2;
 };
 
-TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Cgs, KernelInitialize)
diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp
index e421811382f..d06948e631e 100644
--- a/reference/test/solver/direct.cpp
+++ b/reference/test/solver/direct.cpp
@@ -66,8 +66,7 @@ class Direct : public ::testing::Test {
     std::unique_ptr<solver_type> solver;
 };
 
-TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Direct, SolvesAni1SingleRhs)
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index 4b8484a78c8..dca202f9fc8 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -112,7 +112,7 @@ class Fcg : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> fcg_factory_big2;
 };
 
-TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Fcg, KernelInitialize)
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index be747a2f84b..701a6590ec1 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -119,7 +119,7 @@ class Gcr : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> gcr_factory_big2;
 };
 
-TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Gcr, KernelInitialize)
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 1719dfe1062..93d2d79e624 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -136,7 +136,7 @@ class Gmres : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> gmres_factory_big2;
 };
 
-TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Gmres, KernelInitialize)
diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp
index a5aee44029c..7f171530086 100644
--- a/reference/test/solver/idr_kernels.cpp
+++ b/reference/test/solver/idr_kernels.cpp
@@ -60,7 +60,7 @@ class Idr : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> idr_factory_precision;
 };
 
-TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Idr, SolvesDenseSystem)
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index fc14711bec2..b0c1029f693 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -47,7 +47,7 @@ class Ir : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> ir_factory;
 };
 
-TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Ir, KernelInitialize)
diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp
index fd6fe1e4b16..d52ee028b53 100644
--- a/reference/test/solver/lower_trs.cpp
+++ b/reference/test/solver/lower_trs.cpp
@@ -45,7 +45,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<Solver> solver;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp
index dcb7d6c6f0c..3680f19681f 100644
--- a/reference/test/solver/lower_trs_kernels.cpp
+++ b/reference/test/solver/lower_trs_kernels.cpp
@@ -75,7 +75,7 @@ class LowerTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> lower_trs_factory_unit;
 };
 
-TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index 31c27a4551a..9f18c264826 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -415,7 +415,7 @@ class Multigrid : public ::testing::Test {
     std::shared_ptr<Mtx> x2;
 };
 
-TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp
index b59744a0e8c..9980c51f9d1 100644
--- a/reference/test/solver/upper_trs.cpp
+++ b/reference/test/solver/upper_trs.cpp
@@ -45,7 +45,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<Solver> upper_trs_solver;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp
index 15f0f3c2996..a60f3b46079 100644
--- a/reference/test/solver/upper_trs_kernels.cpp
+++ b/reference/test/solver/upper_trs_kernels.cpp
@@ -75,7 +75,7 @@ class UpperTrs : public ::testing::Test {
     std::unique_ptr<typename Solver::Factory> upper_trs_factory_unit;
 };
 
-TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp
index aed801afacf..e7eef0565d2 100644
--- a/reference/test/stop/residual_norm_kernels.cpp
+++ b/reference/test/stop/residual_norm_kernels.cpp
@@ -45,8 +45,7 @@ class ResidualNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(ResidualNorm, CanCreateFactory)
@@ -530,7 +529,7 @@ class ResidualNormWithInitialResnorm : public ::testing::Test {
     std::shared_ptr<const gko::ReferenceExecutor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -671,7 +670,7 @@ class ResidualNormWithRhsNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -808,7 +807,7 @@ class ImplicitResidualNorm : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -984,7 +983,7 @@ class ResidualNormWithAbsolute : public ::testing::Test {
     std::shared_ptr<const gko::Executor> exec_;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp
index 9c6b544172e..98f1ec68e0d 100644
--- a/reference/test/utils/assertions_test.cpp
+++ b/reference/test/utils/assertions_test.cpp
@@ -17,8 +17,7 @@ namespace {
 template <typename T>
 class MatricesNear : public ::testing::Test {};
 
-TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(MatricesNear, CanPassAnyMatrixType)
diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp
index d2543ae7cbb..189badf83fa 100644
--- a/test/base/device_matrix_data_kernels.cpp
+++ b/test/base/device_matrix_data_kernels.cpp
@@ -84,7 +84,7 @@ class DeviceMatrixData : public CommonTestFixture {
     gko::matrix_data<value_type, index_type> deduplicated_data;
 };
 
-TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 4237a75304a..3d494b3f5f0 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -36,7 +36,7 @@ class FillArray : public CommonTestFixture {
     gko::array<value_type> seqs;
 };
 
-TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf,
+TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index 7940feec661..00a634377ec 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -43,7 +43,7 @@ class ReduceArray : public CommonTestFixture {
     gko::array<value_type> dvals;
 };
 
-TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf,
+TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes,
                  TypenameNameGenerator);
 
 
diff --git a/test/distributed/assembly_kernels.cpp b/test/distributed/assembly_kernels.cpp
index d1e2f708ca6..4ab4c9173ac 100644
--- a/test/distributed/assembly_kernels.cpp
+++ b/test/distributed/assembly_kernels.cpp
@@ -100,7 +100,7 @@ class AssemblyHelpers : public CommonTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index ad91d699496..9a5d4f2cf7b 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -87,7 +87,7 @@ class Matrix : public CommonTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp
index 1246da9a116..a212699a2ca 100644
--- a/test/distributed/vector_kernels.cpp
+++ b/test/distributed/vector_kernels.cpp
@@ -67,7 +67,7 @@ class Vector : public CommonTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index 007f3cbf6fd..1b2d187c785 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -111,15 +111,15 @@ class CholeskySymbolic : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_OMP
-using Types = gko::test::ValueIndexTypes;
+using Types = gko::test::ValueIndexTypesBase;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA doesn't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypesWithHalf,
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
                                                   ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypesBase,
                                                   ::testing::Types<gko::int32>>;
 #endif
 
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 59f3cb30327..4fe974200dc 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -129,15 +129,15 @@ class Lu : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_OMP
-using Types = gko::test::ValueIndexTypesWithHalf;
+using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypesWithHalf,
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
                                                   ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypesBase,
                                                   ::testing::Types<gko::int32>>;
 #endif
 
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index 10eccb83f10..36a20dda48f 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -86,8 +86,7 @@ class ParIc : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_l_ani_init;
 };
 
-TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef)
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 8d6579d584e..7362929a450 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -93,8 +93,7 @@ class ParIct : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_l;
 };
 
-TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 8c3ab20a674..116790c2b36 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -144,8 +144,7 @@ class ParIlu : public CommonTestFixture {
     }
 };
 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef)
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index b1af2b4c748..c7a8e254567 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -233,7 +233,7 @@ class ParIlut : public CommonTestFixture {
     std::unique_ptr<Csr> dmtx_u;
 };
 
-TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf,
+TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp
index 5e3d4b1a112..72dcc67e005 100644
--- a/test/matrix/fbcsr_kernels.cpp
+++ b/test/matrix/fbcsr_kernels.cpp
@@ -61,10 +61,9 @@ class Fbcsr : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_HIP
-TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator);
 #else
-TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypesWithHalf, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypes, TypenameNameGenerator);
 #endif
 
 TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice)
diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp
index 5b2c33085e3..97f043a1fb4 100644
--- a/test/matrix/fft_kernels.cpp
+++ b/test/matrix/fft_kernels.cpp
@@ -91,7 +91,7 @@ class Fft : public CommonTestFixture {
 };
 
 
-TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(Fft, Apply1DIsEqualToReference)
diff --git a/test/mpi/assembly.cpp b/test/mpi/assembly.cpp
index 9db0eab553a..3ad47565d44 100644
--- a/test/mpi/assembly.cpp
+++ b/test/mpi/assembly.cpp
@@ -83,7 +83,7 @@ class AssemblyHelpers : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 85e0a43d146..e761aab159e 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -101,7 +101,7 @@ class MatrixCreation : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
@@ -467,7 +467,7 @@ class Matrix : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(Matrix, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Matrix, gko::test::ValueTypesBase, TypenameNameGenerator);
 
 
 TYPED_TEST(Matrix, CanApplyToSingleVector)
diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp
index 664ad0cd4ec..df198f235c3 100644
--- a/test/mpi/multigrid/pgm.cpp
+++ b/test/mpi/multigrid/pgm.cpp
@@ -80,7 +80,7 @@ class Pgm : public CommonMpiTestFixture {
     std::shared_ptr<dist_mtx_type> dist_mat;
 };
 
-TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp
index 6717cd9d888..113f8922aae 100644
--- a/test/mpi/preconditioner/schwarz.cpp
+++ b/test/mpi/preconditioner/schwarz.cpp
@@ -143,7 +143,8 @@ class SchwarzPreconditioner : public CommonMpiTestFixture {
     }
 };
 
-TYPED_TEST_SUITE(SchwarzPreconditioner, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(SchwarzPreconditioner,
+                 gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState)
diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp
index 53990650ed7..752342a8e64 100644
--- a/test/mpi/vector.cpp
+++ b/test/mpi/vector.cpp
@@ -95,7 +95,7 @@ class VectorCreation : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes,
+TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypesBase,
                  TupleTypenameNameGenerator);
 
 
@@ -361,7 +361,7 @@ class VectorCreationHelpers : public CommonMpiTestFixture {
     std::unique_ptr<vec_type> dst;
 };
 
-TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypes,
+TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypesBase,
                  TypenameNameGenerator);
 
 
@@ -513,7 +513,7 @@ class VectorReductions : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes,
+TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesBase,
                  TypenameNameGenerator);
 
 
@@ -801,7 +801,8 @@ class VectorLocalOps : public CommonMpiTestFixture {
     std::default_random_engine engine;
 };
 
-TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypesBase,
+                 TypenameNameGenerator);
 
 
 TYPED_TEST(VectorLocalOps, ApplyNotSupported)
diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp
index f5a17e943e1..a1ca7c09359 100644
--- a/test/reorder/amd.cpp
+++ b/test/reorder/amd.cpp
@@ -40,8 +40,7 @@ class Amd : public CommonTestFixture {
     std::shared_ptr<matrix_type> dmtx;
 };
 
-TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf,
-                 PairTypenameNameGenerator);
+TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
 
 
 TYPED_TEST(Amd, IsEquivalentToRef)
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index 1ee95806c37..c00fa772c14 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -100,15 +100,15 @@ class Direct : public CommonTestFixture {
 };
 
 #ifdef GKO_COMPILING_OMP
-using Types = gko::test::ValueIndexTypesWithHalf;
+using Types = gko::test::ValueIndexTypes;
 #elif defined(GKO_COMPILING_CUDA)
 // CUDA don't support long indices for sorting, and the triangular solvers
 // seem broken
-using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::ValueTypesBase,
                                                   ::testing::Types<gko::int32>>;
 #else
 // HIP only supports real types and int32
-using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypes,
+using Types = gko::test::cartesian_type_product_t<gko::test::RealValueTypesBase,
                                                   ::testing::Types<gko::int32>>;
 #endif
 
diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp
index 62f656bed59..7be3e7fde48 100644
--- a/test/stop/residual_norm_kernels.cpp
+++ b/test/stop/residual_norm_kernels.cpp
@@ -58,8 +58,7 @@ class ResidualNorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> abs_factory;
 };
 
-TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypesWithHalf,
-                 TypenameNameGenerator);
+TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(ResidualNorm, CanIgorneResidualNorm)
@@ -345,7 +344,7 @@ class ResidualNormWithInitialResnorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -442,7 +441,7 @@ class ResidualNormWithRhsNorm : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -547,7 +546,7 @@ class ImplicitResidualNorm : public CommonTestFixture {
         factory;
 };
 
-TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
@@ -693,7 +692,7 @@ class ResidualNormWithAbsolute : public CommonTestFixture {
     std::unique_ptr<typename gko::stop::ResidualNorm<T>::Factory> factory;
 };
 
-TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf,
+TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 

From 28d305ad780a17aa3deec6a6878404602f8ee78e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Dec 2024 12:35:13 +0100
Subject: [PATCH 427/448] rename type list

---
 core/config/dispatch.hpp                   |  6 +++---
 core/config/parse_macro.hpp                | 21 +++++++++++----------
 core/config/preconditioner_ic_config.cpp   | 12 ++++--------
 core/config/preconditioner_ilu_config.cpp  |  8 ++++----
 core/config/preconditioner_isai_config.cpp |  8 ++++----
 core/config/schwarz_config.cpp             |  6 ++++--
 core/config/stop_config.cpp                |  6 ++----
 7 files changed, 32 insertions(+), 35 deletions(-)

diff --git a/core/config/dispatch.hpp b/core/config/dispatch.hpp
index 1c6d0eb12cd..6a604096322 100644
--- a/core/config/dispatch.hpp
+++ b/core/config/dispatch.hpp
@@ -102,15 +102,15 @@ deferred_factory_parameter<ReturnType> dispatch(
     }
 }
 
-using value_type_list =
+using value_type_list_base =
     syn::type_list<double, float, std::complex<double>, std::complex<float>>;
 
 #if GINKGO_ENABLE_HALF
-using value_type_list_with_half =
+using value_type_list =
     syn::type_list<double, float, half, std::complex<double>,
                    std::complex<float>, std::complex<half>>;
 #else
-using value_type_list_with_half = value_type_list;
+using value_type_list = value_type_list_base;
 #endif  // GINKGO_ENABLE_HALF
 
 using index_type_list = syn::type_list<int32, int64>;
diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp
index f40aa8cb9dd..273bc8b1a9f 100644
--- a/core/config/parse_macro.hpp
+++ b/core/config/parse_macro.hpp
@@ -16,7 +16,8 @@
 
 
 // for value_type only
-#define GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator, _value_type_list)   \
+#define GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator,                     \
+                                   _value_type_list_base)                    \
     template <>                                                              \
     deferred_factory_parameter<gko::LinOpFactory>                            \
     parse<gko::config::LinOpFactoryType::_type>(                             \
@@ -28,22 +29,22 @@
         return gko::config::dispatch<gko::LinOpFactory, _configurator>(      \
             config, context, updated,                                        \
             gko::config::make_type_selector(updated.get_value_typestr(),     \
-                                            _value_type_list));              \
+                                            _value_type_list_base));         \
     }                                                                        \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
 #define GKO_PARSE_VALUE_TYPE_BASE(_type, _configurator) \
     GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator,    \
-                               gko::config::value_type_list())
+                               gko::config::value_type_list_base())
 
 #define GKO_PARSE_VALUE_TYPE(_type, _configurator)   \
     GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator, \
-                               gko::config::value_type_list_with_half())
+                               gko::config::value_type_list())
 
 // for value_type and index_type
 #define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,            \
-                                             _value_type_list)                \
+                                             _value_type_list_base)           \
     template <>                                                               \
     deferred_factory_parameter<gko::LinOpFactory>                             \
     parse<gko::config::LinOpFactoryType::_type>(                              \
@@ -55,7 +56,7 @@
         return gko::config::dispatch<gko::LinOpFactory, _configurator>(       \
             config, context, updated,                                         \
             gko::config::make_type_selector(updated.get_value_typestr(),      \
-                                            _value_type_list),                \
+                                            _value_type_list_base),           \
             gko::config::make_type_selector(updated.get_index_typestr(),      \
                                             gko::config::index_type_list())); \
     }                                                                         \
@@ -65,11 +66,11 @@
 
 #define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE(_type, _configurator) \
     GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,    \
-                                         gko::config::value_type_list())
+                                         gko::config::value_type_list_base())
 
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \
-    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(                    \
-        _type, _configurator, gko::config::value_type_list_with_half())
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator)   \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator, \
+                                         gko::config::value_type_list())
 
 
 #endif  // GKO_CORE_CONFIG_PARSE_MACRO_HPP_
diff --git a/core/config/preconditioner_ic_config.cpp b/core/config/preconditioner_ic_config.cpp
index e029b228479..7d4cd3d9ca4 100644
--- a/core/config/preconditioner_ic_config.cpp
+++ b/core/config/preconditioner_ic_config.cpp
@@ -78,28 +78,24 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ic>(
         return dispatch<gko::LinOpFactory,
                         IcHelper2<solver::LowerTrs>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "solver::Ir") {
         return dispatch<gko::LinOpFactory, IcHelper1<solver::Ir>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "preconditioner::LowerIsai") {
         return dispatch<gko::LinOpFactory,
                         IcHelper2<preconditioner::LowerIsai>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else if (str == "solver::Gmres") {
         return dispatch<gko::LinOpFactory,
                         IcHelper1<solver::Gmres>::Configurator>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(),
-                               value_type_list_with_half()),
+            make_type_selector(updated.get_value_typestr(), value_type_list()),
             make_type_selector(updated.get_index_typestr(), index_type_list()));
     } else {
         GKO_INVALID_CONFIG_VALUE("l_solver_type", str);
diff --git a/core/config/preconditioner_ilu_config.cpp b/core/config/preconditioner_ilu_config.cpp
index 9ed8494ab10..3714d356e71 100644
--- a/core/config/preconditioner_ilu_config.cpp
+++ b/core/config/preconditioner_ilu_config.cpp
@@ -94,7 +94,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "solver::Ir") {
@@ -104,7 +104,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "preconditioner::LowerIsai") {
@@ -114,7 +114,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "solver::Gmres") {
@@ -124,7 +124,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Ilu>(
                            ReverseApply::value>::template Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else {
diff --git a/core/config/preconditioner_isai_config.cpp b/core/config/preconditioner_isai_config.cpp
index 828721ed74e..8d1b849eea4 100644
--- a/core/config/preconditioner_isai_config.cpp
+++ b/core/config/preconditioner_isai_config.cpp
@@ -48,7 +48,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::lower>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "upper") {
@@ -57,7 +57,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::upper>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "general") {
@@ -66,7 +66,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::general>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else if (str == "spd") {
@@ -75,7 +75,7 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Isai>(
                 IsaiHelper<preconditioner::isai_type::spd>::Configurator>(
                 config, context, updated,
                 make_type_selector(updated.get_value_typestr(),
-                                   value_type_list_with_half()),
+                                   value_type_list()),
                 make_type_selector(updated.get_index_typestr(),
                                    index_type_list()));
         } else {
diff --git a/core/config/schwarz_config.cpp b/core/config/schwarz_config.cpp
index 9543b833041..3244d6064e5 100644
--- a/core/config/schwarz_config.cpp
+++ b/core/config/schwarz_config.cpp
@@ -29,7 +29,8 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
             gko::LinOpFactory,
             gko::experimental::distributed::preconditioner::Schwarz>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_base()),
             make_type_selector(updated.get_index_typestr(),
                                syn::type_list<int32>()),
             make_type_selector(updated.get_global_index_typestr(),
@@ -39,7 +40,8 @@ deferred_factory_parameter<gko::LinOpFactory> parse<LinOpFactoryType::Schwarz>(
             gko::LinOpFactory,
             gko::experimental::distributed::preconditioner::Schwarz>(
             config, context, updated,
-            make_type_selector(updated.get_value_typestr(), value_type_list()),
+            make_type_selector(updated.get_value_typestr(),
+                               value_type_list_base()),
             make_type_selector(updated.get_index_typestr(),
                                syn::type_list<int64>()),
             make_type_selector(updated.get_global_index_typestr(),
diff --git a/core/config/stop_config.cpp b/core/config/stop_config.cpp
index 2696b471a21..4623eb768fc 100644
--- a/core/config/stop_config.cpp
+++ b/core/config/stop_config.cpp
@@ -87,8 +87,7 @@ deferred_factory_parameter<stop::CriterionFactory> configure_residual(
     auto updated = update_type(config, td);
     return dispatch<stop::CriterionFactory, ResidualNormConfigurer>(
         config, context, updated,
-        make_type_selector(updated.get_value_typestr(),
-                           value_type_list_with_half()));
+        make_type_selector(updated.get_value_typestr(), value_type_list()));
 }
 
 
@@ -120,8 +119,7 @@ deferred_factory_parameter<stop::CriterionFactory> configure_implicit_residual(
     auto updated = update_type(config, td);
     return dispatch<stop::CriterionFactory, ImplicitResidualNormConfigurer>(
         config, context, updated,
-        make_type_selector(updated.get_value_typestr(),
-                           value_type_list_with_half()));
+        make_type_selector(updated.get_value_typestr(), value_type_list()));
 }
 
 

From 64a170ddc91293f025f9686ab3ba642f0b5102be Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Dec 2024 16:32:51 +0100
Subject: [PATCH 428/448] manual changes

---
 core/config/parse_macro.hpp | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp
index 273bc8b1a9f..728fceabee1 100644
--- a/core/config/parse_macro.hpp
+++ b/core/config/parse_macro.hpp
@@ -16,8 +16,7 @@
 
 
 // for value_type only
-#define GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator,                     \
-                                   _value_type_list_base)                    \
+#define GKO_PARSE_VALUE_TYPE_(_type, _configurator, _value_type_list)        \
     template <>                                                              \
     deferred_factory_parameter<gko::LinOpFactory>                            \
     parse<gko::config::LinOpFactoryType::_type>(                             \
@@ -29,22 +28,21 @@
         return gko::config::dispatch<gko::LinOpFactory, _configurator>(      \
             config, context, updated,                                        \
             gko::config::make_type_selector(updated.get_value_typestr(),     \
-                                            _value_type_list_base));         \
+                                            _value_type_list));              \
     }                                                                        \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
 #define GKO_PARSE_VALUE_TYPE_BASE(_type, _configurator) \
-    GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator,    \
-                               gko::config::value_type_list_base())
+    GKO_PARSE_VALUE_TYPE_(_type, _configurator,         \
+                          gko::config::value_type_list_base())
 
-#define GKO_PARSE_VALUE_TYPE(_type, _configurator)   \
-    GKO_PARSE_VALUE_TYPE_BASE_(_type, _configurator, \
-                               gko::config::value_type_list())
+#define GKO_PARSE_VALUE_TYPE(_type, _configurator) \
+    GKO_PARSE_VALUE_TYPE_(_type, _configurator, gko::config::value_type_list())
 
 // for value_type and index_type
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,            \
-                                             _value_type_list_base)           \
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,                 \
+                                        _value_type_list)                     \
     template <>                                                               \
     deferred_factory_parameter<gko::LinOpFactory>                             \
     parse<gko::config::LinOpFactoryType::_type>(                              \
@@ -56,7 +54,7 @@
         return gko::config::dispatch<gko::LinOpFactory, _configurator>(       \
             config, context, updated,                                         \
             gko::config::make_type_selector(updated.get_value_typestr(),      \
-                                            _value_type_list_base),           \
+                                            _value_type_list),                \
             gko::config::make_type_selector(updated.get_index_typestr(),      \
                                             gko::config::index_type_list())); \
     }                                                                         \
@@ -65,12 +63,12 @@
                   "semi-colon warnings")
 
 #define GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE(_type, _configurator) \
-    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator,    \
-                                         gko::config::value_type_list_base())
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,         \
+                                    gko::config::value_type_list_base())
 
-#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator)   \
-    GKO_PARSE_VALUE_AND_INDEX_TYPE_BASE_(_type, _configurator, \
-                                         gko::config::value_type_list())
+#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \
+    GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator,    \
+                                    gko::config::value_type_list())
 
 
 #endif  // GKO_CORE_CONFIG_PARSE_MACRO_HPP_

From 8fb8d1a0446bbe5ad2a3f03588ba3c8e748417b3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 3 Dec 2024 11:32:44 +0100
Subject: [PATCH 429/448] remove unused data and type

---
 test/matrix/csr_kernels2.cpp | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 27c4a5a9494..e7f73cbc124 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -34,7 +34,6 @@ class Csr : public CommonTestFixture {
 protected:
     using Arr = gko::array<int>;
     using Vec = gko::matrix::Dense<value_type>;
-    using Vec2 = gko::matrix::Dense<gko::next_precision_base<value_type>>;
     using Mtx = gko::matrix::Csr<value_type>;
     using ComplexVec = gko::matrix::Dense<std::complex<value_type>>;
     using ComplexMtx = gko::matrix::Csr<std::complex<value_type>>;
@@ -123,27 +122,17 @@ class Csr : public CommonTestFixture {
         square_mtx = Mtx::create(ref, strategy);
         square_mtx->move_from(gen_mtx<Vec>(mtx_size[0], mtx_size[0], 1));
         expected = gen_mtx<Vec>(mtx_size[0], num_vectors, 1);
-        expected2 = Vec2::create(ref);
-        expected2->copy_from(expected);
         y = gen_mtx<Vec>(mtx_size[1], num_vectors, 1);
-        y2 = Vec2::create(ref);
-        y2->copy_from(y);
         alpha = gko::initialize<Vec>({2.0}, ref);
-        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
-        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(exec, strategy);
         dmtx->copy_from(mtx);
         dsquare_mtx = Mtx::create(exec, strategy);
         dsquare_mtx->copy_from(square_mtx);
         dresult = gko::clone(exec, expected);
-        dresult2 = gko::clone(exec, expected2);
         dy = gko::clone(exec, y);
-        dy2 = gko::clone(exec, y2);
         dalpha = gko::clone(exec, alpha);
-        dalpha2 = gko::clone(exec, alpha2);
         dbeta = gko::clone(exec, beta);
-        dbeta2 = gko::clone(exec, beta2);
 
         std::vector<int> tmp(mtx->get_size()[0], 0);
         auto rng = std::default_random_engine{};
@@ -196,26 +185,18 @@ class Csr : public CommonTestFixture {
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
-    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
-    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
-    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
-    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> dcomplex_mtx;
     std::unique_ptr<Mtx> dsquare_mtx;
     std::unique_ptr<Vec> dresult;
-    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
-    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
-    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
-    std::unique_ptr<Vec2> dbeta2;
     std::unique_ptr<Arr> rpermute_idxs;
     std::unique_ptr<Arr> cpermute_idxs;
     std::unique_ptr<Perm> rpermutation;

From 52b2261179ff8443c5b067a44c634698df8f40b8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 3 Dec 2024 16:37:11 +0100
Subject: [PATCH 430/448] update the multigrid preconditioner usage in
 test/mpi/solver

---
 test/mpi/solver/solver.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index 80142e2e947..392bf9990b8 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -138,11 +138,7 @@ struct CgWithMg : SimpleSolverTest<gko::solver::Cg<solver_value_type>> {
                         16u)  // necessary since the test matrices have less
                               // rows than the default value
                     .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(
-                            iteration_count()),
-                        gko::stop::ResidualNorm<value_type>::build()
-                            .with_baseline(gko::stop::mode::absolute)
-                            .with_reduction_factor(2 * reduction_factor())));
+                        gko::stop::Iteration::build().with_max_iters(1u)));
     }
 
     static bool blacklisted(const std::string& test)

From 98cd40bc414f8e1fc91f23b2b51f787a07952279 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Tue, 5 Nov 2024 09:13:56 +0100
Subject: [PATCH 431/448] add overload to create json obj from std::string

---
 include/ginkgo/extensions/config/json_config.hpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/ginkgo/extensions/config/json_config.hpp b/include/ginkgo/extensions/config/json_config.hpp
index f8c3cfd5860..05eaff13a54 100644
--- a/include/ginkgo/extensions/config/json_config.hpp
+++ b/include/ginkgo/extensions/config/json_config.hpp
@@ -80,6 +80,13 @@ inline gko::config::pnode parse_json_file(std::string filename)
     return parse_json(nlohmann::json::parse(fstream));
 }
 
+/**
+ * parse_json_string takes a json string to generate the property tree object
+ */
+inline gko::config::pnode parse_json_string(std::string json)
+{
+    return parse_json(nlohmann::json::parse(json));
+}
 
 }  // namespace config
 }  // namespace ext

From de217b8d6b68c0b643f5e2805487d15446b75901 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Thu, 5 Dec 2024 09:21:37 +0100
Subject: [PATCH 432/448] [test] add parse_json_string test

---
 extensions/test/config/json_config.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/extensions/test/config/json_config.cpp b/extensions/test/config/json_config.cpp
index a46cdd93628..58000fd057d 100644
--- a/extensions/test/config/json_config.cpp
+++ b/extensions/test/config/json_config.cpp
@@ -54,7 +54,7 @@ TEST(JsonConfig, ReadInput)
 {
     const char json[] =
         R"({"item": 4,
-            "array": [3.0, 4.5], 
+            "array": [3.0, 4.5],
             "map": {"bool": false}})";
 
     auto d = nlohmann::json::parse(json);
@@ -72,6 +72,26 @@ TEST(JsonConfig, ReadInput)
     ASSERT_EQ(child_map.at("bool").get_boolean(), false);
 }
 
+TEST(JsonConfig, ReadInputString)
+{
+    std::string =
+        R"({"item": 4,
+            "array": [3.0, 4.5],
+            "map": {"bool": false}})";
+
+    auto ptree = gko::ext::config::parse_json_string(d);
+
+    auto& child_array = ptree.get("array").get_array();
+    auto& child_map = ptree.get("map").get_map();
+    ASSERT_EQ(ptree.get_map().size(), 3);
+    ASSERT_EQ(ptree.get("item").get_integer(), 4);
+    ASSERT_EQ(child_array.size(), 2);
+    ASSERT_EQ(child_array.at(0).get_real(), 3.0);
+    ASSERT_EQ(child_array.at(1).get_real(), 4.5);
+    ASSERT_EQ(child_map.size(), 1);
+    ASSERT_EQ(child_map.at("bool").get_boolean(), false);
+}
+
 
 TEST(JsonConfig, ReadInputFromFile)
 {

From 53a8723ba397e98890d58a9227d0d36137c79b75 Mon Sep 17 00:00:00 2001
From: Gregor Olenik <gregor.olenik@web.de>
Date: Thu, 5 Dec 2024 09:48:56 +0100
Subject: [PATCH 433/448] [test] add parse_json_string test

---
 extensions/test/config/json_config.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/extensions/test/config/json_config.cpp b/extensions/test/config/json_config.cpp
index 58000fd057d..81d60a5090e 100644
--- a/extensions/test/config/json_config.cpp
+++ b/extensions/test/config/json_config.cpp
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include <stdexcept>
+#include <string>
 
 #include <gtest/gtest.h>
 #include <nlohmann/json.hpp>
@@ -74,12 +75,11 @@ TEST(JsonConfig, ReadInput)
 
 TEST(JsonConfig, ReadInputString)
 {
-    std::string =
-        R"({"item": 4,
-            "array": [3.0, 4.5],
-            "map": {"bool": false}})";
+    std::string json_string = R"({"item": 4,
+           "array": [3.0, 4.5],
+           "map": {"bool": false}})";
 
-    auto ptree = gko::ext::config::parse_json_string(d);
+    auto ptree = gko::ext::config::parse_json_string(json_string);
 
     auto& child_array = ptree.get("array").get_array();
     auto& child_map = ptree.get("map").get_map();

From 718ffe4b2d00d36639a4d7c14e991b21e359979c Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 4 Dec 2024 17:43:32 +0100
Subject: [PATCH 434/448] disable the half properly

---
 common/cuda_hip/base/math.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp
index 8c0da63c181..de7eb51714f 100644
--- a/common/cuda_hip/base/math.hpp
+++ b/common/cuda_hip/base/math.hpp
@@ -104,6 +104,9 @@ struct is_complex_or_scalar_impl<thrust::complex<T>>
 }  // namespace gko
 
 
+#if GINKGO_ENABLE_HALF
+
+
 GKO_THRUST_NAEMSPACE_PREFIX
 namespace thrust {
 
@@ -186,4 +189,7 @@ __device__ __forceinline__ bool is_finite(const thrust::complex<__half>& value)
 }  // namespace gko
 
 
+#endif  // GINKGO_ENABLE_HALF
+
+
 #endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_

From c6434ffede1f6eb2a7944d841f6e21d60fa6ae77 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 5 Dec 2024 15:18:57 +0100
Subject: [PATCH 435/448] update documentation

---
 INSTALL.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/INSTALL.md b/INSTALL.md
index 87ed9c4f61a..b55d14fffca 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -26,7 +26,8 @@ Ginkgo adds the following additional switches to control what is being built:
     Enabling this flag increases the library size, but improves performance of
     mixed-precision kernels.
 *   `-DGINKGO_ENABLE_HALF={ON, OFF}` enable half precision support in Ginkgo, default is `ON`.
-    It is `OFF` when the compiler is MSVC.
+    It is `OFF` when the compiler is MSVC. When building the CUDA backend with a CUDA version older than 12.2,
+    half precision is only supported on devices with compute capability 5.3 or higher. CUDA 12.2+ compilers lift this compute capability limitation.
 *   `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
     (will download googletest), default is `ON`.
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests

From 1291b369621d8e10611dc526e9035cedc94367af Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 13:02:42 +0000
Subject: [PATCH 436/448] [omp] fix RCM RAW

---
 omp/reorder/rcm_kernels.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp
index fbe5c8d42c4..4f49d7d93b3 100644
--- a/omp/reorder/rcm_kernels.cpp
+++ b/omp/reorder/rcm_kernels.cpp
@@ -96,16 +96,6 @@ struct UbfsLinearQueue {
         tail += n;
     }
 
-    /**
-     * Computes the correct chunk size at a given moment.
-     * It is upper bounded by the chunk_bound and the half of all nodes.
-     */
-    IndexType chunk_size()
-    {
-        const auto available_nodes = tail - head;
-        return std::min<IndexType>((available_nodes + 1) / 2, chunk_bound);
-    }
-
     /**
      * Returns a pointer to an exclusively owned chunk of length <= n/2.
      * Blocks in case no nodes are available and other threads are still
@@ -118,6 +108,18 @@ struct UbfsLinearQueue {
         const auto data = &arr[0];
         std::lock_guard<omp_mutex> read_guard{read_mutex};
 
+        /**
+         * Computes the correct chunk size at a given moment.
+         * It is upper bounded by the chunk_bound and the half of all nodes.
+         */
+        auto chunk_size = [this]() {
+            // Can only acquire write_mutex as this function is called when a
+            // read_mutex is already acquired
+            std::lock_guard<omp_mutex> lock{write_mutex};
+            const auto available_nodes = tail - head;
+            return std::min<IndexType>((available_nodes + 1) / 2, chunk_bound);
+        };
+
         const auto n = chunk_size();
 
         if (n > 0) {

From ab53c62491c8d7945447fb7d9ce01213c8e723f3 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 13:33:55 +0000
Subject: [PATCH 437/448] [omp] fix sparsity csr RAW

---
 omp/matrix/sparsity_csr_kernels.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 35bb42c70a6..2fd04d3cfd0 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -183,11 +183,14 @@ void is_sorted_by_column_index(
     bool local_is_sorted = true;
 #pragma omp parallel for shared(local_is_sorted)
     for (size_type i = 0; i < size[0]; ++i) {
-#pragma omp flush(local_is_sorted)
         // Skip comparison if any thread detects that it is not sorted
-        if (local_is_sorted) {
+        bool sync_local_is_sorted = true;
+#pragma omp atomic read
+        sync_local_is_sorted = local_is_sorted;
+        if (sync_local_is_sorted) {
             for (auto idx = row_ptrs[i] + 1; idx < row_ptrs[i + 1]; ++idx) {
                 if (col_idxs[idx - 1] > col_idxs[idx]) {
+#pragma omp atomic write
                     local_is_sorted = false;
                     break;
                 }

From a29b7725715d5ee3e813e5380c07e67bd8aaa8d2 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 13:34:38 +0000
Subject: [PATCH 438/448] [omp] fix kcycle stop WAW

This is not really necessary, but it keeps the TSAN happy.
---
 omp/solver/multigrid_kernels.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp
index 12e5bad8577..2203187383d 100644
--- a/omp/solver/multigrid_kernels.cpp
+++ b/omp/solver/multigrid_kernels.cpp
@@ -84,9 +84,10 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
                        const ValueType rel_tol, bool& is_stop)
 {
     is_stop = true;
-#pragma omp parallel for
+#pragma omp parallel for shared(is_stop)
     for (size_type i = 0; i < old_norm->get_size()[1]; i++) {
         if (new_norm->at(0, i) > rel_tol * old_norm->at(0, i)) {
+#pragma omp atomic write
             is_stop = false;
         }
     }

From bd8afe6bccd0f3d3f73da6f39ab1067bb761074c Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 13:34:57 +0000
Subject: [PATCH 439/448] [test] fix 3pt generation for batch solver

---
 core/test/utils/batch_helpers.hpp                | 2 +-
 reference/test/solver/batch_bicgstab_kernels.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index 790034b724c..f405f5be63f 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -105,7 +105,7 @@ std::unique_ptr<MatrixType> generate_3pt_stencil_batch_matrix(
         {}};
     for (int row = 0; row < num_rows; ++row) {
         if (row > 0) {
-            data.nonzeros.emplace_back(row - 1, row, value_type{-1.0});
+            data.nonzeros.emplace_back(row, row - 1, value_type{-1.0});
         }
         data.nonzeros.emplace_back(row, row, value_type{6.0});
         if (row < num_rows - 1) {
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index 16d2f7a6653..5bc6eb47702 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -62,7 +62,7 @@ class BatchBicgstab : public ::testing::Test {
     }
 
     std::shared_ptr<const gko::ReferenceExecutor> exec;
-    const real_type eps = 1e-3;
+    const real_type eps = 5e-3;
     const gko::size_type num_batch_items = 2;
     const int num_rows = 15;
     const int num_rhs = 1;
@@ -147,7 +147,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::Mtx;
-    const real_type tol = 1e-4;
+    const real_type tol = 1e-3;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -227,7 +227,7 @@ TYPED_TEST(BatchBicgstab, CanSolveEllSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::EllMtx;
-    const real_type tol = 1e-4;
+    const real_type tol = 1e-3;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()
@@ -263,7 +263,7 @@ TYPED_TEST(BatchBicgstab, CanSolveCsrSystem)
     using real_type = gko::remove_complex<value_type>;
     using Solver = typename TestFixture::solver_type;
     using Mtx = typename TestFixture::CsrMtx;
-    const real_type tol = 1e-4;
+    const real_type tol = 1e-3;
     const int max_iters = 1000;
     auto solver_factory =
         Solver::build()

From 051fea8efef2a68bdb54732222b10ecd14854797 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 15:24:57 +0000
Subject: [PATCH 440/448] [omp] fix par IC and variants using atomic
 load/stores

---
 omp/components/atomic.hpp              | 68 +++++++++++++++++++++++++-
 omp/factorization/par_ic_kernels.cpp   |  8 +--
 omp/factorization/par_ict_kernels.cpp  |  8 +--
 omp/factorization/par_ilu_kernels.cpp  | 11 +++--
 omp/factorization/par_ilut_kernels.cpp |  9 ++--
 5 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp
index 35b94a65fe5..3f743efeecf 100644
--- a/omp/components/atomic.hpp
+++ b/omp/components/atomic.hpp
@@ -52,7 +52,7 @@ inline ResultType copy_cast(const ValueType& val)
 
 
 template <>
-void atomic_add(half& out, half val)
+inline void atomic_add(half& out, half val)
 {
 #ifdef __NVCOMPILER
 // NVC++ uses atomic capture on uint16 leads the following error.
@@ -85,6 +85,72 @@ void atomic_add(half& out, half val)
 }
 
 
+// There is an error in Clang 17 which prevents us from merging the
+// implementation of double and float. The compiler will throw an error if the
+// templated version is implemented. GCC doesn't throw an error.
+inline void store(double* addr, double val)
+{
+#pragma omp atomic write
+    *addr = val;
+}
+
+inline void store(float* addr, float val)
+{
+#pragma omp atomic write
+    *addr = val;
+}
+
+inline void store(half* addr, half val)
+{
+    auto uint_addr = copy_cast<uint16_t*>(addr);
+    auto uint_val = copy_cast<uint16_t>(val);
+#pragma omp atomic write
+    *uint_addr = uint_val;
+}
+
+template <typename T>
+inline void store(std::complex<T>* addr, std::complex<T> val)
+{
+    auto values = reinterpret_cast<T*>(addr);
+    store(values + 0, real(val));
+    store(values + 1, imag(val));
+}
+
+
+// Same Clang 17 issue as with the store overloads above
+inline float load(float* addr)
+{
+    float val;
+#pragma omp atomic read
+    val = *addr;
+    return val;
+}
+
+inline double load(double* addr)
+{
+    double val;
+#pragma omp atomic read
+    val = *addr;
+    return val;
+}
+
+inline half load(half* addr)
+{
+    uint16_t uint_val;
+    auto uint_addr = copy_cast<uint16_t*>(addr);
+#pragma omp atomic read
+    uint_val = *uint_addr;
+    return copy_cast<half>(uint_val);
+}
+
+template <typename T>
+inline std::complex<T> load(std::complex<T>* addr)
+{
+    auto values = reinterpret_cast<T*>(addr);
+    return {load(values + 0), load(values + 1)};
+}
+
+
 }  // namespace omp
 }  // namespace kernels
 }  // namespace gko
diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp
index 93093783acc..caaa7c00b1a 100644
--- a/omp/factorization/par_ic_kernels.cpp
+++ b/omp/factorization/par_ic_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/base/utils.hpp"
+#include "omp/components/atomic.hpp"
 
 
 namespace gko {
@@ -76,7 +77,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
                     auto l_col = l_col_idxs[l_begin];
                     auto lh_row = l_col_idxs[lh_begin];
                     if (l_col == lh_row && l_col < col) {
-                        sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
+                        sum += load(l_vals + l_begin) *
+                               conj(load(l_vals + lh_begin));
                     }
                     l_begin += (l_col <= lh_row);
                     lh_begin += (lh_row <= l_col);
@@ -85,11 +87,11 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
                 if (row == col) {
                     new_val = sqrt(new_val);
                 } else {
-                    auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                    auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
                     new_val = new_val / diag;
                 }
                 if (is_finite(new_val)) {
-                    l_vals[l_nz] = new_val;
+                    store(l_vals + l_nz, new_val);
                 }
             }
         }
diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp
index b5546e1a644..c6573c8bfb5 100644
--- a/omp/factorization/par_ict_kernels.cpp
+++ b/omp/factorization/par_ict_kernels.cpp
@@ -17,6 +17,7 @@
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "omp/components/atomic.hpp"
 #include "omp/components/csr_spgeam.hpp"
 
 
@@ -69,7 +70,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
                 auto l_col = l_col_idxs[l_begin];
                 auto lh_row = l_col_idxs[lh_begin];
                 if (l_col == lh_row && l_col < col) {
-                    sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
+                    sum +=
+                        load(l_vals + l_begin) * conj(load(l_vals + lh_begin));
                 }
                 if (lh_row == row) {
                     lh_nz = lh_begin;
@@ -81,11 +83,11 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
             if (row == col) {
                 new_val = sqrt(new_val);
             } else {
-                auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
                 new_val = new_val / diag;
             }
             if (is_finite(new_val)) {
-                l_vals[l_nz] = new_val;
+                store(l_vals + l_nz, new_val);
             }
         }
     }
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index da42a631b81..30f337eed46 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -10,6 +10,8 @@
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
+#include "omp/components/atomic.hpp"
+
 
 namespace gko {
 namespace kernels {
@@ -57,7 +59,8 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
                 auto col_l = col_idxs_l[row_l];
                 auto col_u = col_idxs_u[row_u];
                 if (col_l == col_u) {
-                    last_operation = vals_l[row_l] * vals_u[row_u];
+                    last_operation =
+                        load(vals_l + row_l) * load(vals_u + row_u);
                     sum -= last_operation;
                 } else {
                     last_operation = zero<ValueType>();
@@ -74,14 +77,14 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
             sum += last_operation;  // undo the last operation
 
             if (row > col) {  // modify entry in L
-                auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1];
+                auto to_write = sum / load(vals_u + row_ptrs_u[col + 1] - 1);
                 if (is_finite(to_write)) {
-                    vals_l[row_l - 1] = to_write;
+                    store(vals_l + row_l - 1, to_write);
                 }
             } else {  // modify entry in U
                 auto to_write = sum;
                 if (is_finite(to_write)) {
-                    vals_u[row_u - 1] = to_write;
+                    store(vals_u + row_u - 1, to_write);
                 }
             }
         }
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
index 8b251e88bf4..035464517f7 100644
--- a/omp/factorization/par_ilut_kernels.cpp
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -20,6 +20,7 @@
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "omp/components/atomic.hpp"
 #include "omp/components/csr_spgeam.hpp"
 
 
@@ -281,7 +282,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
             auto l_col = l_col_idxs[l_begin];
             auto u_row = ut_row_idxs[u_begin];
             if (l_col == u_row && l_col < last_entry) {
-                sum += l_vals[l_begin] * ut_vals[u_begin];
+                sum += load(l_vals + l_begin) * load(ut_vals + u_begin);
             }
             if (u_row == row) {
                 ut_nz = u_begin;
@@ -297,7 +298,7 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
         for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1] - 1;
              ++l_nz) {
             auto col = l_col_idxs[l_nz];
-            auto u_diag = ut_vals[ut_col_ptrs[col + 1] - 1];
+            auto u_diag = load(ut_vals + ut_col_ptrs[col + 1] - 1);
             auto new_val = compute_sum(row, col).first / u_diag;
             if (is_finite(new_val)) {
                 l_vals[l_nz] = new_val;
@@ -310,8 +311,8 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
             auto new_val = result.first;
             auto ut_nz = result.second;
             if (is_finite(new_val)) {
-                u_vals[u_nz] = new_val;
-                ut_vals[ut_nz] = new_val;
+                store(u_vals + u_nz, new_val);
+                store(ut_vals + ut_nz, new_val);
             }
         }
     }

From 81c3b5021dca49d0a103fdb7fefd4abec1cfb5bf Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Fri, 6 Dec 2024 15:55:14 +0000
Subject: [PATCH 441/448] [mg] use atomics for match_edge

---
 common/cuda_hip/multigrid/pgm_kernels.cpp | 40 +++++++++++++++++++++++
 common/unified/multigrid/pgm_kernels.cpp  | 25 --------------
 dpcpp/components/atomic.dp.hpp            | 25 ++++++++++++++
 dpcpp/multigrid/pgm_kernels.dp.cpp        | 30 +++++++++++++++++
 omp/components/atomic.hpp                 | 28 ++++++++++++++++
 omp/multigrid/pgm_kernels.cpp             | 25 ++++++++++++++
 6 files changed, 148 insertions(+), 25 deletions(-)

diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp
index d3c44cf540e..61a2f9ac74a 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.cpp
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -17,6 +17,8 @@
 
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 
 
 namespace gko {
@@ -28,6 +30,44 @@ namespace GKO_DEVICE_NAMESPACE {
  * @ingroup pgm
  */
 namespace pgm {
+namespace kernels {
+
+
+template <typename IndexType>
+__global__ void match_edge(size_type size,
+                           const IndexType* __restrict__ strongest_neighbor,
+                           IndexType* __restrict__ agg)
+{
+    auto tidx = static_cast<IndexType>(thread::get_thread_id_flat<int64>());
+    if (tidx >= size || load_relaxed(agg + tidx) != -1) {
+        return;
+    }
+    auto neighbor = strongest_neighbor[tidx];
+    if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
+        tidx <= neighbor) {
+        store_relaxed(agg + tidx, tidx);
+        store_relaxed(agg + neighbor, tidx);
+    }
+}
+
+
+}  // namespace kernels
+
+
+template <typename IndexType>
+void match_edge(std::shared_ptr<const DefaultExecutor> exec,
+                const array<IndexType>& strongest_neighbor,
+                array<IndexType>& agg)
+{
+    constexpr int default_block_size = 512;
+    const auto size = agg.get_size();
+    if (size == 0) return;  // an empty grid is an invalid launch configuration
+    kernels::match_edge<<<ceildiv(size, default_block_size),
+                          default_block_size, 0, exec->get_stream()>>>(
+        size, strongest_neighbor.get_const_data(), agg.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);
 
 
 template <typename IndexType>
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 409dbc8b9b6..53a57530f84 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -23,31 +23,6 @@ namespace GKO_DEVICE_NAMESPACE {
 namespace pgm {
 
 
-template <typename IndexType>
-void match_edge(std::shared_ptr<const DefaultExecutor> exec,
-                const array<IndexType>& strongest_neighbor,
-                array<IndexType>& agg)
-{
-    run_kernel(
-        exec,
-        [] GKO_KERNEL(auto tidx, auto strongest_neighbor_vals, auto agg_vals) {
-            if (agg_vals[tidx] != -1) {
-                return;
-            }
-            auto neighbor = strongest_neighbor_vals[tidx];
-            if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx &&
-                tidx <= neighbor) {
-                // Use the smaller index as agg point
-                agg_vals[tidx] = tidx;
-                agg_vals[neighbor] = tidx;
-            }
-        },
-        agg.get_size(), strongest_neighbor.get_const_data(), agg.get_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);
-
-
 template <typename IndexType>
 void count_unagg(std::shared_ptr<const DefaultExecutor> exec,
                  const array<IndexType>& agg, IndexType* num_unagg)
diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp
index 68a1232849f..7b48d27f036 100644
--- a/dpcpp/components/atomic.dp.hpp
+++ b/dpcpp/components/atomic.dp.hpp
@@ -278,6 +278,31 @@ __dpct_inline__ T atomic_max(T* __restrict__ addr, T val)
 }
 
 
+template <sycl::access::address_space addressSpace = atomic::global_space,
+          typename T>
+__dpct_inline__ void store(
+    T* __restrict__ addr, T val,
+    sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+{
+    sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                     atomic::memory_scope_v<addressSpace>, addressSpace>
+        obj(*addr);
+    obj.store(val, memoryOrder);
+}
+
+
+template <sycl::access::address_space addressSpace = atomic::global_space,
+          typename T>
+__dpct_inline__ T load(T* __restrict__ addr, sycl::memory_order memoryOrder =
+                                                 sycl::memory_order::relaxed)
+{
+    sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                     atomic::memory_scope_v<addressSpace>, addressSpace>
+        obj(*addr);
+    return obj.load(memoryOrder);
+}
+
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index a9148c54ff4..398fc5255e2 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -12,6 +12,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "dpcpp/base/onedpl.hpp"
+#include "dpcpp/components/atomic.dp.hpp"
 
 
 namespace gko {
@@ -25,6 +26,35 @@ namespace dpcpp {
 namespace pgm {
 
 
+template <typename IndexType>
+void match_edge(std::shared_ptr<const DefaultExecutor> exec,
+                const array<IndexType>& strongest_neighbor,
+                array<IndexType>& agg)
+{
+    exec->get_queue()->submit([size = agg.get_size(), agg = agg.get_data(),
+                               strongest_neighbor =
+                                   strongest_neighbor.get_const_data()](
+                                  sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl::range<1>{static_cast<std::size_t>(size)},
+            [=](sycl::id<1> idx_id) {
+                auto tidx = static_cast<IndexType>(idx_id[0]);
+                if (load(agg + tidx, sycl::memory_order_relaxed) != -1) {
+                    return;
+                }
+                auto neighbor = strongest_neighbor[tidx];
+                if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
+                    tidx <= neighbor) {
+                    store(agg + tidx, tidx, sycl::memory_order_relaxed);
+                    store(agg + neighbor, tidx, sycl::memory_order_relaxed);
+                }
+            });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);
+
+
 template <typename IndexType>
 void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
               IndexType* row_idxs, IndexType* col_idxs)
diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp
index 3f743efeecf..51af478fd59 100644
--- a/omp/components/atomic.hpp
+++ b/omp/components/atomic.hpp
@@ -100,6 +100,18 @@ inline void store(float* addr, float val)
     *addr = val;
 }
 
+inline void store(int32* addr, int32 val)
+{
+#pragma omp atomic write
+    *addr = val;
+}
+
+inline void store(int64* addr, int64 val)
+{
+#pragma omp atomic write
+    *addr = val;
+}
+
 inline void store(half* addr, half val)
 {
     auto uint_addr = copy_cast<uint16_t*>(addr);
@@ -134,6 +146,22 @@ inline double load(double* addr)
     return val;
 }
 
+inline int32 load(int32* addr)
+{
+    int32 val;  // same type as *addr: a float temporary rounds above 2^24
+#pragma omp atomic read
+    val = *addr;
+    return val;
+}
+
+inline int64 load(int64* addr)
+{
+    int64 val;  // same type as *addr: a float temporary rounds above 2^24
+#pragma omp atomic read
+    val = *addr;
+    return val;
+}
+
 inline half load(half* addr)
 {
     uint16_t uint_val;
diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp
index 4c824a0140b..fb64796c4f7 100644
--- a/omp/multigrid/pgm_kernels.cpp
+++ b/omp/multigrid/pgm_kernels.cpp
@@ -13,6 +13,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "core/base/iterator_factory.hpp"
+#include "omp/components/atomic.hpp"
 
 
 namespace gko {
@@ -26,6 +27,30 @@ namespace omp {
 namespace pgm {
 
 
+template <typename IndexType>
+void match_edge(std::shared_ptr<const DefaultExecutor> exec,
+                const array<IndexType>& strongest_neighbor,
+                array<IndexType>& agg)
+{
+    auto agg_vals = agg.get_data();
+    auto strongest_neighbor_vals = strongest_neighbor.get_const_data();
+#pragma omp parallel for
+    for (int64 i = 0; i < static_cast<int64>(agg.get_size()); i++) {
+        if (load(agg_vals + i) != -1) {
+            continue;
+        }
+        auto neighbor = strongest_neighbor_vals[i];
+        if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i &&
+            i <= neighbor) {
+            store(agg_vals + i, i);
+            store(agg_vals + neighbor, i);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);
+
+
 template <typename IndexType>
 void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
               IndexType* row_idxs, IndexType* col_idxs)

From 51f3a9bdf1ce92a74f067a29bd2c0a1dc9772a10 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 4 Dec 2024 11:42:14 +0100
Subject: [PATCH 442/448] [core] deprecate the `master` branch

---
 CMakeLists.txt               | 11 +++++++++++
 CONTRIBUTING.md              |  4 ++--
 include/ginkgo/config.hpp.in | 10 ++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8362cfaa277..913dd38f06d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,17 @@ cmake_minimum_required(VERSION 3.16)
 project(Ginkgo LANGUAGES CXX VERSION 1.8.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
 set(Ginkgo_VERSION_TAG "master")
 set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
+if(Ginkgo_VERSION_TAG STREQUAL "master")
+    set(GINKGO_VERSION_TAG_DEPRECATED ON)
+else()
+    set(GINKGO_VERSION_TAG_DEPRECATED OFF)
+endif()
+if(GINKGO_VERSION_TAG_DEPRECATED)
+    message(
+            WARNING
+            "The branch ${Ginkgo_VERSION_TAG} is deprecated and will stop receiving updates after 2025. "
+            "Please use the main branch for the latest release, or the develop branch for the latest development updates.")
+endif()
 # Cuda and Hip also look for Threads. Set it before any find_package to ensure the Threads setting is not changed.
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d460087b3c8..e2cf463e3dc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -151,8 +151,8 @@ improvements from code reviews.
 ### Creating, Reviewing and Merging Pull Requests
 
 * The `develop` branch is the default branch to submit PR's to. From time to
-  time, we merge the `develop` branch to the `master` branch and create tags on
-  the `master` to create new releases of Ginkgo. Therefore, all pull requests
+  time, we merge the `develop` branch to the `main` branch and create tags on
+  the `main` to create new releases of Ginkgo. Therefore, all pull requests
   must be merged into `develop`.
 * Please have a look at the labels and make sure to add the relevant labels.
 * You can mark the PR as a `WIP` if you are still working on it, `Ready for
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index d9251e66165..2798f18578f 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -13,6 +13,16 @@
 #define GKO_VERSION_STR @Ginkgo_VERSION_MAJOR@, @Ginkgo_VERSION_MINOR@, @Ginkgo_VERSION_PATCH@
 // clang-format on
 
+
+// clang-format off
+#cmakedefine01 GINKGO_VERSION_TAG_DEPRECATED
+#if GINKGO_VERSION_TAG_DEPRECATED
+#pragma message ("The branch " GKO_VERSION_TAG " is deprecated and will stop receiving updates after 2025. " \
+ "Please use the main branch for the latest release, or the develop branch for the latest development updates.")
+#endif
+// clang-format on
+
+
 /*
  * Controls the amount of messages output by Ginkgo.
  * 0 disables all output (except for test, benchmarks and examples).

From d38fd9628d9f3ce679f32d8ab419c82da0f55306 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 4 Dec 2024 16:20:48 +0100
Subject: [PATCH 443/448] [ci] disable CI on master

---
 .gitlab/rules.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.gitlab/rules.yml b/.gitlab/rules.yml
index e60aaf7a66c..49b54f05266 100644
--- a/.gitlab/rules.yml
+++ b/.gitlab/rules.yml
@@ -5,8 +5,8 @@
 
 .pr_condition:
   rules:
-    # Exclude `develop`, `master`, and tags with `when: never`
-    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
+    # Exclude `develop`, `main`, and tags with `when: never`
+    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
       when: never
     # Run only when the `RUN_CI_TAG` variable is set
     - if: $RUN_CI_TAG
@@ -16,7 +16,7 @@
 .pr_trigger_condition:
   rules:
     # Exclude `develop`, `master`, and tags with `when: never`
-    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
+    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
       when: never
     # Run only for quick pipelines and when the `RUN_CI_TAG` variable is set
     - if: $RUN_CI_TAG && $STATUS_CONTEXT == "quick"
@@ -26,13 +26,13 @@
 .full_test_condition:
   rules:
     # Run only when the `RUN_CI_TAG` variable is set and this is a full pipeline, or for `master`, `develop` or tags.
-    - if: $RUN_CI_TAG && ($STATUS_CONTEXT == "full" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG)
+    - if: $RUN_CI_TAG && ($STATUS_CONTEXT == "full" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG)
   dependencies: []
 
 
 .full_test_short_lived_condition:
   rules:
-    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG
+    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "master"  || $CI_COMMIT_TAG
       when: never
     - if: $RUN_CI_TAG && $STATUS_CONTEXT == "full"
   dependencies: []
@@ -40,13 +40,15 @@
 
 .quick_test_condition:
   rules:
+    - if: $RUN_CI_TAG && $CI_COMMIT_BRANCH == "master"
+      when: never
     - if: $RUN_CI_TAG && $STATUS_CONTEXT == null
   dependencies: []
 
 
 .deploy_condition:
   rules:
-    - if: $RUN_CI_TAG && ($CI_COMMIT_BRANCH == "master" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG) && $CI_PIPELINE_SOURCE != "schedule"
+    - if: $RUN_CI_TAG && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG) && $CI_PIPELINE_SOURCE != "schedule"
   dependencies: []
 
 

From aec24ba6d7e427dd3517626c66ab0851d727b1db Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 4 Dec 2024 15:44:34 +0100
Subject: [PATCH 444/448] [release] update gtest

---
 third_party/gtest/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt
index fb0407ba215..75817db1ab9 100644
--- a/third_party/gtest/CMakeLists.txt
+++ b/third_party/gtest/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 FetchContent_Declare(
     googletest
     GIT_REPOSITORY https://github.com/google/googletest.git
-    GIT_TAG        v1.14.0
+    GIT_TAG        v1.15.2
 )
 # need to set the variables in CACHE due to CMP0077
 set(gtest_disable_pthreads ON CACHE INTERNAL "")

From 129a37d97d821ec28bc0d57939f16ba6e1573422 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Wed, 4 Dec 2024 10:10:03 +0100
Subject: [PATCH 445/448] [release] update changelog

Co-authored-by: Tobias Ribizel <mail@ribizel.de>
Co-authored-by: Yu-Hsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Pratik Nayak <pratik.nayak@kit.edu>
---
 CHANGELOG.md | 151 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 130 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9b5c65f9750..fb17d0d5d6a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,116 @@ git log --first-parent
 
 Please visit our wiki [Changelog](https://github.com/ginkgo-project/ginkgo/wiki/Changelog) for unreleased changes.
 
+## Version 1.9.0
+
+The Ginkgo team is proud to announce the new Ginkgo minor release 1.9.0.
+This release brings new features such as:
+- Support for half precision (IEEE FP16). The type `gko::half` can now be selected in most instances as the value type
+  of a matrix, solver, preconditioner, etc. If the selected backend supports FP16 as a native type, the native type is
+  used within the kernels, otherwise an overhead might occur. The new behavior is enabled by default, but it can be
+  turned off during configuration.
+- New implementations of the ILU and IC factorization for CUDA, HIP, OpenMP, and Reference backends. These are
+  available in addition to the existing implementations based on the vendor libraries cuSPARSE and hipSPARSE.
+- New (S)SOR and Gauss-Seidel preconditioners.
+- Simplified distributed matrix assembly by exchanging local rows between neighboring processes.
+
+And more!
+
+If you face an issue, please first check our [known issues page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues) and the [open issues list](https://github.com/ginkgo-project/ginkgo/issues) and if you do not
+find a solution, feel free to [open a new issue](https://github.com/ginkgo-project/ginkgo/issues/new/choose) or ask a question using the [github discussions](https://github.com/ginkgo-project/ginkgo/discussions).
+
+Supported systems and requirements:
++ For all platforms, CMake 3.16+
++ C++17 compliant compiler
++ Linux and macOS
+  + GCC: 7.0+
+  + clang: 5.0+
+  + Intel compiler: 2019+
+  + Apple Clang: 15.0 is tested. Earlier versions might also work.
+  + NVHPC: 22.7+
+  + Cray Compiler: 14.0.1+
+  + CUDA module: CMake 3.18+, and CUDA 11.0+ or NVHPC 22.7+, Compute Capability 5.3+
+  + HIP module: CMake 3.21+, and ROCm 4.5+
+  + DPC++ module: Intel oneAPI 2023.1+ with oneMKL and oneDPL. Set the CXX compiler to `dpcpp` or `icpx`.
+  + MPI: standard version 3.1+, ideally GPU Aware, for best performance
++ Windows
+  + MinGW: GCC 7.0+
+  + Microsoft Visual Studio: VS 2019+
+  + CUDA module: CUDA 11.0+, Microsoft Visual Studio
+  + OpenMP module: MinGW.
+
+### Version support changes
++ Ginkgo now requires a compiler with C++17 support [#1603](https://github.com/ginkgo-project/ginkgo/pull/1603)
+
+### Deprecations
+
++ The `Executor::run` overload taking in multiple functions without a name as first parameter has been deprecated [#1667](https://github.com/ginkgo-project/ginkgo/pull/1667)
++ The `master` branch has been deprecated in favor of a new branch named `main` [#1739](https://github.com/ginkgo-project/ginkgo/pull/1739).
+
+#### Summary of previous deprecations
++ The `device_reset` parameter of CUDA and HIP executors no longer has an effect, and its `allocation_mode` parameters have been deprecated in favor of the `Allocator` interface.
++ The CMake parameter `GINKGO_BUILD_DPCPP` has been deprecated in favor of `GINKGO_BUILD_SYCL`.
++ The `gko::reorder::Rcm` interface has been deprecated in favor of `gko::experimental::reorder::Rcm` based on `Permutation`.
++ The Permutation class' `permute_mask` functionality.
++ Multiple functions with typos (`set_complex_subpsace()`, range functions such as `conj_operaton` etc).
++ `gko::lend()` is not necessary anymore.
++ The classes `RelativeResidualNorm` and `AbsoluteResidualNorm` are deprecated in favor of `ResidualNorm`.
++ The class `AmgxPgm` is deprecated in favor of `Pgm`.
++ Default constructors for the CSR `load_balance` and `automatical` strategies
++ The PolymorphicObject's move-semantic `copy_from` variant
++ The templated `SolverBase` class.
++ The class `MachineTopology` is deprecated in favor of `machine_topology`.
++ Logger constructors and create functions with the `executor` parameter.
++ The virtual, protected, Dense functions `compute_norm1_impl`, `add_scaled_impl`, etc.
++ Logger events for solvers and criterion without the additional `implicit_tau_sq` parameter.
++ The global `gko::solver::default_krylov_dim`, use instead `gko::solver::gmres_default_krylov_dim`.
++ `array::get_num_elems()` has been renamed to `get_size()`
++ `matrix_data::ensure_row_major_order()` has been renamed to `sort_row_major()`
++ `device_matrix_data::get_num_elems()` has been renamed to `get_num_stored_elements()`
++ The CMake parameter `GINKGO_COMPILER_FLAGS` has been superseded by `CMAKE_CXX_FLAGS`, and `GINKGO_CUDA_COMPILER_FLAGS` has been superseded by `CMAKE_CUDA_FLAGS`
++ The `std::initializer_list` overloads of matrix `create` methods and constructors are deprecated in favor of explicit `array` parameters
+
+### Added features
++ Add `Executor::get_description()` for textual representation of the device [#1615](https://github.com/ginkgo-project/ginkgo/pull/1615)
++ Add row and column scaling functionality to the distributed matrix [#1640](https://github.com/ginkgo-project/ginkgo/pull/1640)
++ Add `SolverProgress` logger printing out or storing to disk the individual scalars (and vectors) of an iterative solver after each iteration [#1620](https://github.com/ginkgo-project/ginkgo/pull/1620)
++ Add new `ortho_method` parameter for GMRES, with classical Gram-Schmidt and classical Gram-Schmidt with re-orthogonalization options in addition to previously-available modified Gram-Schmidt [#1646](https://github.com/ginkgo-project/ginkgo/pull/1646)
++ Add file config support for Schwarz [#1658](https://github.com/ginkgo-project/ginkgo/pull/1658)
++ Add overload for `Executor::run` which accepts a name and a closure for the ReferenceExecutor as the first two arguments [#1667](https://github.com/ginkgo-project/ginkgo/pull/1667)
++ Add function to fill `device_matrix_data` with zeros [#1683](https://github.com/ginkgo-project/ginkgo/pull/1683)
++ Add (S)SOR and Gauss-Seidel preconditioner [#1633](https://github.com/ginkgo-project/ginkgo/pull/1633), [#1634](https://github.com/ginkgo-project/ginkgo/pull/1634)
++ Add support for additive `read_distributed` for the distributed matrix [#1650](https://github.com/ginkgo-project/ginkgo/pull/1650)
++ Add Ginkgo's own ILU and IC implementation [#1684](https://github.com/ginkgo-project/ginkgo/pull/1684)
++ Add NVIDIA Ada architecture [#1733](https://github.com/ginkgo-project/ginkgo/pull/1733)
++ Add half precision support [#1706](https://github.com/ginkgo-project/ginkgo/pull/1706), [#1708](https://github.com/ginkgo-project/ginkgo/pull/1708), [#1711](https://github.com/ginkgo-project/ginkgo/pull/1711), [#1712](https://github.com/ginkgo-project/ginkgo/pull/1712), [#1713](https://github.com/ginkgo-project/ginkgo/pull/1713), [#1716](https://github.com/ginkgo-project/ginkgo/pull/1716), [#1710](https://github.com/ginkgo-project/ginkgo/pull/1710), [#1736](https://github.com/ginkgo-project/ginkgo/pull/1736)
+
+### Improvements
++ Add workspace in residual norm check [#1687](https://github.com/ginkgo-project/ginkgo/pull/1687), which reduces the alloc/free and corresponding overhead.
++ Add distributed `VectorCache` and use it as workspace in `Schwarz` [#1688](https://github.com/ginkgo-project/ginkgo/pull/1688).
++ Add example to show the file config usage [#1662](https://github.com/ginkgo-project/ginkgo/pull/1662)
++ Improve compile time for batched solvers [#1629](https://github.com/ginkgo-project/ginkgo/pull/1629)
++ Reduce conflicting thrust symbols when linking with different thrust libraries by adding a custom thrust namespace [#1730](https://github.com/ginkgo-project/ginkgo/pull/1730)
+
+### Fixes
++ Fix using the same algorithm as the original triangular solver when creating the transpose of the solver [#1641](https://github.com/ginkgo-project/ginkgo/pull/1641)
++ Fix the inconsistent behavior on the zero diagonal value in scalar Jacobi [#1642](https://github.com/ginkgo-project/ginkgo/pull/1642)
++ Fix an issue related to GCR and non-default strides in the rhs vector [#1656](https://github.com/ginkgo-project/ginkgo/pull/1656)
++ Fix an issue related to triangular solvers with CUDA on Windows [#1665](https://github.com/ginkgo-project/ginkgo/pull/1665)
++ Fix an issue where non-conforming MatrixMarket files were parsed without an error [#1628](https://github.com/ginkgo-project/ginkgo/pull/1628)
++ Fix finding rocthrust if it's not installed in paths included by default [#1668](https://github.com/ginkgo-project/ginkgo/pull/1668)
++ Fix an issue related to casting between vectors of different value types in the mixed-precision multigrid setup [#1663](https://github.com/ginkgo-project/ginkgo/pull/1663)
++ Fix some test failures with ROCm 6.x [#1670](https://github.com/ginkgo-project/ginkgo/pull/1670)
++ Fix a race condition in bicgstab [#1676](https://github.com/ginkgo-project/ginkgo/pull/1676)
++ Fix an issue with MGS GMRES for complex numbers [#1678](https://github.com/ginkgo-project/ginkgo/pull/1678)
++ Fix finding ROCm on recent ROCm versions (5.0+) [#1673](https://github.com/ginkgo-project/ginkgo/pull/1673)
++ Fix a compiler error when using NVHPC with MPI enabled [#1697](https://github.com/ginkgo-project/ginkgo/pull/1697)
++ Fix build issues of OMP backend when using HIPCC as C++ compiler [#1695](https://github.com/ginkgo-project/ginkgo/pull/1695)
++ Fix build issues for Intel OneAPI 2025.0 [#1718](https://github.com/ginkgo-project/ginkgo/pull/1718)
++ Fix inconsistencies between declaration and definition of functions and classes/structs, which mainly fixes clang-cl [#1725](https://github.com/ginkgo-project/ginkgo/pull/1725)
++ Fix undefined symbols in shared library in msys2/clang [#1724](https://github.com/ginkgo-project/ginkgo/pull/1724)
++ Fix page fault issues when running on multiple Intel GPUs in parallel [#1723](https://github.com/ginkgo-project/ginkgo/pull/1723)
++ Fix data races in several OMP kernels [#1743](https://github.com/ginkgo-project/ginkgo/pull/1743)
+
 ## Version 1.8.0
 
 The Ginkgo team is proud to announce the new Ginkgo minor release 1.8.0. This
@@ -1081,35 +1191,34 @@ About
 
 Ginkgo 1.0.0 is brought to you by:
 
-**Karlsruhe Institute of Technology**, Germany  
-**Universitat Jaume I**, Spain  
-**University of Tennessee, Knoxville**, US  
+**Karlsruhe Institute of Technology**, Germany
+**Universitat Jaume I**, Spain
+**University of Tennessee, Knoxville**, US
 
 These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo.
 
 Ginkgo 1.0.0 contains contributions from:
 
-**Hartwig Anzt**, Karlsruhe Institute of Technology  
-**Yenchen Chen**, National Taiwan University  
-**Terry Cojean**, Karlsruhe Institute of Technology  
-**Goran Flegar**, Universitat Jaume I  
-**Fritz Göbel**, Karlsruhe Institute of Technology  
-**Thomas Grützmacher**, Karlsruhe Institute of Technology  
-**Pratik Nayak**, Karlsruhe Institute of Technology  
-**Tobias Ribizel**, Karlsruhe Institute of Technology  
-**Yuhsiang Tsai**, National Taiwan University  
+**Hartwig Anzt**, Karlsruhe Institute of Technology
+**Yenchen Chen**, National Taiwan University
+**Terry Cojean**, Karlsruhe Institute of Technology
+**Goran Flegar**, Universitat Jaume I
+**Fritz Göbel**, Karlsruhe Institute of Technology
+**Thomas Grützmacher**, Karlsruhe Institute of Technology
+**Pratik Nayak**, Karlsruhe Institute of Technology
+**Tobias Ribizel**, Karlsruhe Institute of Technology
+**Yuhsiang Tsai**, National Taiwan University
 
 Supporting materials are provided by the following individuals:
 
-**David Rogers** - the Ginkgo logo  
-**Frithjof Fleischhammer** - the Ginkgo website  
+**David Rogers** - the Ginkgo logo
+**Frithjof Fleischhammer** - the Ginkgo website
 
 The development team is grateful to the following individuals for discussions and comments:
 
-**Erik Boman**  
-**Jelena Držaić**  
-**Mike Heroux**  
-**Mark Hoemmen**  
-**Timo Heister**  
-**Jens Saak**  
-
+**Erik Boman**
+**Jelena Držaić**
+**Mike Heroux**
+**Mark Hoemmen**
+**Timo Heister**
+**Jens Saak**

From 7b7992f54cb3f2d107d2b0d8fce35db37cbe2350 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Mon, 9 Dec 2024 14:56:34 +0100
Subject: [PATCH 446/448] [release] fix changelog

---
 CHANGELOG.md | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fb17d0d5d6a..254890e2d38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1191,34 +1191,34 @@ About
 
 Ginkgo 1.0.0 is brought to you by:
 
-**Karlsruhe Institute of Technology**, Germany
-**Universitat Jaume I**, Spain
-**University of Tennessee, Knoxville**, US
+**Karlsruhe Institute of Technology**, Germany  
+**Universitat Jaume I**, Spain  
+**University of Tennessee, Knoxville**, US  
 
 These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo.
 
 Ginkgo 1.0.0 contains contributions from:
 
-**Hartwig Anzt**, Karlsruhe Institute of Technology
-**Yenchen Chen**, National Taiwan University
-**Terry Cojean**, Karlsruhe Institute of Technology
-**Goran Flegar**, Universitat Jaume I
-**Fritz Göbel**, Karlsruhe Institute of Technology
-**Thomas Grützmacher**, Karlsruhe Institute of Technology
-**Pratik Nayak**, Karlsruhe Institute of Technology
-**Tobias Ribizel**, Karlsruhe Institute of Technology
-**Yuhsiang Tsai**, National Taiwan University
+**Hartwig Anzt**, Karlsruhe Institute of Technology  
+**Yenchen Chen**, National Taiwan University  
+**Terry Cojean**, Karlsruhe Institute of Technology  
+**Goran Flegar**, Universitat Jaume I  
+**Fritz Göbel**, Karlsruhe Institute of Technology  
+**Thomas Grützmacher**, Karlsruhe Institute of Technology  
+**Pratik Nayak**, Karlsruhe Institute of Technology  
+**Tobias Ribizel**, Karlsruhe Institute of Technology  
+**Yuhsiang Tsai**, National Taiwan University  
 
 Supporting materials are provided by the following individuals:
 
-**David Rogers** - the Ginkgo logo
-**Frithjof Fleischhammer** - the Ginkgo website
+**David Rogers** - the Ginkgo logo  
+**Frithjof Fleischhammer** - the Ginkgo website  
 
 The development team is grateful to the following individuals for discussions and comments:
 
-**Erik Boman**
-**Jelena Držaić**
-**Mike Heroux**
-**Mark Hoemmen**
-**Timo Heister**
-**Jens Saak**
+**Erik Boman**  
+**Jelena Držaić**  
+**Mike Heroux**  
+**Mark Hoemmen**  
+**Timo Heister**  
+**Jens Saak**  

From 036c7fedd4a5e42b42632dfa45eac93f0c172ab6 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Thu, 6 Jun 2024 15:21:42 +0200
Subject: [PATCH 447/448] Version: Develop now prepares 1.9.0

---
 CMakeLists.txt                                                  | 2 +-
 examples/adaptiveprecision-blockjacobi/CMakeLists.txt           | 2 +-
 examples/batched-solver/CMakeLists.txt                          | 2 +-
 examples/build-setup.sh                                         | 2 +-
 examples/cb-gmres/CMakeLists.txt                                | 2 +-
 examples/custom-logger/CMakeLists.txt                           | 2 +-
 examples/custom-matrix-format/CMakeLists.txt                    | 2 +-
 examples/custom-stopping-criterion/CMakeLists.txt               | 2 +-
 examples/ginkgo-overhead/CMakeLists.txt                         | 2 +-
 examples/ginkgo-ranges/CMakeLists.txt                           | 2 +-
 examples/heat-equation/CMakeLists.txt                           | 2 +-
 examples/ilu-preconditioned-solver/CMakeLists.txt               | 2 +-
 examples/inverse-iteration/CMakeLists.txt                       | 2 +-
 examples/ir-ilu-preconditioned-solver/CMakeLists.txt            | 2 +-
 examples/iterative-refinement/CMakeLists.txt                    | 2 +-
 examples/kokkos-assembly/CMakeLists.txt                         | 2 +-
 examples/minimal-cuda-solver/CMakeLists.txt                     | 2 +-
 examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt   | 2 +-
 examples/mixed-multigrid-solver/CMakeLists.txt                  | 2 +-
 examples/mixed-precision-ir/CMakeLists.txt                      | 2 +-
 examples/mixed-spmv/CMakeLists.txt                              | 2 +-
 .../multigrid-preconditioned-solver-customized/CMakeLists.txt   | 2 +-
 examples/multigrid-preconditioned-solver/CMakeLists.txt         | 2 +-
 examples/nine-pt-stencil-solver/CMakeLists.txt                  | 2 +-
 examples/papi-logging/CMakeLists.txt                            | 2 +-
 examples/par-ilu-convergence/CMakeLists.txt                     | 2 +-
 examples/performance-debugging/CMakeLists.txt                   | 2 +-
 examples/poisson-solver/CMakeLists.txt                          | 2 +-
 examples/preconditioned-solver/CMakeLists.txt                   | 2 +-
 examples/preconditioner-export/CMakeLists.txt                   | 2 +-
 examples/reordered-preconditioned-solver/CMakeLists.txt         | 2 +-
 examples/schroedinger-splitting/CMakeLists.txt                  | 2 +-
 examples/simple-solver-logging/CMakeLists.txt                   | 2 +-
 examples/simple-solver/CMakeLists.txt                           | 2 +-
 examples/three-pt-stencil-solver/CMakeLists.txt                 | 2 +-
 35 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 913dd38f06d..482f853f753 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.16)
 
-project(Ginkgo LANGUAGES CXX VERSION 1.8.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
+project(Ginkgo LANGUAGES CXX VERSION 1.9.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
 set(Ginkgo_VERSION_TAG "master")
 set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
 if(Ginkgo_VERSION_TAG STREQUAL "master")
diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
index 8edbdc88af0..7b61bf6a175 100644
--- a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
+++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
@@ -3,7 +3,7 @@ project(adaptiveprecision-blockjacobi)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(adaptiveprecision-blockjacobi adaptiveprecision-blockjacobi.cpp)
diff --git a/examples/batched-solver/CMakeLists.txt b/examples/batched-solver/CMakeLists.txt
index 33c3d332b96..cc6b694a54a 100644
--- a/examples/batched-solver/CMakeLists.txt
+++ b/examples/batched-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(batched-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(batched-solver batched-solver.cpp)
diff --git a/examples/build-setup.sh b/examples/build-setup.sh
index c4fa523640f..c153be6a77f 100755
--- a/examples/build-setup.sh
+++ b/examples/build-setup.sh
@@ -3,7 +3,7 @@
 # copy libraries
 LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip ginkgo_dpcpp ginkgo_device"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
-VERSION="1.8.0"
+VERSION="1.9.0"
 for name in ${LIBRARY_NAMES}; do
     for suffix in ${SUFFIXES}; do
         cp ${BUILD_DIR}/lib/lib${name}${suffix}.${VERSION} \
diff --git a/examples/cb-gmres/CMakeLists.txt b/examples/cb-gmres/CMakeLists.txt
index 8dcc43376b4..f4283c45a34 100644
--- a/examples/cb-gmres/CMakeLists.txt
+++ b/examples/cb-gmres/CMakeLists.txt
@@ -3,7 +3,7 @@ project(cb-gmres)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(cb-gmres cb-gmres.cpp)
diff --git a/examples/custom-logger/CMakeLists.txt b/examples/custom-logger/CMakeLists.txt
index 2278e0848d2..ef355c2ebb7 100644
--- a/examples/custom-logger/CMakeLists.txt
+++ b/examples/custom-logger/CMakeLists.txt
@@ -3,7 +3,7 @@ project(custom-logger)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(custom-logger custom-logger.cpp)
diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt
index 9a1280ff9f5..493437f9789 100644
--- a/examples/custom-matrix-format/CMakeLists.txt
+++ b/examples/custom-matrix-format/CMakeLists.txt
@@ -3,7 +3,7 @@ project(custom-matrix-format CXX CUDA)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
     find_package(OpenMP 3.0 REQUIRED)
 endif()
 
diff --git a/examples/custom-stopping-criterion/CMakeLists.txt b/examples/custom-stopping-criterion/CMakeLists.txt
index d77d3d2247a..1cecf5b87ec 100644
--- a/examples/custom-stopping-criterion/CMakeLists.txt
+++ b/examples/custom-stopping-criterion/CMakeLists.txt
@@ -3,7 +3,7 @@ project(custom-stopping-criterion)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
     set(THREADS_PREFER_PTHREAD_FLAG ON)
     find_package(Threads REQUIRED)
 endif()
diff --git a/examples/ginkgo-overhead/CMakeLists.txt b/examples/ginkgo-overhead/CMakeLists.txt
index 382a6813cab..5f888d77aa7 100644
--- a/examples/ginkgo-overhead/CMakeLists.txt
+++ b/examples/ginkgo-overhead/CMakeLists.txt
@@ -3,7 +3,7 @@ project(ginkgo-overhead)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(ginkgo-overhead ginkgo-overhead.cpp)
diff --git a/examples/ginkgo-ranges/CMakeLists.txt b/examples/ginkgo-ranges/CMakeLists.txt
index e0b3dfcad99..c4f45e63211 100644
--- a/examples/ginkgo-ranges/CMakeLists.txt
+++ b/examples/ginkgo-ranges/CMakeLists.txt
@@ -3,7 +3,7 @@ project(ginkgo-ranges)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 add_executable(ginkgo-ranges ginkgo-ranges.cpp)
 target_link_libraries(ginkgo-ranges Ginkgo::ginkgo)
diff --git a/examples/heat-equation/CMakeLists.txt b/examples/heat-equation/CMakeLists.txt
index 0029b195613..d7d7a994e33 100644
--- a/examples/heat-equation/CMakeLists.txt
+++ b/examples/heat-equation/CMakeLists.txt
@@ -3,7 +3,7 @@ project(heat-equation)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 find_package(OpenCV REQUIRED)
 
diff --git a/examples/ilu-preconditioned-solver/CMakeLists.txt b/examples/ilu-preconditioned-solver/CMakeLists.txt
index 07485b6afd8..fa84e6163a2 100644
--- a/examples/ilu-preconditioned-solver/CMakeLists.txt
+++ b/examples/ilu-preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(ilu-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(ilu-preconditioned-solver ilu-preconditioned-solver.cpp)
diff --git a/examples/inverse-iteration/CMakeLists.txt b/examples/inverse-iteration/CMakeLists.txt
index 5b2a1872b72..3f70d329981 100644
--- a/examples/inverse-iteration/CMakeLists.txt
+++ b/examples/inverse-iteration/CMakeLists.txt
@@ -3,7 +3,7 @@ project(inverse-iteration)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(inverse-iteration inverse-iteration.cpp)
diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
index 178a0fa0044..37bcd36e9f6 100644
--- a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
+++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(ir-ilu-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(ir-ilu-preconditioned-solver ir-ilu-preconditioned-solver.cpp)
diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt
index ee80ee7d008..2dc7bef6b79 100644
--- a/examples/iterative-refinement/CMakeLists.txt
+++ b/examples/iterative-refinement/CMakeLists.txt
@@ -3,7 +3,7 @@ project(iterative-refinement)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(iterative-refinement iterative-refinement.cpp)
diff --git a/examples/kokkos-assembly/CMakeLists.txt b/examples/kokkos-assembly/CMakeLists.txt
index 39c4cabd57b..45bf7ffa6f9 100644
--- a/examples/kokkos-assembly/CMakeLists.txt
+++ b/examples/kokkos-assembly/CMakeLists.txt
@@ -3,7 +3,7 @@ project(kokkos-assembly CXX)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if(NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 find_package(Kokkos 4.1.00 REQUIRED)
 
diff --git a/examples/minimal-cuda-solver/CMakeLists.txt b/examples/minimal-cuda-solver/CMakeLists.txt
index 4262f98e719..84088a76e7f 100644
--- a/examples/minimal-cuda-solver/CMakeLists.txt
+++ b/examples/minimal-cuda-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(minimal-cuda-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(minimal-cuda-solver minimal-cuda-solver.cpp)
diff --git a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
index 3ca157faeb5..182d4dd155d 100644
--- a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
+++ b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(mixed-multigrid-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(mixed-multigrid-preconditioned-solver mixed-multigrid-preconditioned-solver.cpp)
diff --git a/examples/mixed-multigrid-solver/CMakeLists.txt b/examples/mixed-multigrid-solver/CMakeLists.txt
index 0e4eda7e5af..b143cec6449 100644
--- a/examples/mixed-multigrid-solver/CMakeLists.txt
+++ b/examples/mixed-multigrid-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(mixed-multigrid-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(mixed-multigrid-solver mixed-multigrid-solver.cpp)
diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt
index c540a7d00b6..e9cfc26abd7 100644
--- a/examples/mixed-precision-ir/CMakeLists.txt
+++ b/examples/mixed-precision-ir/CMakeLists.txt
@@ -3,7 +3,7 @@ project(mixed-precision-ir)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(mixed-precision-ir mixed-precision-ir.cpp)
diff --git a/examples/mixed-spmv/CMakeLists.txt b/examples/mixed-spmv/CMakeLists.txt
index e65ab60186b..0feadf1e038 100644
--- a/examples/mixed-spmv/CMakeLists.txt
+++ b/examples/mixed-spmv/CMakeLists.txt
@@ -3,7 +3,7 @@ project(mixed-spmv)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(mixed-spmv mixed-spmv.cpp)
diff --git a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
index a9c85fdcb21..7c0d93efe42 100644
--- a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
+++ b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
@@ -3,7 +3,7 @@ project(multigrid-preconditioned-solver-customized)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(multigrid-preconditioned-solver-customized multigrid-preconditioned-solver-customized.cpp)
diff --git a/examples/multigrid-preconditioned-solver/CMakeLists.txt b/examples/multigrid-preconditioned-solver/CMakeLists.txt
index e5938656169..2c73b3fa037 100644
--- a/examples/multigrid-preconditioned-solver/CMakeLists.txt
+++ b/examples/multigrid-preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(multigrid-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(multigrid-preconditioned-solver multigrid-preconditioned-solver.cpp)
diff --git a/examples/nine-pt-stencil-solver/CMakeLists.txt b/examples/nine-pt-stencil-solver/CMakeLists.txt
index a29230fab2d..64743593c70 100644
--- a/examples/nine-pt-stencil-solver/CMakeLists.txt
+++ b/examples/nine-pt-stencil-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(nine-pt-stencil-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(nine-pt-stencil-solver nine-pt-stencil-solver.cpp)
diff --git a/examples/papi-logging/CMakeLists.txt b/examples/papi-logging/CMakeLists.txt
index 2c71316b37b..37b5a1352df 100644
--- a/examples/papi-logging/CMakeLists.txt
+++ b/examples/papi-logging/CMakeLists.txt
@@ -3,7 +3,7 @@ project(papi-logging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 if (NOT GINKGO_HAVE_PAPI_SDE)
diff --git a/examples/par-ilu-convergence/CMakeLists.txt b/examples/par-ilu-convergence/CMakeLists.txt
index f285ccf415e..f8f0f037c16 100644
--- a/examples/par-ilu-convergence/CMakeLists.txt
+++ b/examples/par-ilu-convergence/CMakeLists.txt
@@ -3,7 +3,7 @@ project(par-ilu-convergence)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(par-ilu-convergence par-ilu-convergence.cpp)
diff --git a/examples/performance-debugging/CMakeLists.txt b/examples/performance-debugging/CMakeLists.txt
index 2f9ea970cbf..36e906553fa 100644
--- a/examples/performance-debugging/CMakeLists.txt
+++ b/examples/performance-debugging/CMakeLists.txt
@@ -3,7 +3,7 @@ project(performance-debugging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(performance-debugging performance-debugging.cpp)
diff --git a/examples/poisson-solver/CMakeLists.txt b/examples/poisson-solver/CMakeLists.txt
index f3ae7a3353b..1bc92aa2c9b 100644
--- a/examples/poisson-solver/CMakeLists.txt
+++ b/examples/poisson-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(poisson-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(poisson-solver poisson-solver.cpp)
diff --git a/examples/preconditioned-solver/CMakeLists.txt b/examples/preconditioned-solver/CMakeLists.txt
index 60b2b7a35d5..09f22537619 100644
--- a/examples/preconditioned-solver/CMakeLists.txt
+++ b/examples/preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 add_executable(preconditioned-solver preconditioned-solver.cpp)
 target_link_libraries(preconditioned-solver Ginkgo::ginkgo)
diff --git a/examples/preconditioner-export/CMakeLists.txt b/examples/preconditioner-export/CMakeLists.txt
index e7f07c89608..f928859bd1a 100644
--- a/examples/preconditioner-export/CMakeLists.txt
+++ b/examples/preconditioner-export/CMakeLists.txt
@@ -3,7 +3,7 @@ project(preconditioner-export)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(preconditioner-export preconditioner-export.cpp)
diff --git a/examples/reordered-preconditioned-solver/CMakeLists.txt b/examples/reordered-preconditioned-solver/CMakeLists.txt
index 17c17f2b71c..bfaceef782a 100644
--- a/examples/reordered-preconditioned-solver/CMakeLists.txt
+++ b/examples/reordered-preconditioned-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(reordered-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 add_executable(reordered-preconditioned-solver reordered-preconditioned-solver.cpp)
 target_link_libraries(reordered-preconditioned-solver Ginkgo::ginkgo)
diff --git a/examples/schroedinger-splitting/CMakeLists.txt b/examples/schroedinger-splitting/CMakeLists.txt
index 730f9c415e9..b782095bdbd 100644
--- a/examples/schroedinger-splitting/CMakeLists.txt
+++ b/examples/schroedinger-splitting/CMakeLists.txt
@@ -3,7 +3,7 @@ project(schroedinger-splitting)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 find_package(OpenCV REQUIRED)
 
diff --git a/examples/simple-solver-logging/CMakeLists.txt b/examples/simple-solver-logging/CMakeLists.txt
index 6b54d3af791..ef36b7ab041 100644
--- a/examples/simple-solver-logging/CMakeLists.txt
+++ b/examples/simple-solver-logging/CMakeLists.txt
@@ -3,7 +3,7 @@ project(simple-solver-logging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(simple-solver-logging simple-solver-logging.cpp)
diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt
index b7d7b15df78..2d274f50f81 100644
--- a/examples/simple-solver/CMakeLists.txt
+++ b/examples/simple-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(simple-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(simple-solver simple-solver.cpp)
diff --git a/examples/three-pt-stencil-solver/CMakeLists.txt b/examples/three-pt-stencil-solver/CMakeLists.txt
index ab456f062ae..bd7ce57a171 100644
--- a/examples/three-pt-stencil-solver/CMakeLists.txt
+++ b/examples/three-pt-stencil-solver/CMakeLists.txt
@@ -3,7 +3,7 @@ project(three-pt-stencil-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.8.0 REQUIRED)
+    find_package(Ginkgo 1.9.0 REQUIRED)
 endif()
 
 add_executable(three-pt-stencil-solver three-pt-stencil-solver.cpp)

From 02dd762b26c2212a843552aa3769bf50f117eaed Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 18 Oct 2024 10:33:39 +0200
Subject: [PATCH 448/448] clear unused version switch macro and change the
 badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 598b17e5b5b..0ca9befb690 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 <div align="center">
 
-[![License](https://img.shields.io/github/license/ginkgo-project/ginkgo.svg)](./LICENSE)|[![c++ standard](https://img.shields.io/badge/c%2B%2B-14-blue.svg)](https://en.wikipedia.org/wiki/C%2B%2B#Standardization)|[![Documentation](https://img.shields.io/badge/Documentation-latest-blue.svg)](https://ginkgo-project.github.io/ginkgo-generated-documentation/doc/master/)|[![DOI](https://joss.theoj.org/papers/10.21105/joss.02260/status.svg)](https://doi.org/10.21105/joss.02260)
+[![License](https://img.shields.io/github/license/ginkgo-project/ginkgo.svg)](./LICENSE)|[![c++ standard](https://img.shields.io/badge/c%2B%2B-17-blue.svg)](https://en.wikipedia.org/wiki/C%2B%2B#Standardization)|[![Documentation](https://img.shields.io/badge/Documentation-latest-blue.svg)](https://ginkgo-project.github.io/ginkgo-generated-documentation/doc/master/)|[![DOI](https://joss.theoj.org/papers/10.21105/joss.02260/status.svg)](https://doi.org/10.21105/joss.02260)
 |:-:|:-:|:-:|:-:|