From d394a1f4c76b2092a7fa6dd5a0179b7e45445d4e Mon Sep 17 00:00:00 2001
From: Tobias Meyer Andersen
Date: Thu, 5 Dec 2024 10:47:05 +0100
Subject: [PATCH 1/5] use cudagraphs in gpu DILU and ILU0

apply is also supported in HIP, though it does not seem to affect
performance in any clear way. 1.1x to 1.2x speedup on Nvidia GPUs.
---
 opm/simulators/linalg/gpuistl/GpuBuffer.cpp   |  21 ++
 opm/simulators/linalg/gpuistl/GpuBuffer.hpp   |  19 ++
 opm/simulators/linalg/gpuistl/GpuDILU.cpp     |  81 +++--
 opm/simulators/linalg/gpuistl/GpuDILU.hpp     |  15 +-
 .../linalg/gpuistl/GpuSparseMatrix.cpp        |  13 +
 .../linalg/gpuistl/GpuSparseMatrix.hpp        |  12 +
 opm/simulators/linalg/gpuistl/GpuVector.cpp   |  13 +
 opm/simulators/linalg/gpuistl/GpuVector.hpp   |   1 +
 opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp  |  81 +++--
 opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp  |  12 +-
 .../detail/gpusparse_matrix_operations.cu     |  53 +++-
 .../detail/gpusparse_matrix_operations.hpp    |  49 +++
 .../preconditionerKernels/DILUKernels.cu      | 279 +++++++++++++-----
 .../preconditionerKernels/DILUKernels.hpp     |  18 +-
 .../preconditionerKernels/ILU0Kernels.cu      | 142 ++++-----
 .../preconditionerKernels/ILU0Kernels.hpp     |  18 +-
 16 files changed, 628 insertions(+), 199 deletions(-)

diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
index b4132ce09fb..dba271d68d7 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -174,6 +174,19 @@ GpuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
     OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }
 
+template <class T>
+void
+GpuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > size()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Buffer has {} elements, while {} was requested.",
+                              size(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template <class T>
 void
 GpuBuffer<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
@@ -188,6 +201,14 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
 {
     copyFromHost(data.data(), data.size());
 }
+
+template <class T>
+void
+GpuBuffer<T>::copyFromHost(const std::vector<T>& data, cudaStream_t stream)
+{
+    copyFromHost(data.data(), data.size(), stream);
+}
+
 template <class T>
 void
 GpuBuffer<T>::copyToHost(std::vector<T>& data) const
diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
index 369ea3ba28f..1865d7a71d3 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <cuda_runtime.h>
 
 namespace Opm::gpuistl
 {
@@ -174,6 +175,15 @@ class GpuBuffer
      */
    void copyFromHost(const T* dataPointer, size_t numberOfElements);
 
+    /**
+     * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer
+     * @param dataPointer raw pointer to CPU memory
+     * @param numberOfElements number of elements to copy
+     * @note This does asynchronous transfer on the given stream.
+     * @note assumes that this buffer has numberOfElements elements
+     */
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
+
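[editorial note on the asynchronous overload above: cudaMemcpyAsync only overlaps with other work when the host
allocation is pinned; from pageable memory the transfer is staged and may synchronize with the stream. A hypothetical
usage sketch, where the element count n, the stream, and the GpuBuffer size constructor are assumptions made for
illustration:

    // Sketch only: pin the host memory so the enqueued copy is truly asynchronous.
    std::vector<double> hostData(n, 0.0);
    OPM_GPU_SAFE_CALL(cudaHostRegister(hostData.data(), n * sizeof(double), cudaHostRegisterDefault));
    Opm::gpuistl::GpuBuffer<double> deviceBuffer(n);
    deviceBuffer.copyFromHost(hostData.data(), n, stream); // returns immediately; copy runs on 'stream'
    OPM_GPU_SAFE_CALL(cudaStreamSynchronize(stream));      // wait before modifying or freeing hostData
    OPM_GPU_SAFE_CALL(cudaHostUnregister(hostData.data()));
]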
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
      * @param dataPointer raw pointer to CPU memory
@@ -192,6 +202,15 @@ class GpuBuffer
      */
     void copyFromHost(const std::vector<T>& data);
 
+    /**
+     * @brief copyFromHost copies data from an std::vector
+     * @param data the vector to copy from
+     *
+     * @note This does asynchronous transfer on the given stream.
+     * @note This assumes that the size of this buffer is equal to the size of the input vector.
+     */
+    void copyFromHost(const std::vector<T>& data, cudaStream_t stream);
+
     /**
      * @brief copyToHost copies data to an std::vector
      * @param data the vector to copy to
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index d311386b00d..b76bfe1ed61 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 namespace Opm::gpuistl
 {
 
@@ -54,6 +55,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
     , m_tuneThreadBlockSizes(tuneKernels)
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }
 
-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
 
@@ -135,7 +154,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInvFloat->data(),
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
                 detail::DILU::solveLowerLevelSetSplit(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInv.data(),
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             } else {
                 detail::DILU::solveLowerLevelSetSplit(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInv.data(),
                     d.data(),
                     v.data(),
-
lowerSolveThreadBlockSize); + lowerSolveThreadBlockSize, + m_stream); } } else { detail::DILU::solveLowerLevelSet( @@ -172,7 +194,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), d.data(), v.data(), - lowerSolveThreadBlockSize); + lowerSolveThreadBlockSize, + m_stream); } levelStartIdx += numOfRowsInLevel; } @@ -193,7 +216,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInvFloat->data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), @@ -204,7 +228,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -215,7 +240,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } else { detail::DILU::solveUpperLevelSet( @@ -227,7 +253,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } } @@ -252,7 +279,9 @@ GpuDILU::update() { OPM_TIMEBLOCK(prec_update); { - update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(m_moveThreadBlockSize); + computeDiagonal(m_DILUFactorizationThreadBlockSize); } } @@ -260,13 +289,14 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { - m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu - computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize); + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(moveThreadBlockSize); + computeDiagonal(factorizationBlockSize); } template void -GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize) +GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) { if (m_splitMatrix) { detail::copyMatDataToReorderedSplit( @@ -280,7 +310,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReorderedDiag->data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -288,9 +319,15 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReordered->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReordered->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } +} +template +void +GpuDILU::computeDiagonal(int factorizationBlockSize) +{ int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); @@ -312,7 +349,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuDInvFloat->data(), 
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -330,7 +368,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in nullptr, m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } else { // TODO: should this be field type twice or field type then float in the template? detail::DILU::computeDiluDiagonalSplit( @@ -349,7 +388,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in nullptr, nullptr, nullptr, - factorizationBlockSize); + factorizationBlockSize, + m_stream); } } else { detail::DILU::computeDiluDiagonal( @@ -361,7 +401,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in levelStartIdx, numOfRowsInLevel, m_gpuDInv.data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 241eef3e98d..bde85aff6b3 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -25,7 +25,8 @@ #include #include #include - +#include +#include namespace Opm::gpuistl @@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \brief Updates the matrix data. void update() final; + //! \brief perform matrix splitting and reordering + void reorderAndSplitMatrix(int moveThreadBlockSize); + //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix - void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize); + void computeDiagonal(int factorizationThreadBlockSize); //! 
\brief function that will experimentally tune the thread block sizes of the important cuda kernels void tuneThreadBlockSizes(); @@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate int m_lowerSolveThreadBlockSize = -1; int m_moveThreadBlockSize = -1; int m_DILUFactorizationThreadBlockSize = -1; + + // Graphs for Apply + std::map, cudaGraph_t> m_apply_graphs; + std::map, cudaGraphExec_t> m_executableGraphs; + + // Stream for the DILU operations on the GPU + cudaStream_t m_stream; }; } // end namespace Opm::gpuistl diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index c2b49cb71d2..6277118e8ae 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -183,6 +183,19 @@ GpuSparseMatrix::updateNonzeroValues(const MatrixType& matrix, bool copyNonZe } } +// template +// template +// void +// GpuSparseMatrix::updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) +// { +// OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); +// OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); +// OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); + +// const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); +// m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); +// } + template void GpuSparseMatrix::setUpperTriangular() diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index b2d4a954c50..5aca19f5b85 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -293,6 +293,18 @@ class GpuSparseMatrix template void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false); + + template + void updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) + { + OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); + OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); + OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); + + const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); + m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); + } + private: GpuVector m_nonZeroElements; GpuVector m_columnIndices; diff --git a/opm/simulators/linalg/gpuistl/GpuVector.cpp b/opm/simulators/linalg/gpuistl/GpuVector.cpp index ddafc1dbfd8..af39e99419e 100644 --- a/opm/simulators/linalg/gpuistl/GpuVector.cpp +++ b/opm/simulators/linalg/gpuistl/GpuVector.cpp @@ -266,6 +266,19 @@ GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements) OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); } +template +void +GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream) +{ + if (numberOfElements > dim()) { + OPM_THROW(std::runtime_error, + fmt::format("Requesting to copy too many elements. 
Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template <class T>
 void
 GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.hpp b/opm/simulators/linalg/gpuistl/GpuVector.hpp
index e369d1d9d1d..66a1ebd65e1 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
      * @note assumes that this vector has numberOfElements elements
      */
     void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream); // asynchronous version of the above
 
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
index 686b8f09f4a..394f83edb9b 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 namespace Opm::gpuistl
 {
 
@@ -59,6 +60,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +101,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         }
     }
 
-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -117,7 +121,22 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
 
@@ -142,7 +161,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                     numOfRowsInLevel,
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             }
             else{
                 detail::ILU0::solveLowerLevelSetSplit(
@@ -154,7 +174,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                     numOfRowsInLevel,
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             }
         } else {
            detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +186,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                                             numOfRowsInLevel,
                                             d.data(),
                                             v.data(),
-                                             lowerSolveThreadBlockSize);
+                                             lowerSolveThreadBlockSize,
+                                             m_stream);
        }
        levelStartIdx +=
numOfRowsInLevel; } @@ -185,7 +207,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiagFloat.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::ILU0::solveUpperLevelSetSplit( @@ -197,7 +220,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiag.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else{ detail::ILU0::solveUpperLevelSetSplit( @@ -209,7 +233,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiag.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -219,7 +244,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i levelStartIdx, numOfRowsInLevel, v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } } @@ -241,10 +267,9 @@ template void OpmGpuILU0::update() { - OPM_TIMEBLOCK(prec_update); - { - update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); - } + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(m_moveThreadBlockSize); + LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize); } template @@ -254,12 +279,14 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB OPM_TIMEBLOCK(prec_update); { m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu - LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize); + reorderAndSplitMatrix(moveThreadBlockSize); + LUFactorizeMatrix(factorizationThreadBlockSize); } } + template void -OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize) +OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) { if (m_splitMatrix) { detail::copyMatDataToReorderedSplit( @@ -273,7 +300,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuMatrixReorderedDiag.value().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -281,8 +309,15 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuReorderedLU->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuReorderedLU->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } +} + +template +void +OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) +{ int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); @@ -304,7 +339,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize); + factorizationThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::ILU0::LUFactorizationSplit( @@ -322,7 +358,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - 
factorizationThreadBlockSize);
+                    factorizationThreadBlockSize,
+                    m_stream);
            }
            else{
                detail::ILU0::LUFactorizationSplit(
@@ -340,7 +377,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                    m_gpuNaturalToReorder.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
-                    factorizationThreadBlockSize);
+                    factorizationThreadBlockSize,
+                    m_stream);
            }
        } else {
            detail::ILU0::LUFactorization(
                m_gpuReorderedLU->getNonZeroValues().data(),
@@ -351,7 +389,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                m_gpuReorderToNatural.data(),
                numOfRowsInLevel,
                levelStartIdx,
-                factorizationThreadBlockSize);
+                factorizationThreadBlockSize,
+                m_stream);
        }
        levelStartIdx += numOfRowsInLevel;
    }
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index c163e048326..064818c68fc 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -84,8 +84,11 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief Perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -152,6 +155,13 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_ILU0FactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, cudaGraph_t> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the ILU0 operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
index 52b8a166eca..3f4a095e3f9 100644
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
@@ -116,6 +116,24 @@ copyMatDataToReordered(T* srcMatrix,
         srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
 }
 
+template <class T, int blocksize>
+void
+copyMatDataToReordered(T* srcMatrix,
+                       int* srcRowIndices,
+                       T* dstMatrix,
+                       int* dstRowIndices,
+                       int* naturalToReordered,
+                       size_t numberOfRows,
+                       int thrBlockSize,
+                       cudaStream_t stream)
+{
+    int threadBlockSize
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
+}
+
 template <class T, int blocksize>
 void
 copyMatDataToReorderedSplit(T* srcMatrix,
@@ -145,9 +163,42 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                             numberOfRows);
 }
 
+template <class T, int blocksize>
+void
+copyMatDataToReorderedSplit(T* srcMatrix,
+                            int* srcRowIndices,
+                            int* srcColumnIndices,
+                            T* dstLowerMatrix,
+                            int* dstLowerRowIndices,
+                            T* dstUpperMatrix,
+                            int* dstUpperRowIndices,
+                            T* dstDiag,
+                            int* naturalToReordered,
+                            size_t numberOfRows,
+                            int thrBlockSize,
+                            cudaStream_t stream)
+{
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
+        cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(srcMatrix,
+                                                                                            srcRowIndices,
+                                                                                            srcColumnIndices,
+                                                                                            dstLowerMatrix,
+                                                                                            dstLowerRowIndices,
+                                                                                            dstUpperMatrix,
+                                                                                            dstUpperRowIndices,
+                                                                                            dstDiag,
+                                                                                            naturalToReordered,
+                                                                                            numberOfRows);
+}
+
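[editorial note: the stream-taking wrappers above differ from the synchronous ones only in the fourth launch
parameter. Routing every launch through the caller's stream is also what lets GpuDILU::apply and OpmGpuILU0::apply
record them with cudaStreamBeginCapture, since CUDA rejects work submitted to the legacy default stream while a
global capture is active. The shape of such a wrapper, as a standalone sketch in which 'exampleKernel' is a
placeholder:

    // Sketch; 'exampleKernel' stands in for cuMoveDataToReordered and friends.
    __global__ void exampleKernel(double* data, size_t n);

    void launchOnStream(double* data, size_t n, int blockSize, cudaStream_t stream)
    {
        const int numBlocks = static_cast<int>((n + blockSize - 1) / blockSize); // ceil(n / blockSize)
        exampleKernel<<<numBlocks, blockSize, 0, stream>>>(data, n); // fourth argument selects the stream
    }
]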
 
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
     template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int);                        \
-    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
+    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int, cudaStream_t);          \
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);\
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int, cudaStream_t);
 
 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
index 3e953a5056a..90ed59e8336 100644
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
@@ -41,6 +41,27 @@ void copyMatDataToReordered(T* srcMatrix,
                             size_t numberOfRows,
                             int threadBlockSize);
 
+// The stream overload below launches its kernel on the caller-supplied CUDA stream
+/**
+ * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
+ * @param srcMatrix The source matrix we will copy data from
+ * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
+ * @param [out] dstMatrix The destination matrix that we copy data to
+ * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination matrix compliant with bsr
+ * format
+ * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
+ * @param numberOfRows The number of rows in the matrices
+ */
+template <class T, int blocksize>
+void copyMatDataToReordered(T* srcMatrix,
+                            int* srcRowIndices,
+                            T* dstMatrix,
+                            int* dstRowIndices,
+                            int* naturalToReordered,
+                            size_t numberOfRows,
+                            int threadBlockSize,
+                            cudaStream_t stream);
+
 /**
  * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
  * @param srcMatrix The source matrix we will copy data from
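[editorial note: the reordering these functions document is a per-row gather; each thread copies the nonzeros of one
source row into the position the permutation assigns to it. A simplified scalar (blocksize 1) sketch of the idea,
not the actual cuMoveDataToReordered kernel:

    // Simplified sketch, blocksize 1; the real kernel moves blocksize*blocksize scalars per block.
    __global__ void moveDataToReorderedSketch(const double* src,
                                              const int* srcRowIndices,
                                              double* dst,
                                              const int* dstRowIndices,
                                              const int* naturalToReordered,
                                              size_t numberOfRows)
    {
        const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x;
        if (srcRow < numberOfRows) {
            const auto dstRow = naturalToReordered[srcRow];
            // both rows hold the same number of nonzeros, so a parallel walk over the two index ranges suffices
            for (int i = srcRowIndices[srcRow], j = dstRowIndices[dstRow]; i < srcRowIndices[srcRow + 1]; ++i, ++j) {
                dst[j] = src[i];
            }
        }
    }
]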
@@ -68,5 +89,33 @@ void copyMatDataToReorderedSplit(T* srcMatrix,
                                  size_t numberOfRows,
                                  int threadBlockSize);
 
+/**
+ * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
+ * @param srcMatrix The source matrix we will copy data from
+ * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
+ * @param [out] dstLowerMatrix The destination of entries that originates from the strictly lower triangular matrix
+ * @param dstLowerRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant
+ * with bsr format
+ * @param [out] dstUpperMatrix The destination of entries that originates from the strictly upper triangular matrix
+ * @param dstUpperRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant
+ * with bsr format
+ * @param [out] dstDiag The destination buffer for the diagonal part of the matrix
+ * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
+ * @param numberOfRows The number of rows in the matrices
+ */
+template <class T, int blocksize>
+void copyMatDataToReorderedSplit(T* srcMatrix,
+                                 int* srcRowIndices,
+                                 int* srcColumnIndices,
+                                 T* dstLowerMatrix,
+                                 int* dstLowerRowIndices,
+                                 T* dstUpperMatrix,
+                                 int* dstUpperRowIndices,
+                                 T* dstDiag,
+                                 int* naturalToReordered,
+                                 size_t numberOfRows,
+                                 int threadBlockSize,
+                                 cudaStream_t stream);
+
 } // namespace Opm::gpuistl::detail
 #endif
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index b8f9f48ca5b..d2ad2f1cd6e 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -85,10 +85,12 @@ namespace
         // TODO: removce the first condition in the for loop
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            mmvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
 
@@ -137,10 +139,12 @@ namespace
         LinearSolverScalar rhs[blocksize] = {0};
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            umvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mmvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
 
@@ -211,8 +215,8 @@ namespace
         }
     }
 
-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TOOD: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results. TODO: The important part is to only cast after that is fully computed
     template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
    __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                               int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
             InputScalar dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
                 for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j]
+                        = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                 }
             }
 
@@ -257,21 +262,26 @@ namespace
 
                 if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
-                    moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                    moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedLowerMat[block * blocksize * blocksize],
+                        &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                        &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
                 }
 
mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize], - &dInv[col * blocksize * blocksize], - &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], - dInvTmp); + &dInv[col * blocksize * blocksize], + &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], + dInvTmp); } invBlockInPlace(dInvTmp); moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) { - moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! + moveBlock( + dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! } } } @@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat, const T* dInv, const T* d, T* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSet<<>>( + cuSolveLowerLevelSet<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } @@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); + cuSolveLowerLevelSetSplit + <<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set template @@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat, int rowsInLevelSet, const T* dInv, T* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSet<<>>( + cuSolveUpperLevelSet<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); + cuSolveUpperLevelSetSplit + <<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } template @@ -367,20 +383,21 @@ computeDiluDiagonal(T* reorderedMat, const int startIdx, int rowsInLevelSet, T* dInv, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonal, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, 
threadBlockSize); - cuComputeDiluDiagonal<<>>(reorderedMat, - rowIndices, - colIndices, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv); + cuComputeDiluDiagonal<<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } @@ -403,49 +420,151 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiag, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - srcReorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv, - dstDiag, - dstLowerMat, - dstUpperMat); + cuComputeDiluDiagonalSplit + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + srcReorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } -// TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + template void computeDiluDiagonal( \ + T*, int*, int*, int*, int*, const int, int, T*, int, cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void 
computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + template void solveLowerLevelSet( \ + T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); +// template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ + // template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + // template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); @@ -460,19 +579,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ - template void solveUpperLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \ - template void solveLowerLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, \ + int*, \ + int*, \ + int*, \ + int, \ + int, \ + const DiagonalScalar*, \ + const LinearSolverScalar*, \ + LinearSolverScalar*, \ + int, \ + cudaStream_t); // TODO: be smarter about this... 
Surely this instantiates many more combinations that are actually needed -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \ +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double); INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index b7141834f14..dbc8d5d83fd 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat, const T* dInv, const T* d, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel @@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat, const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat, int rowsInLevelSet, const T* dInv, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector @@ -161,7 +165,8 @@ void computeDiluDiagonal(T* reorderedMat, int startIdx, int rowsInLevelSet, T* dInv, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -200,7 +205,8 @@ void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiagonal, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); } // namespace Opm::gpuistl::detail::DILU #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 46183efd237..92f3a6998c6 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -215,9 +215,9 @@ namespace
         // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
         // and sometimes also the diagonal.
         if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-            // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-            // if not then we just use the double diagonal that is already now stored in srcDiagonal
-            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+            // if we want to store the entire matrix as a float, then we must also move the diagonal block from
+            // double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                 moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]);
             }
 
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int startIdx,
                    int rowsInLevelSet,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }
 
@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
@@ -436,12 +442,13 @@
LUFactorization(T* srcMatrix, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuLUFactorization<<>>( + cuLUFactorization<<>>( srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); } @@ -461,51 +468,52 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuLUFactorizationSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); cuLUFactorizationSplit - <<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - reorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - dstReorderedLowerMat, - dstReorderedUpperMat, - dstDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet); + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + reorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + dstReorderedLowerMat, + dstReorderedUpperMat, + dstDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet); } #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); - -#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ - INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \ + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); \ + template void 
LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); + +#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ + INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ INSTANTIATE_KERNEL_WRAPPERS(T, 6); INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float) @@ -514,32 +522,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double) #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ /* double preconditioner */ \ template void solveLowerLevelSetSplit( \ - double*, int*, int*, int*, int, int, const double*, double*, int); \ + double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ template void solveLowerLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, double*, int); \ + float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float preconditioner */ \ template void solveLowerLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, float*, int); \ + float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \ \ /* double preconditioner */ \ - template void solveUpperLevelSetSplit( \ - double*, int*, int*, int*, int, int, const double*, double*, int); \ + template void solveUpperLevelSetSplit( \ + double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, double*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, float*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \ /* double preconditioner */ \ - template void solveUpperLevelSetSplit( \ - double*, int*, int*, int*, int, int, const float*, double*, int); \ + template void solveUpperLevelSetSplit( \ + double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, double*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \ /* float preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, float*, int); + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp index cdd78ec9105..91cf89d6376 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat, int startIdx, int rowsInLevelSet, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel @@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat, int rowsInLevelSet, const T* d, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel @@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat, int rowsInLevelSet, const LinearSolverScalar* d, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal @@ -153,7 +157,8 @@ void LUFactorization(T* reorderedMat, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * TODO: update this doucmentation @@ -192,7 +197,8 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, int startIdx, int rowsInLevelSet, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); } // namespace Opm::gpuistl::detail::ILU0 #endif From 1ae1e0531a6bed452515e4e2f69b4a84cd314171 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 20 Jan 2025 09:48:02 +0100 Subject: [PATCH 2/5] resimplify update and format --- opm/simulators/linalg/gpuistl/GpuBuffer.cpp | 20 ----- opm/simulators/linalg/gpuistl/GpuBuffer.hpp | 18 ----- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 22 ++---- .../linalg/gpuistl/GpuSparseMatrix.cpp | 13 ---- .../linalg/gpuistl/GpuSparseMatrix.hpp | 12 --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 20 ++--- .../preconditionerKernels/DILUKernels.cu | 76 ++++++++----------- .../preconditionerKernels/DILUKernels.hpp | 6 +- .../preconditionerKernels/ILU0Kernels.cu | 50 ++++++------ .../preconditionerKernels/ILU0Kernels.hpp | 6 +- 10 files changed, 75 insertions(+), 168 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp index dba271d68d7..49ce121e1fe 100644 --- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp +++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp @@ -174,19 +174,6 @@ GpuBuffer::copyFromHost(const T* dataPointer, size_t numberOfElements) OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); } -template -void -GpuBuffer::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream) -{ - if (numberOfElements > size()) { - OPM_THROW(std::runtime_error, - fmt::format("Requesting to copy too many elements. 
buffer has {} elements, while {} was requested.", - size(), - numberOfElements)); - } - OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream)); -} - template void GpuBuffer::copyToHost(T* dataPointer, size_t numberOfElements) const @@ -202,13 +189,6 @@ GpuBuffer::copyFromHost(const std::vector& data) copyFromHost(data.data(), data.size()); } -template -void -GpuBuffer::copyFromHost(const std::vector& data, cudaStream_t stream) -{ - copyFromHost(data.data(), data.size(), stream); -} - template void GpuBuffer::copyToHost(std::vector& data) const diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp index 1865d7a71d3..1ba5cb86124 100644 --- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp +++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp @@ -175,15 +175,6 @@ class GpuBuffer */ void copyFromHost(const T* dataPointer, size_t numberOfElements); - /** - * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer - * @param dataPointer raw pointer to CPU memory - * @param numberOfElements number of elements to copy - * @note This does synchronous transfer. - * @note assumes that this buffer has numberOfElements elements - */ - void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream); - /** * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer * @param dataPointer raw pointer to CPU memory @@ -202,15 +193,6 @@ class GpuBuffer */ void copyFromHost(const std::vector& data); - /** - * @brief copyToHost copies data from an std::vector - * @param data the vector to copy from - * - * @note This does synchronous transfer. - * @note This assumes that the size of this buffer is equal to the size of the input vector. 
- */ - void copyFromHost(const std::vector& data, cudaStream_t stream); - /** * @brief copyToHost copies data to an std::vector * @param data the vector to copy to diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index b76bfe1ed61..2b616214d06 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -279,7 +279,7 @@ GpuDILU::update() { OPM_TIMEBLOCK(prec_update); { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(m_moveThreadBlockSize); computeDiagonal(m_DILUFactorizationThreadBlockSize); } @@ -289,7 +289,7 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); computeDiagonal(factorizationBlockSize); } @@ -310,8 +310,7 @@ GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReorderedDiag->data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -319,8 +318,7 @@ GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReordered->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReordered->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } } @@ -349,8 +347,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) m_gpuDInvFloat->data(), m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -368,8 +365,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) nullptr, m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } else { // TODO: should this be field type twice or field type then float in the template? 
detail::DILU::computeDiluDiagonalSplit( @@ -388,8 +384,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) nullptr, nullptr, nullptr, - factorizationBlockSize, - m_stream); + factorizationBlockSize); } } else { detail::DILU::computeDiluDiagonal( @@ -401,8 +396,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) levelStartIdx, numOfRowsInLevel, m_gpuDInv.data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index 6277118e8ae..c2b49cb71d2 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -183,19 +183,6 @@ GpuSparseMatrix::updateNonzeroValues(const MatrixType& matrix, bool copyNonZe } } -// template -// template -// void -// GpuSparseMatrix::updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) -// { -// OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); -// OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); -// OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); - -// const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); -// m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); -// } - template void GpuSparseMatrix::setUpperTriangular() diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index 5aca19f5b85..b2d4a954c50 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -293,18 +293,6 @@ class GpuSparseMatrix template void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false); - - template - void updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) - { - OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); - OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); - OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); - - const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); - m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); - } - private: GpuVector m_nonZeroElements; GpuVector m_columnIndices; diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 394f83edb9b..41abb6a6030 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -267,7 +267,7 @@ template void OpmGpuILU0::update() { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(m_moveThreadBlockSize); LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize); } @@ -300,8 +300,7 @@ OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReorderedDiag.value().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -309,8 +308,7 @@ 
OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuReorderedLU->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuReorderedLU->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } } @@ -339,8 +337,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::ILU0::LUFactorizationSplit( @@ -358,8 +355,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } else{ detail::ILU0::LUFactorizationSplit( @@ -377,8 +373,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } } else { @@ -389,8 +384,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuReorderToNatural.data(), numOfRowsInLevel, levelStartIdx, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index d2ad2f1cd6e..8af6765df7e 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -383,21 +383,20 @@ computeDiluDiagonal(T* reorderedMat, const int startIdx, int rowsInLevelSet, T* dInv, - int thrBlockSize, - cudaStream_t stream) + int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonal, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonal<<>>(reorderedMat, - rowIndices, - colIndices, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv); + cuComputeDiluDiagonal<<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } @@ -420,37 +419,35 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiag, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int thrBlockSize, - cudaStream_t stream) + int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); cuComputeDiluDiagonalSplit - <<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - srcReorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv, - dstDiag, - dstLowerMat, - dstUpperMat); + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + srcReorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } 
else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void computeDiluDiagonal( \ - T*, int*, int*, int*, int*, const int, int, T*, int, cudaStream_t); \ + template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -467,8 +464,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -485,8 +481,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -503,8 +498,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -521,8 +515,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -539,8 +532,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -557,14 +549,10 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ template void solveLowerLevelSet( \ T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); -// template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ - // template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ - // template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index dbc8d5d83fd..77a9cae6997 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -165,8 +165,7 @@ void computeDiluDiagonal(T* reorderedMat, int startIdx, int rowsInLevelSet, T* dInv, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -205,8 +204,7 @@ void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiagonal, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); } // namespace Opm::gpuistl::detail::DILU #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 92f3a6998c6..e66c42f8a56 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -442,13 +442,12 @@ LUFactorization(T* srcMatrix,
                 int* reorderedToNatual,
                 size_t rowsInLevelSet,
                 int startIdx,
-               int thrBlockSize,
-               cudaStream_t stream)
+               int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuLUFactorization<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+    cuLUFactorization<<<nThreadBlocks, threadBlockSize>>>(
         srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }
@@ -468,45 +467,44 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                      int* naturalToReordered,
                      const int startIdx,
                      int rowsInLevelSet,
-                     int thrBlockSize,
-                     cudaStream_t stream)
+                     int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuLUFactorizationSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
     cuLUFactorizationSplit
-        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(srcReorderedLowerMat,
-                                                        lowerRowIndices,
-                                                        lowerColIndices,
-                                                        reorderedUpperMat,
-                                                        upperRowIndices,
-                                                        upperColIndices,
-                                                        srcDiagonal,
-                                                        dstReorderedLowerMat,
-                                                        dstReorderedUpperMat,
-                                                        dstDiagonal,
-                                                        reorderedToNatural,
-                                                        naturalToReordered,
-                                                        startIdx,
-                                                        rowsInLevelSet);
+        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+                                             lowerRowIndices,
+                                             lowerColIndices,
+                                             reorderedUpperMat,
+                                             upperRowIndices,
+                                             upperColIndices,
+                                             srcDiagonal,
+                                             dstReorderedLowerMat,
+                                             dstReorderedUpperMat,
+                                             dstDiagonal,
+                                             reorderedToNatural,
+                                             naturalToReordered,
+                                             startIdx,
+                                             rowsInLevelSet);
 }

 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                  \
     template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t);                       \
     template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t);             \
-    template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int, cudaStream_t);                     \
+    template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int);                                   \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);\
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);           \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);\
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);           \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp index 91cf89d6376..04390ff77c2 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -157,8 +157,7 @@ void LUFactorization(T* reorderedMat, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); /** * TODO: update this doucmentation @@ -197,8 +196,7 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, int startIdx, int rowsInLevelSet, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); } // namespace Opm::gpuistl::detail::ILU0 #endif From 6a5d8baa6caeccb67f24b8e8ab284732b81f19ba Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 20 Jan 2025 09:52:07 +0100 Subject: [PATCH 3/5] reduce diff --- .../detail/gpusparse_matrix_operations.cu | 53 +------------------ .../detail/gpusparse_matrix_operations.hpp | 49 ----------------- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu index 3f4a095e3f9..52b8a166eca 100644 --- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu +++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu @@ -116,24 +116,6 @@ copyMatDataToReordered(T* srcMatrix, srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); } -template -void -copyMatDataToReordered(T* srcMatrix, - int* srcRowIndices, - T* dstMatrix, - int* dstRowIndices, - int* naturalToReordered, - size_t numberOfRows, - int thrBlockSize, - cudaStream_t stream) -{ - int threadBlockSize - = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered, thrBlockSize); - int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); - cuMoveDataToReordered<<>>( - srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); -} - template void copyMatDataToReorderedSplit(T* srcMatrix, @@ -163,42 +145,9 @@ copyMatDataToReorderedSplit(T* srcMatrix, numberOfRows); } -template -void -copyMatDataToReorderedSplit(T* srcMatrix, - int* srcRowIndices, - int* srcColumnIndices, - T* dstLowerMatrix, - int* dstLowerRowIndices, - T* dstUpperMatrix, - int* dstUpperRowIndices, - T* dstDiag, - int* naturalToReordered, - size_t numberOfRows, - int thrBlockSize, - cudaStream_t stream) -{ - int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuMoveDataToReorderedSplit, thrBlockSize); - int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); - cuMoveDataToReorderedSplit<<>>(srcMatrix, - srcRowIndices, - srcColumnIndices, - dstLowerMatrix, - dstLowerRowIndices, - dstUpperMatrix, - dstUpperRowIndices, - dstDiag, - naturalToReordered, - numberOfRows); -} - - #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int); \ - template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int, cudaStream_t); \ - template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);\ 
- template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int, cudaStream_t); + template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp index 90ed59e8336..3e953a5056a 100644 --- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp @@ -41,27 +41,6 @@ void copyMatDataToReordered(T* srcMatrix, size_t numberOfRows, int threadBlockSize); -// TODO: document the stream -/** - * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list - * @param srcMatrix The source matrix we will copy data from - * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant wiht bsr format - * @param [out] dstMatrix The destination matrix that we copy data to - * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination matrix compliant wiht bsr - * format - * @param naturalToReordered Permuation list that converts indices in the src matrix to the indices in the dst matrix - * @param numberOfRows The number of rows in the matrices - */ -template -void copyMatDataToReordered(T* srcMatrix, - int* srcRowIndices, - T* dstMatrix, - int* dstRowIndices, - int* naturalToReordered, - size_t numberOfRows, - int threadBlockSize, - cudaStream_t stream); - /** * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list * @param srcMatrix The source matrix we will copy data from @@ -89,33 +68,5 @@ void copyMatDataToReorderedSplit(T* srcMatrix, size_t numberOfRows, int threadBlockSize); -/** - * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list - * @param srcMatrix The source matrix we will copy data from - * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant wiht bsr format - * @param [out] dstLowerMatrix The destination of entries that originates from the strictly lower triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing rww indices for the destination lower matrix compliant wiht - * bsr format - * @param [out] dstUpperMatrix The destination of entries that originates from the strictly upper triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination upper matrix compliant wiht - * bsr format - * @param [out] dstDiag The destination buffer for the diagonal part of the matrix - * @param naturalToReordered Permuation list that converts indices in the src matrix to the indices in the dst matrix - * @param numberOfRows The number of rows in the matrices - */ -template -void copyMatDataToReorderedSplit(T* srcMatrix, - int* srcRowIndices, - int* srcColumnIndices, - T* dstLowerMatrix, - int* dstLowerRowIndices, - T* dstUpperMatrix, - int* dstUpperRowIndices, - T* dstDiag, - int* naturalToReordered, - size_t numberOfRows, - int threadBlockSize, - cudaStream_t stream); - } // namespace Opm::gpuistl::detail #endif From bca83687ccca7da8b248ae28d5c700b6b36be283 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 5 Feb 2025 10:29:07 +0100 Subject: [PATCH 
4/5] use events to ensure correctness --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 38 +++++++++++++------- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 6 +++- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 38 +++++++++++++------- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 6 +++- 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 2b616214d06..2e3fbb7e7ec 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -55,8 +55,6 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int , m_tuneThreadBlockSizes(tuneKernels) , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { - OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream)); - // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -114,6 +112,10 @@ template void GpuDILU::apply(X& v, const Y& d) { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + OPM_TIMEBLOCK(prec_apply); { const auto ptrs = std::make_pair(v.data(), d.data()); @@ -123,16 +125,20 @@ GpuDILU::apply(X& v, const Y& d) if (it == m_apply_graphs.end()) { m_apply_graphs[ptrs] = cudaGraph_t(); m_executableGraphs[ptrs] = cudaGraphExec_t(); - OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal)); + OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs])); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); } OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); } + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } template @@ -155,7 +161,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), @@ -168,7 +174,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -181,7 +187,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::DILU::solveLowerLevelSet( @@ -195,7 +201,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } levelStartIdx += numOfRowsInLevel; } @@ -217,7 +223,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int 
m_gpuDInvFloat->data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), @@ -229,7 +235,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -241,7 +247,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::DILU::solveUpperLevelSet( @@ -254,7 +260,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } } @@ -289,9 +295,17 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); computeDiagonal(factorizationBlockSize); + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } template diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index bde85aff6b3..a4ba988cf4f 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -163,7 +164,10 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::map, cudaGraphExec_t> m_executableGraphs; // Stream for the DILU operations on the GPU - cudaStream_t m_stream; + GPUStream m_stream{}; + // Events for synchronization with main stream + GPUEvent m_before{}; + GPUEvent m_after{}; }; } // end namespace Opm::gpuistl diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 41abb6a6030..f5972f52e43 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -59,9 +59,6 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_tuneThreadBlockSizes(tuneKernels) , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { - - OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream)); - // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? 
// Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -121,6 +118,10 @@ OpmGpuILU0::apply(X& v, const Y& d) { OPM_TIMEBLOCK(prec_apply); { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + const auto ptrs = std::make_pair(v.data(), d.data()); auto it = m_apply_graphs.find(ptrs); @@ -128,15 +129,20 @@ OpmGpuILU0::apply(X& v, const Y& d) if (it == m_apply_graphs.end()) { m_apply_graphs[ptrs] = cudaGraph_t(); m_executableGraphs[ptrs] = cudaGraphExec_t(); - OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal)); + OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs])); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); } OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } } @@ -162,7 +168,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else{ detail::ILU0::solveLowerLevelSetSplit( @@ -175,7 +181,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -187,7 +193,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } levelStartIdx += numOfRowsInLevel; } @@ -208,7 +214,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiagFloat.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::ILU0::solveUpperLevelSetSplit( @@ -221,7 +227,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiag.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else{ detail::ILU0::solveUpperLevelSetSplit( @@ -234,7 +240,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiag.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -245,7 +251,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } } @@ -278,9 +284,17 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB { OPM_TIMEBLOCK(prec_update); { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + 
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); LUFactorizeMatrix(factorizationThreadBlockSize); + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } } diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index 064818c68fc..9e899765d83 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -161,7 +162,10 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate std::map, cudaGraphExec_t> m_executableGraphs; // Stream for the DILU operations on the GPU - cudaStream_t m_stream; + GPUStream m_stream{}; + // Events for synchronization with main stream + GPUEvent m_before{}; + GPUEvent m_after{}; }; } // end namespace Opm::gpuistl From 41e022ba310495814bdab44ce4e01aad0e4bbf96 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 5 Feb 2025 10:45:53 +0100 Subject: [PATCH 5/5] Use GPUGraph and GPUGraphExec --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 8 +++----- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 4 ++-- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 8 +++----- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 4 ++-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 2e3fbb7e7ec..2011fb2a2c9 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -123,17 +123,15 @@ GpuDILU::apply(X& v, const Y& d) auto it = m_apply_graphs.find(ptrs); if (it == m_apply_graphs.end()) { - m_apply_graphs[ptrs] = cudaGraph_t(); - m_executableGraphs[ptrs] = cudaGraphExec_t(); OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); - OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get())); + OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0)); } - OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0)); } // ensure that main stream only continues after this stream is completed diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index a4ba988cf4f..a023c05d436 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -160,8 +160,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate int m_DILUFactorizationThreadBlockSize = -1; // Graphs for Apply - std::map, cudaGraph_t> m_apply_graphs; - std::map, cudaGraphExec_t> m_executableGraphs; + std::map, GPUGraph> m_apply_graphs; + std::map, GPUGraphExec> m_executableGraphs; // Stream for the DILU operations on the GPU GPUStream m_stream{}; diff --git 
a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index f5972f52e43..42b535b7664 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -127,17 +127,15 @@ OpmGpuILU0::apply(X& v, const Y& d) auto it = m_apply_graphs.find(ptrs); if (it == m_apply_graphs.end()) { - m_apply_graphs[ptrs] = cudaGraph_t(); - m_executableGraphs[ptrs] = cudaGraphExec_t(); OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); - OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get())); + OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0)); } - OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0)); // ensure that main stream only continues after this stream is completed diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index 9e899765d83..e4cc3f05914 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -158,8 +158,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate int m_ILU0FactorizationThreadBlockSize = -1; // Graphs for Apply - std::map, cudaGraph_t> m_apply_graphs; - std::map, cudaGraphExec_t> m_executableGraphs; + std::map, GPUGraph> m_apply_graphs; + std::map, GPUGraphExec> m_executableGraphs; // Stream for the DILU operations on the GPU GPUStream m_stream{};
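
---

A note for readers on the pattern that patches 4 and 5 settle on. The apply path records an event
on the default stream, makes the preconditioner's own stream wait on it, captures the many small
per-level kernel launches into a CUDA graph once per (v, d) pointer pair, replays the instantiated
graph on later applies, and finally fences the default stream behind the side stream with a second
event. The standalone sketch below condenses that pattern; it is illustrative only and not part of
the series. The scale kernel, the eight-sweep loop, and the GPU_SAFE_CALL macro are stand-ins for
the level-set solve kernels and OPM_GPU_SAFE_CALL, and the raw cudaStream_t, cudaEvent_t, cudaGraph_t
and cudaGraphExec_t handles stand in for the GPUStream, GPUEvent, GPUGraph and GPUGraphExec RAII
wrappers that the patches introduce.

// Standalone sketch of the stream-capture pattern used above. Everything here
// is illustrative: scale(), the eight-sweep loop and GPU_SAFE_CALL are
// stand-ins for the level-set kernels and OPM_GPU_SAFE_CALL, and the raw
// handles stand in for the GPUStream/GPUEvent/GPUGraph/GPUGraphExec wrappers.
#include <cuda_runtime.h>
#include <cstdio>

#define GPU_SAFE_CALL(call)                                                     \
    do {                                                                        \
        cudaError_t err_ = (call);                                              \
        if (err_ != cudaSuccess) {                                              \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
        }                                                                       \
    } while (0)

// Stand-in for one small level-set kernel; a real apply launches many of these.
__global__ void scale(double* v, int n, double a)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        v[i] *= a;
    }
}

int main()
{
    const int n = 1 << 20;
    double* v = nullptr;
    GPU_SAFE_CALL(cudaMalloc(&v, n * sizeof(double)));
    GPU_SAFE_CALL(cudaMemset(v, 0, n * sizeof(double)));

    cudaStream_t stream;
    cudaEvent_t before, after;
    GPU_SAFE_CALL(cudaStreamCreate(&stream));
    GPU_SAFE_CALL(cudaEventCreate(&before));
    GPU_SAFE_CALL(cudaEventCreate(&after));

    // Fence: the capture stream must not run ahead of work already queued on
    // the default stream (stream 0), exactly like m_before in the patches.
    GPU_SAFE_CALL(cudaEventRecord(before, 0));
    GPU_SAFE_CALL(cudaStreamWaitEvent(stream, before, 0));

    // Capture once: each launch on `stream` becomes a node of the graph
    // instead of being executed immediately.
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;
    GPU_SAFE_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int level = 0; level < 8; ++level) { // stand-in for the level-set sweeps
        scale<<<(n + 255) / 256, 256, 0, stream>>>(v, n, 0.5);
    }
    GPU_SAFE_CALL(cudaStreamEndCapture(stream, &graph));
    GPU_SAFE_CALL(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

    // Replay: a single launch submits all captured kernels at once. Later
    // applies skip the capture and go straight to this call.
    GPU_SAFE_CALL(cudaGraphLaunch(graphExec, stream));

    // Fence the other way: the default stream must not continue until the
    // captured work is done, exactly like m_after in the patches.
    GPU_SAFE_CALL(cudaEventRecord(after, stream));
    GPU_SAFE_CALL(cudaStreamWaitEvent(0, after, 0));
    GPU_SAFE_CALL(cudaDeviceSynchronize());

    GPU_SAFE_CALL(cudaGraphExecDestroy(graphExec));
    GPU_SAFE_CALL(cudaGraphDestroy(graph));
    GPU_SAFE_CALL(cudaEventDestroy(after));
    GPU_SAFE_CALL(cudaEventDestroy(before));
    GPU_SAFE_CALL(cudaStreamDestroy(stream));
    GPU_SAFE_CALL(cudaFree(v));
    return 0;
}

Replaying one pre-instantiated graph removes the per-kernel launch latency that otherwise dominates
when each level set contains only a handful of rows, which is consistent with the speedup reported
in the first patch of the series; the two events are what keep the side stream correctly ordered
against callers that still submit work on the default stream.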