diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
index b4132ce09f..49ce121e1f 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -188,6 +188,7 @@ GpuBuffer::copyFromHost(const std::vector& data)
 {
     copyFromHost(data.data(), data.size());
 }
+
 template
 void
 GpuBuffer::copyToHost(std::vector& data) const
diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
index 369ea3ba28..1ba5cb8612 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 
 namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index d311386b00..2b616214d0 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 
 namespace Opm::gpuistl
 {
@@ -54,6 +55,8 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
     , m_tuneThreadBlockSizes(tuneKernels)
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }
 
-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
@@ -135,7 +154,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInvFloat->data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::DILU::solveLowerLevelSetSplit(
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveLowerLevelSetSplit(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         }
     } else {
         detail::DILU::solveLowerLevelSet(
@@ -172,7 +194,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             m_gpuDInv.data(),
             d.data(),
             v.data(),
-            lowerSolveThreadBlockSize);
+            lowerSolveThreadBlockSize,
+            m_stream);
         }
         levelStartIdx += numOfRowsInLevel;
     }
@@ -193,7 +216,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInvFloat->data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
             detail::DILU::solveUpperLevelSetSplit(
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +228,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveUpperLevelSetSplit(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +240,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         }
     } else {
         detail::DILU::solveUpperLevelSet(
@@ -227,7 +253,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             numOfRowsInLevel,
             m_gpuDInv.data(),
             v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream);
         }
     }
 }
@@ -252,7 +279,9 @@ GpuDILU::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+        reorderAndSplitMatrix(m_moveThreadBlockSize);
+        computeDiagonal(m_DILUFactorizationThreadBlockSize);
     }
 }
@@ -260,13 +289,14 @@ template
 void
 GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
-    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(moveThreadBlockSize);
+    computeDiagonal(factorizationBlockSize);
 }
 
 template
 void
-GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit(
@@ -290,7 +320,12 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
             m_gpuMatrixReordered->N(),
             moveThreadBlockSize);
     }
+}
 
+template
+void
+GpuDILU::computeDiagonal(int factorizationBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
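The change to GpuDILU::apply above uses the standard CUDA capture-and-replay idiom: every kernel launched on m_stream between cudaStreamBeginCapture and cudaStreamEndCapture is recorded into a graph instead of executed, and subsequent calls replay the instantiated graph with a single launch, amortizing the per-kernel launch overhead of the many small level-set kernels. A minimal standalone sketch of the idiom, outside OPM (the names myKernel, runCaptured, and CUDA_CHECK are illustrative, not part of this change; the five-argument cudaGraphInstantiate is the CUDA 11-style signature used by this PR):

    #include <cuda_runtime.h>
    #include <cstdio>

    #define CUDA_CHECK(call)                                              \
        do {                                                              \
            cudaError_t err = (call);                                     \
            if (err != cudaSuccess) {                                     \
                std::printf("CUDA error: %s\n", cudaGetErrorString(err)); \
            }                                                             \
        } while (0)

    __global__ void myKernel(double* x, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= 2.0;
        }
    }

    // Capture the launch sequence once, then replay the executable graph.
    void runCaptured(double* x, int n, cudaStream_t stream,
                     cudaGraphExec_t& exec, bool& captured)
    {
        if (!captured) {
            cudaGraph_t graph;
            CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
            // Launches on `stream` between Begin/EndCapture are recorded, not executed.
            myKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
            CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
            CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
            CUDA_CHECK(cudaGraphDestroy(graph)); // the executable graph keeps its own copy
            captured = true;
        }
        CUDA_CHECK(cudaGraphLaunch(exec, stream));
    }

Because a captured graph bakes in the device pointers seen during capture, the PR keys its cached graphs on the (v.data(), d.data()) pair and captures a fresh graph whenever apply is called with vectors it has not seen before.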
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
index 241eef3e98..bde85aff6b 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -25,7 +25,8 @@
 #include
 #include
 #include
-
+#include
+#include
 
 
 namespace Opm::gpuistl
@@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void computeDiagonal(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_DILUFactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map, cudaGraph_t> m_apply_graphs;
+    std::map, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the DILU operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.cpp b/opm/simulators/linalg/gpuistl/GpuVector.cpp
index ddafc1dbfd..af39e99419 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.cpp
@@ -266,6 +266,19 @@ GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements)
     OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }
 
+template
+void
+GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > dim()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template
 void
 GpuVector::copyToHost(T* dataPointer, size_t numberOfElements) const
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.hpp b/opm/simulators/linalg/gpuistl/GpuVector.hpp
index e369d1d9d1..66a1ebd65e 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
      * @note assumes that this vector has numberOfElements elements
      */
     void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
 
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
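The stream-aware copyFromHost overload added above is essentially a bounds-checked cudaMemcpyAsync. A reduced sketch of the same idea, with the OPM-specific error handling replaced by an assert (copyToDeviceAsync is a hypothetical stand-in, not the GpuVector API):

    #include <cuda_runtime.h>
    #include <cassert>
    #include <cstddef>

    // Enqueue a host-to-device copy on `stream` instead of blocking the host.
    template <class T>
    void copyToDeviceAsync(T* devicePtr, const T* hostPtr, std::size_t count,
                           std::size_t capacity, cudaStream_t stream)
    {
        assert(count <= capacity && "requested copy exceeds buffer size");
        cudaMemcpyAsync(devicePtr, hostPtr, count * sizeof(T),
                        cudaMemcpyHostToDevice, stream);
        // Work queued later on `stream` is ordered after this copy; synchronize
        // the stream (or an event) before reusing or freeing hostPtr.
    }

Two caveats worth keeping in mind: the copy is only truly asynchronous when the host buffer is page-locked (cudaMallocHost/cudaHostRegister), and inside a stream-capture region the copy is recorded into the graph rather than executed immediately.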
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
index 686b8f09f4..41abb6a603 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 
 namespace Opm::gpuistl
 {
@@ -59,6 +60,8 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +101,8 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         }
     }
 
-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -117,7 +121,22 @@ OpmGpuILU0::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
@@ -142,7 +161,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
             } else{
                 detail::ILU0::solveLowerLevelSetSplit(
@@ -154,7 +174,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
             }
         } else {
             detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +186,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         }
         levelStartIdx += numOfRowsInLevel;
     }
@@ -185,7 +207,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiagFloat.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
                 detail::ILU0::solveUpperLevelSetSplit(
@@ -197,7 +220,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             } else{
                 detail::ILU0::solveUpperLevelSetSplit(
@@ -209,7 +233,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             }
         } else {
             detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -219,7 +244,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 levelStartIdx,
                 numOfRowsInLevel,
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         }
     }
 }
@@ -241,10 +267,9 @@ template
 void
 OpmGpuILU0::update()
 {
-    OPM_TIMEBLOCK(prec_update);
-    {
-        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
-    }
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 }
 
 template
@@ -254,12 +279,14 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB
     OPM_TIMEBLOCK(prec_update);
     {
         m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+        reorderAndSplitMatrix(moveThreadBlockSize);
+        LUFactorizeMatrix(factorizationThreadBlockSize);
     }
 }
+
 template
 void
-OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
+OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit(
@@ -283,6 +310,12 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
             m_gpuReorderedLU->N(),
             moveThreadBlockSize);
     }
+}
+
+template
+void
+OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index c163e04832..064818c68f 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -84,8 +84,11 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -152,6 +155,13 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_ILU0FactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map, cudaGraph_t> m_apply_graphs;
+    std::map, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the ILU0 operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
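All kernel wrappers in the files that follow gain a trailing cudaStream_t parameter, and every launch moves off the default stream onto the stream given as the fourth launch-configuration argument; this is what makes the launches both overlappable and capturable into the per-preconditioner graph. A reduced example of the launch pattern (axpyKernel is a placeholder, not one of the OPM kernels):

    #include <cuda_runtime.h>

    __global__ void axpyKernel(int n, double a, const double* x, double* y)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] += a * x[i];
        }
    }

    // <<<grid, block, sharedMemBytes, stream>>>: passing the stream explicitly
    // keeps the launch off the default stream, which is required both for
    // overlapping independent work and for cudaGraph stream capture.
    void axpy(int n, double a, const double* x, double* y, cudaStream_t stream)
    {
        const int block = 256;
        const int grid = (n + block - 1) / block;
        axpyKernel<<<grid, block, 0, stream>>>(n, a, x, y);
    }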
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index b8f9f48ca5..8af6765df7 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -85,10 +85,12 @@ namespace
         // TODO: removce the first condition in the for loop
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            mmvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
@@ -137,10 +139,12 @@ namespace
         LinearSolverScalar rhs[blocksize] = {0};
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            umvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mmvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
@@ -211,8 +215,8 @@ namespace
         }
     }
 
-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TOOD: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results. TODO: The important part is to only cast after that is fully computed
     template
     __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                                int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
             InputScalar dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
                 for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j]
+                        = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                 }
             }
@@ -257,21 +262,26 @@ namespace
                 if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
-                    moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                    moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedLowerMat[block * blocksize * blocksize],
+                        &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                        &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
                 }
 
                 mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize],
-                                &dInv[col * blocksize * blocksize],
-                                &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
-                                dInvTmp);
+                    &dInv[col * blocksize * blocksize],
+                    &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                    dInvTmp);
             }
 
             invBlockInPlace(dInvTmp);
 
             moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
 
             if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
-                moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
+                moveBlock(
+                    dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
             }
         }
     }
@@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
                    const T* dInv,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 
@@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         const DiagonalScalar* dInv,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* dInv,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
@@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
 template
@@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuComputeDiluDiagonalSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuComputeDiluDiagonalSplit<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
-                                                                   lowerRowIndices,
-                                                                   lowerColIndices,
-                                                                   srcReorderedUpperMat,
-                                                                   upperRowIndices,
-                                                                   upperColIndices,
-                                                                   srcDiagonal,
-                                                                   reorderedToNatural,
-                                                                   naturalToReordered,
-                                                                   startIdx,
-                                                                   rowsInLevelSet,
-                                                                   dInv,
-                                                                   dstDiag,
-                                                                   dstLowerMat,
-                                                                   dstUpperMat);
+    cuComputeDiluDiagonalSplit
+        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+                                             lowerRowIndices,
+                                             lowerColIndices,
+                                             srcReorderedUpperMat,
+                                             upperRowIndices,
+                                             upperColIndices,
+                                             srcDiagonal,
+                                             reorderedToNatural,
+                                             naturalToReordered,
+                                             startIdx,
+                                             rowsInLevelSet,
+                                             dInv,
+                                             dstDiag,
+                                             dstLowerMat,
+                                             dstUpperMat);
     } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
     }
 }
-// TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
     template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet( \
+        T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
 
 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
 INSTANTIATE_KERNEL_WRAPPERS(float, 3);
 INSTANTIATE_KERNEL_WRAPPERS(float, 4);
 INSTANTIATE_KERNEL_WRAPPERS(float, 5);
 INSTANTIATE_KERNEL_WRAPPERS(float, 6);
 INSTANTIATE_KERNEL_WRAPPERS(double, 1);
 INSTANTIATE_KERNEL_WRAPPERS(double, 2);
 INSTANTIATE_KERNEL_WRAPPERS(double, 3);
@@ -460,19 +567,29 @@
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 
-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
-    template void solveUpperLevelSetSplit( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
-    template void solveLowerLevelSetSplit( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
+    template void solveUpperLevelSetSplit( \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
+    template void solveLowerLevelSetSplit( \
+        MatrixScalar*, \
+        int*, \
+        int*, \
+        int*, \
+        int, \
+        int, \
+        const DiagonalScalar*, \
+        const LinearSolverScalar*, \
+        LinearSolverScalar*, \
+        int, \
+        cudaStream_t);
 
 // TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
 
 INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
index b7141834f1..77a9cae699 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
@@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         const T* dInv,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              const DiagonalScalar* dInv,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* dInv,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
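Most of the churn in the instantiation macros above comes from one mechanical fact: the kernel wrappers are templates defined in a .cu translation unit, so every (scalar-type, block-size) combination used by callers must be instantiated explicitly, and appending cudaStream_t to a signature means touching every instantiation line. A toy version of the technique (scaleBlocks and INSTANTIATE_SCALE are hypothetical, not OPM names):

    #include <cuda_runtime.h>

    // Template defined only in this .cu file; headers just declare it.
    template <class T, int blocksize>
    void scaleBlocks(T* data, int nBlocks, cudaStream_t stream)
    {
        // ... launch a kernel on `stream` ...
    }

    // Explicit instantiation emits object code for every combination callers
    // are allowed to use; adding a parameter means updating every line here.
    #define INSTANTIATE_SCALE(T)                                \
        template void scaleBlocks<T, 1>(T*, int, cudaStream_t); \
        template void scaleBlocks<T, 2>(T*, int, cudaStream_t); \
        template void scaleBlocks<T, 3>(T*, int, cudaStream_t);

    INSTANTIATE_SCALE(float)
    INSTANTIATE_SCALE(double)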
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
index 46183efd23..e66c42f8a5 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -215,9 +215,9 @@ namespace
         // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
         // and sometimes also the diagonal.
         if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-            // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-            // if not then we just use the double diagonal that is already now stored in srcDiagonal
-            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+            // if we want to store the entire matrix as a float, then we must also move the diagonal block from
+            // double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                 moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]);
             }
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int startIdx,
                    int rowsInLevelSet,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }
 
@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
 template
@@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 }
 
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \
-    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
+    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
     template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
-
-#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
+    template void LUFactorizationSplit( \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
+
+#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 6);
 
 INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
 INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
 
 #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
     /* double preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, float*, int); \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
     \
     /* double preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, float*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
     /* double preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const float*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, float*, int);
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
 
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index cdd78ec910..04390ff77c 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int startIdx,
                         int rowsInLevelSet,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                              int rowsInLevelSet,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal
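For context on what these solve routines implement: the rows of the triangular factors are grouped into level sets such that rows within one set have no dependencies on each other, so each set becomes a single kernel launch and the sets are processed in dependency order, exactly as in the levelStartIdx loops in GpuDILU.cpp and OpmGpuILU0.cpp above. A schematic host-side loop (solveOneLevel is a hypothetical stand-in for the per-level solve wrappers):

    #include <vector>
    #include <cuda_runtime.h>

    void solveOneLevel(int startIdx, int rowsInLevel, cudaStream_t stream)
    {
        // ... launch the triangular-solve kernel for this level on `stream` ...
    }

    // Rows inside one level set are independent and handled by one launch;
    // issuing every launch on the same stream preserves the ordering between
    // level sets without any host-side synchronization, which is also what
    // allows the whole sequence to be captured into a single CUDA graph.
    void lowerSolve(const std::vector<int>& levelSetSizes, cudaStream_t stream)
    {
        int levelStartIdx = 0;
        for (const int rowsInLevel : levelSetSizes) {
            solveOneLevel(levelStartIdx, rowsInLevel, stream);
            levelStartIdx += rowsInLevel;
        }
    }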