Improve gpuistl using cudaGraphs #5852

Open · wants to merge 3 commits into base: master
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}

template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>


namespace Opm::gpuistl
63 changes: 49 additions & 14 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>

namespace Opm::gpuistl
{

@@ -54,6 +55,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));

// TODO: Should in some way verify that this matrix is symmetric; maybe only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@
}
}

computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);

if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());

auto it = m_apply_graphs.find(ptrs);

if (it == m_apply_graphs.end()) {
m_apply_graphs[ptrs] = cudaGraph_t();
m_executableGraphs[ptrs] = cudaGraphExec_t();
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));

// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);

OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
}
}
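For readers unfamiliar with CUDA graphs, the hunk above follows the standard stream-capture pattern: the first call for a given vector pair records every kernel launched by the inner apply() into a graph, instantiates it once, and later calls replay the whole sequence with a single graph launch. A minimal, self-contained sketch of that pattern (placeholder scaleKernel and error checking elided; not the PR's actual kernels):

#include <cuda_runtime.h>

__global__ void scaleKernel(double* x, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0;
    }
}

void applyWithGraph(cudaStream_t stream, double* x, int n)
{
    static cudaGraph_t graph;
    static cudaGraphExec_t exec;
    static bool captured = false;

    if (!captured) {
        // Kernels launched on `stream` between BeginCapture and EndCapture
        // are recorded into the graph instead of being executed.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        captured = true;
    }
    // Replaying the instantiated graph costs a single launch, however many
    // kernels were recorded, which is the overhead saving this PR targets.
    cudaGraphLaunch(exec, stream);
}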

@@ -135,7 +154,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@@ -172,7 +194,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
}
levelStartIdx += numOfRowsInLevel;
}
@@ -193,7 +216,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +228,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +240,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@@ -227,7 +253,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
}
}
}
@@ -252,21 +279,24 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -290,7 +320,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
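A subtlety worth spelling out, as I read the code above (the PR itself does not state this): a captured graph bakes the device pointers of its kernel arguments into the executable, so update() can refresh the matrix values in place without invalidating the cached graphs, while any path that reallocated the underlying buffers would require a fresh capture. A commented sketch of that invariant, with hypothetical names:

// Hypothetical illustration of the pointer-stability invariant; not PR code.
void refreshAndReplay(cudaGraphExec_t exec, cudaStream_t stream,
                      double* deviceValues, const double* hostValues,
                      size_t count)
{
    // OK: writes new values into the same device allocation, so the
    // addresses recorded in the graph still point at current data.
    cudaMemcpyAsync(deviceValues, hostValues, count * sizeof(double),
                    cudaMemcpyHostToDevice, stream);

    // Replays the captured kernel sequence against the refreshed values.
    cudaGraphLaunch(exec, stream);

    // NOT OK (would require re-capture): cudaFree(deviceValues) followed
    // by a new cudaMalloc, since the old address is baked into `exec`.
}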
15 changes: 13 additions & 2 deletions opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -25,7 +25,8 @@
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <vector>

#include <map>
#include <utility>


namespace Opm::gpuistl
@@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
//! \brief Updates the matrix data.
void update() final;

//! \brief Perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);

//! \brief Compute the diagonal of the DILU factorization
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);

//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;

// Cached CUDA graphs for apply(), keyed on the (v.data(), d.data()) pointer pair
std::map<std::pair<field_type*, const field_type*>, cudaGraph_t> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, cudaGraphExec_t> m_executableGraphs;

// Stream for the DILU operations on the GPU
cudaStream_t m_stream;
};
} // end namespace Opm::gpuistl
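One design note on the members above: since a graph replays against the exact device addresses it was captured with, the cache holds one executable graph per distinct (v.data(), d.data()) pair, and a miss triggers a fresh capture in apply(). A reduced sketch of that lookup, mirroring the key shape used here (standalone illustration, not PR code):

#include <cuda_runtime.h>
#include <map>
#include <utility>

using GraphKey = std::pair<double*, const double*>; // (v.data(), d.data())

struct ApplyGraphCache {
    std::map<GraphKey, cudaGraphExec_t> executables;

    // True when apply() must capture a new graph for this vector pair.
    bool needsCapture(double* v, const double* d) const
    {
        return executables.find({v, d}) == executables.end();
    }
};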

13 changes: 13 additions & 0 deletions opm/simulators/linalg/gpuistl/GpuVector.cpp
@@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}

template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
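A usage sketch for the stream-aware overload added above (the element count and the GpuVector size constructor are assumptions for illustration): the copy is only enqueued on the stream, so the caller must synchronize before relying on the data, and being asynchronous is also what allows the copy to be recorded during stream capture, unlike the blocking cudaMemcpy variant.

std::vector<double> hostData(1000, 1.0);
Opm::gpuistl::GpuVector<double> deviceVec(1000);

cudaStream_t stream;
cudaStreamCreate(&stream);

// Enqueue the host-to-device copy; it may still be in flight on return.
deviceVec.copyFromHost(hostData.data(), hostData.size(), stream);

// ... enqueue kernels on `stream` that consume deviceVec ...

cudaStreamSynchronize(stream); // wait until the copy (and kernels) finish
cudaStreamDestroy(stream);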
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
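/**
* @brief Asynchronous variant of copyFromHost that enqueues the copy on the given stream
* @note the copy may still be in flight when this call returns; synchronize the stream before using the data
*/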
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);

/**
* @brief copyToHost copies numberOfElements to the CPU memory dataPointer