From d394a1f4c76b2092a7fa6dd5a0179b7e45445d4e Mon Sep 17 00:00:00 2001
From: Tobias Meyer Andersen
Date: Thu, 5 Dec 2024 10:47:05 +0100
Subject: [PATCH 1/5] use cudagraphs in gpu DILU and ILU0

apply is also supported in HIP, though it does not seem to affect
performance in any clear way. 1.1x to 1.2x speedup on Nvidia GPUs.
---
 opm/simulators/linalg/gpuistl/GpuBuffer.cpp   |  21 ++
 opm/simulators/linalg/gpuistl/GpuBuffer.hpp   |  19 ++
 opm/simulators/linalg/gpuistl/GpuDILU.cpp     |  81 +++--
 opm/simulators/linalg/gpuistl/GpuDILU.hpp     |  15 +-
 .../linalg/gpuistl/GpuSparseMatrix.cpp        |  13 +
 .../linalg/gpuistl/GpuSparseMatrix.hpp        |  12 +
 opm/simulators/linalg/gpuistl/GpuVector.cpp   |  13 +
 opm/simulators/linalg/gpuistl/GpuVector.hpp   |   1 +
 opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp  |  81 +++--
 opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp  |  12 +-
 .../detail/gpusparse_matrix_operations.cu     |  53 +++-
 .../detail/gpusparse_matrix_operations.hpp    |  49 +++
 .../preconditionerKernels/DILUKernels.cu      | 279 +++++++++++++-----
 .../preconditionerKernels/DILUKernels.hpp     |  18 +-
 .../preconditionerKernels/ILU0Kernels.cu      | 142 ++++-----
 .../preconditionerKernels/ILU0Kernels.hpp     |  18 +-
 16 files changed, 628 insertions(+), 199 deletions(-)

diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
index b4132ce09fb..dba271d68d7 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -174,6 +174,19 @@ GpuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
     OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }
 
+template <class T>
+void
+GpuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > size()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Buffer has {} elements, while {} was requested.",
+                              size(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template <class T>
 void
 GpuBuffer<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
@@ -188,6 +201,14 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
 {
     copyFromHost(data.data(), data.size());
 }
+
+template <class T>
+void
+GpuBuffer<T>::copyFromHost(const std::vector<T>& data, cudaStream_t stream)
+{
+    copyFromHost(data.data(), data.size(), stream);
+}
+
 template <class T>
 void
 GpuBuffer<T>::copyToHost(std::vector<T>& data) const
diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
index 369ea3ba28f..1865d7a71d3 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <cuda_runtime.h>
 
 namespace Opm::gpuistl
 {
@@ -174,6 +175,15 @@ class GpuBuffer
      */
    void copyFromHost(const T* dataPointer, size_t numberOfElements);
 
+    /**
+     * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer
+     * @param dataPointer raw pointer to CPU memory
+     * @param numberOfElements number of elements to copy
+     * @note This does asynchronous transfer on the given stream.
+     * @note assumes that this buffer has numberOfElements elements
+     */
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
+
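[editorial note on the asynchronous overload above: cudaMemcpyAsync only overlaps with other work when the host
allocation is pinned; from pageable memory the transfer is staged and may synchronize with the stream. A hypothetical
usage sketch, where the element count n, the stream, and the GpuBuffer size constructor are assumptions made for
illustration:

    // Sketch only: pin the host memory so the enqueued copy is truly asynchronous.
    std::vector<double> hostData(n, 0.0);
    OPM_GPU_SAFE_CALL(cudaHostRegister(hostData.data(), n * sizeof(double), cudaHostRegisterDefault));
    Opm::gpuistl::GpuBuffer<double> deviceBuffer(n);
    deviceBuffer.copyFromHost(hostData.data(), n, stream); // returns immediately; copy runs on 'stream'
    OPM_GPU_SAFE_CALL(cudaStreamSynchronize(stream));      // wait before modifying or freeing hostData
    OPM_GPU_SAFE_CALL(cudaHostUnregister(hostData.data()));
]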
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
      * @param dataPointer raw pointer to CPU memory
@@ -192,6 +202,15 @@ class GpuBuffer
      */
     void copyFromHost(const std::vector<T>& data);
 
+    /**
+     * @brief copyFromHost copies data from an std::vector
+     * @param data the vector to copy from
+     *
+     * @note This does asynchronous transfer on the given stream.
+     * @note This assumes that the size of this buffer is equal to the size of the input vector.
+     */
+    void copyFromHost(const std::vector<T>& data, cudaStream_t stream);
+
     /**
      * @brief copyToHost copies data to an std::vector
      * @param data the vector to copy to
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index d311386b00d..b76bfe1ed61 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 namespace Opm::gpuistl
 {
 
@@ -54,6 +55,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
     , m_tuneThreadBlockSizes(tuneKernels)
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }
 
-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
 
@@ -135,7 +154,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInvFloat->data(),
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
                 detail::DILU::solveLowerLevelSetSplit(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInv.data(),
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             } else {
                 detail::DILU::solveLowerLevelSetSplit(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     m_gpuDInv.data(),
                     d.data(),
                     v.data(),
-
lowerSolveThreadBlockSize); + lowerSolveThreadBlockSize, + m_stream); } } else { detail::DILU::solveLowerLevelSet( @@ -172,7 +194,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), d.data(), v.data(), - lowerSolveThreadBlockSize); + lowerSolveThreadBlockSize, + m_stream); } levelStartIdx += numOfRowsInLevel; } @@ -193,7 +216,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInvFloat->data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), @@ -204,7 +228,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -215,7 +240,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } else { detail::DILU::solveUpperLevelSet( @@ -227,7 +253,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int numOfRowsInLevel, m_gpuDInv.data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } } @@ -252,7 +279,9 @@ GpuDILU::update() { OPM_TIMEBLOCK(prec_update); { - update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(m_moveThreadBlockSize); + computeDiagonal(m_DILUFactorizationThreadBlockSize); } } @@ -260,13 +289,14 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { - m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu - computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize); + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(moveThreadBlockSize); + computeDiagonal(factorizationBlockSize); } template void -GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize) +GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) { if (m_splitMatrix) { detail::copyMatDataToReorderedSplit( @@ -280,7 +310,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReorderedDiag->data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -288,9 +319,15 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReordered->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReordered->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } +} +template +void +GpuDILU::computeDiagonal(int factorizationBlockSize) +{ int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); @@ -312,7 +349,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuDInvFloat->data(), 
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -330,7 +368,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in nullptr, m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } else { // TODO: should this be field type twice or field type then float in the template? detail::DILU::computeDiluDiagonalSplit( @@ -349,7 +388,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in nullptr, nullptr, nullptr, - factorizationBlockSize); + factorizationBlockSize, + m_stream); } } else { detail::DILU::computeDiluDiagonal( @@ -361,7 +401,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in levelStartIdx, numOfRowsInLevel, m_gpuDInv.data(), - factorizationBlockSize); + factorizationBlockSize, + m_stream); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 241eef3e98d..bde85aff6b3 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -25,7 +25,8 @@ #include #include #include - +#include +#include namespace Opm::gpuistl @@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \brief Updates the matrix data. void update() final; + //! \brief perform matrix splitting and reordering + void reorderAndSplitMatrix(int moveThreadBlockSize); + //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix - void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize); + void computeDiagonal(int factorizationThreadBlockSize); //! 
\brief function that will experimentally tune the thread block sizes of the important cuda kernels void tuneThreadBlockSizes(); @@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate int m_lowerSolveThreadBlockSize = -1; int m_moveThreadBlockSize = -1; int m_DILUFactorizationThreadBlockSize = -1; + + // Graphs for Apply + std::map, cudaGraph_t> m_apply_graphs; + std::map, cudaGraphExec_t> m_executableGraphs; + + // Stream for the DILU operations on the GPU + cudaStream_t m_stream; }; } // end namespace Opm::gpuistl diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index c2b49cb71d2..6277118e8ae 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -183,6 +183,19 @@ GpuSparseMatrix::updateNonzeroValues(const MatrixType& matrix, bool copyNonZe } } +// template +// template +// void +// GpuSparseMatrix::updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) +// { +// OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); +// OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); +// OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); + +// const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); +// m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); +// } + template void GpuSparseMatrix::setUpperTriangular() diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index b2d4a954c50..5aca19f5b85 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -293,6 +293,18 @@ class GpuSparseMatrix template void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false); + + template + void updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) + { + OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); + OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); + OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); + + const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); + m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); + } + private: GpuVector m_nonZeroElements; GpuVector m_columnIndices; diff --git a/opm/simulators/linalg/gpuistl/GpuVector.cpp b/opm/simulators/linalg/gpuistl/GpuVector.cpp index ddafc1dbfd8..af39e99419e 100644 --- a/opm/simulators/linalg/gpuistl/GpuVector.cpp +++ b/opm/simulators/linalg/gpuistl/GpuVector.cpp @@ -266,6 +266,19 @@ GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements) OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); } +template +void +GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream) +{ + if (numberOfElements > dim()) { + OPM_THROW(std::runtime_error, + fmt::format("Requesting to copy too many elements. 
Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template <class T>
 void
 GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.hpp b/opm/simulators/linalg/gpuistl/GpuVector.hpp
index e369d1d9d1d..66a1ebd65e1 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
      * @note assumes that this vector has numberOfElements elements
      */
     void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream); // asynchronous version of the above
 
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
index 686b8f09f4a..394f83edb9b 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 namespace Opm::gpuistl
 {
 
@@ -59,6 +60,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +101,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         }
     }
 
-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -117,7 +121,22 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
 
@@ -142,7 +161,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                     numOfRowsInLevel,
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             }
             else{
                 detail::ILU0::solveLowerLevelSetSplit(
@@ -154,7 +174,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                     numOfRowsInLevel,
                     d.data(),
                     v.data(),
-                    lowerSolveThreadBlockSize);
+                    lowerSolveThreadBlockSize,
+                    m_stream);
             }
         } else {
            detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +186,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                                             numOfRowsInLevel,
                                             d.data(),
                                             v.data(),
-                                             lowerSolveThreadBlockSize);
+                                             lowerSolveThreadBlockSize,
+                                             m_stream);
        }
        levelStartIdx +=
numOfRowsInLevel; } @@ -185,7 +207,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiagFloat.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::ILU0::solveUpperLevelSetSplit( @@ -197,7 +220,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiag.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } else{ detail::ILU0::solveUpperLevelSetSplit( @@ -209,7 +233,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, m_gpuMatrixReorderedDiag.value().data(), v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -219,7 +244,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i levelStartIdx, numOfRowsInLevel, v.data(), - upperSolveThreadBlockSize); + upperSolveThreadBlockSize, + m_stream); } } } @@ -241,10 +267,9 @@ template void OpmGpuILU0::update() { - OPM_TIMEBLOCK(prec_update); - { - update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); - } + m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + reorderAndSplitMatrix(m_moveThreadBlockSize); + LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize); } template @@ -254,12 +279,14 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB OPM_TIMEBLOCK(prec_update); { m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu - LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize); + reorderAndSplitMatrix(moveThreadBlockSize); + LUFactorizeMatrix(factorizationThreadBlockSize); } } + template void -OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize) +OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) { if (m_splitMatrix) { detail::copyMatDataToReorderedSplit( @@ -273,7 +300,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuMatrixReorderedDiag.value().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -281,8 +309,15 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuReorderedLU->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuReorderedLU->N(), - moveThreadBlockSize); + moveThreadBlockSize, + m_stream); } +} + +template +void +OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) +{ int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); @@ -304,7 +339,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize); + factorizationThreadBlockSize, + m_stream); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::ILU0::LUFactorizationSplit( @@ -322,7 +358,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - 
factorizationThreadBlockSize);
+                    factorizationThreadBlockSize,
+                    m_stream);
            }
            else{
                detail::ILU0::LUFactorizationSplit(
@@ -340,7 +377,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                    m_gpuNaturalToReorder.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
-                    factorizationThreadBlockSize);
+                    factorizationThreadBlockSize,
+                    m_stream);
            }
        } else {
            detail::ILU0::LUFactorization(
                m_gpuReorderedLU->getNonZeroValues().data(),
@@ -351,7 +389,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                m_gpuReorderToNatural.data(),
                numOfRowsInLevel,
                levelStartIdx,
-                factorizationThreadBlockSize);
+                factorizationThreadBlockSize,
+                m_stream);
        }
        levelStartIdx += numOfRowsInLevel;
    }
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index c163e048326..064818c68fc 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -84,8 +84,11 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief Perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -152,6 +155,13 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_ILU0FactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, cudaGraph_t> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the ILU0 operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
index 52b8a166eca..3f4a095e3f9 100644
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
@@ -116,6 +116,24 @@ copyMatDataToReordered(T* srcMatrix,
         srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
 }
 
+template <class T, int blocksize>
+void
+copyMatDataToReordered(T* srcMatrix,
+                       int* srcRowIndices,
+                       T* dstMatrix,
+                       int* dstRowIndices,
+                       int* naturalToReordered,
+                       size_t numberOfRows,
+                       int thrBlockSize,
+                       cudaStream_t stream)
+{
+    int threadBlockSize
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
+}
+
 template <class T, int blocksize>
 void
 copyMatDataToReorderedSplit(T* srcMatrix,
@@ -145,9 +163,42 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                             numberOfRows);
 }
 
+template <class T, int blocksize>
+void
+copyMatDataToReorderedSplit(T* srcMatrix,
+                            int* srcRowIndices,
+                            int* srcColumnIndices,
+                            T* dstLowerMatrix,
+                            int* dstLowerRowIndices,
+                            T* dstUpperMatrix,
+                            int* dstUpperRowIndices,
+                            T* dstDiag,
+                            int* naturalToReordered,
+                            size_t numberOfRows,
+                            int thrBlockSize,
+                            cudaStream_t stream)
+{
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
+        cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(srcMatrix,
+                                                                                            srcRowIndices,
+                                                                                            srcColumnIndices,
+                                                                                            dstLowerMatrix,
+                                                                                            dstLowerRowIndices,
+                                                                                            dstUpperMatrix,
+                                                                                            dstUpperRowIndices,
+                                                                                            dstDiag,
+                                                                                            naturalToReordered,
+                                                                                            numberOfRows);
+}
+
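[editorial note: the stream-taking wrappers above differ from the synchronous ones only in the fourth launch
parameter. Routing every launch through the caller's stream is also what lets GpuDILU::apply and OpmGpuILU0::apply
record them with cudaStreamBeginCapture, since CUDA rejects work submitted to the legacy default stream while a
global capture is active. The shape of such a wrapper, as a standalone sketch in which 'exampleKernel' is a
placeholder:

    // Sketch; 'exampleKernel' stands in for cuMoveDataToReordered and friends.
    __global__ void exampleKernel(double* data, size_t n);

    void launchOnStream(double* data, size_t n, int blockSize, cudaStream_t stream)
    {
        const int numBlocks = static_cast<int>((n + blockSize - 1) / blockSize); // ceil(n / blockSize)
        exampleKernel<<<numBlocks, blockSize, 0, stream>>>(data, n); // fourth argument selects the stream
    }
]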
 
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
     template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int);                        \
-    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
+    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int, cudaStream_t);          \
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);\
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int, cudaStream_t);
 
 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
index 3e953a5056a..90ed59e8336 100644
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
@@ -41,6 +41,27 @@ void copyMatDataToReordered(T* srcMatrix,
                             size_t numberOfRows,
                             int threadBlockSize);
 
+// The stream overload below launches its kernel on the caller-supplied CUDA stream
+/**
+ * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
+ * @param srcMatrix The source matrix we will copy data from
+ * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
+ * @param [out] dstMatrix The destination matrix that we copy data to
+ * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination matrix compliant with bsr
+ * format
+ * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
+ * @param numberOfRows The number of rows in the matrices
+ */
+template <class T, int blocksize>
+void copyMatDataToReordered(T* srcMatrix,
+                            int* srcRowIndices,
+                            T* dstMatrix,
+                            int* dstRowIndices,
+                            int* naturalToReordered,
+                            size_t numberOfRows,
+                            int threadBlockSize,
+                            cudaStream_t stream);
+
 /**
  * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
  * @param srcMatrix The source matrix we will copy data from
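[editorial note: the reordering these functions document is a per-row gather; each thread copies the nonzeros of one
source row into the position the permutation assigns to it. A simplified scalar (blocksize 1) sketch of the idea,
not the actual cuMoveDataToReordered kernel:

    // Simplified sketch, blocksize 1; the real kernel moves blocksize*blocksize scalars per block.
    __global__ void moveDataToReorderedSketch(const double* src,
                                              const int* srcRowIndices,
                                              double* dst,
                                              const int* dstRowIndices,
                                              const int* naturalToReordered,
                                              size_t numberOfRows)
    {
        const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x;
        if (srcRow < numberOfRows) {
            const auto dstRow = naturalToReordered[srcRow];
            // both rows hold the same number of nonzeros, so a parallel walk over the two index ranges suffices
            for (int i = srcRowIndices[srcRow], j = dstRowIndices[dstRow]; i < srcRowIndices[srcRow + 1]; ++i, ++j) {
                dst[j] = src[i];
            }
        }
    }
]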
@@ -68,5 +89,33 @@ void copyMatDataToReorderedSplit(T* srcMatrix,
                                  size_t numberOfRows,
                                  int threadBlockSize);
 
+/**
+ * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
+ * @param srcMatrix The source matrix we will copy data from
+ * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
+ * @param [out] dstLowerMatrix The destination of entries that originates from the strictly lower triangular matrix
+ * @param dstLowerRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant
+ * with bsr format
+ * @param [out] dstUpperMatrix The destination of entries that originates from the strictly upper triangular matrix
+ * @param dstUpperRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant
+ * with bsr format
+ * @param [out] dstDiag The destination buffer for the diagonal part of the matrix
+ * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
+ * @param numberOfRows The number of rows in the matrices
+ */
+template <class T, int blocksize>
+void copyMatDataToReorderedSplit(T* srcMatrix,
+                                 int* srcRowIndices,
+                                 int* srcColumnIndices,
+                                 T* dstLowerMatrix,
+                                 int* dstLowerRowIndices,
+                                 T* dstUpperMatrix,
+                                 int* dstUpperRowIndices,
+                                 T* dstDiag,
+                                 int* naturalToReordered,
+                                 size_t numberOfRows,
+                                 int threadBlockSize,
+                                 cudaStream_t stream);
+
 } // namespace Opm::gpuistl::detail
 #endif
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index b8f9f48ca5b..d2ad2f1cd6e 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -85,10 +85,12 @@ namespace
         // TODO: removce the first condition in the for loop
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            mmvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
 
@@ -137,10 +139,12 @@ namespace
         LinearSolverScalar rhs[blocksize] = {0};
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            umvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mmvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
 
@@ -211,8 +215,8 @@ namespace
         }
     }
 
-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TOOD: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results. TODO: The important part is to only cast after that is fully computed
     template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
    __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                               int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
             InputScalar dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
                 for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j]
+                        = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                 }
             }
 
@@ -257,21 +262,26 @@ namespace
 
                 if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
-                    moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                    moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedLowerMat[block * blocksize * blocksize],
+                        &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                        &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
                 }
 
mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize], - &dInv[col * blocksize * blocksize], - &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], - dInvTmp); + &dInv[col * blocksize * blocksize], + &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], + dInvTmp); } invBlockInPlace(dInvTmp); moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) { - moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! + moveBlock( + dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! } } } @@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat, const T* dInv, const T* d, T* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSet<<>>( + cuSolveLowerLevelSet<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } @@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); + cuSolveLowerLevelSetSplit + <<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set template @@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat, int rowsInLevelSet, const T* dInv, T* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSet<<>>( + cuSolveUpperLevelSet<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); + cuSolveUpperLevelSetSplit + <<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } template @@ -367,20 +383,21 @@ computeDiluDiagonal(T* reorderedMat, const int startIdx, int rowsInLevelSet, T* dInv, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonal, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, 
threadBlockSize); - cuComputeDiluDiagonal<<>>(reorderedMat, - rowIndices, - colIndices, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv); + cuComputeDiluDiagonal<<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } @@ -403,49 +420,151 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiag, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - srcReorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv, - dstDiag, - dstLowerMat, - dstUpperMat); + cuComputeDiluDiagonalSplit + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + srcReorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } -// TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ - const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + template void computeDiluDiagonal( \ + T*, int*, int*, int*, int*, const int, int, T*, int, cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void 
computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + float*, \ + float*, \ + float*, \ + int, \ + cudaStream_t); \ + template void computeDiluDiagonalSplit( \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const T*, \ + int*, \ + int*, \ + const int, \ + int, \ + T*, \ + double*, \ + double*, \ + double*, \ + int, \ + cudaStream_t); \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + template void solveLowerLevelSet( \ + T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); +// template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ + // template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + // template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); @@ -460,19 +579,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ - template void solveUpperLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \ - template void solveLowerLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, \ + int*, \ + int*, \ + int*, \ + int, \ + int, \ + const DiagonalScalar*, \ + const LinearSolverScalar*, \ + LinearSolverScalar*, \ + int, \ + cudaStream_t); // TODO: be smarter about this... 
Surely this instantiates many more combinations that are actually needed -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \ - INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \ +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double); INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index b7141834f14..dbc8d5d83fd 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat, const T* dInv, const T* d, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel @@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat, const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat, int rowsInLevelSet, const T* dInv, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector @@ -161,7 +165,8 @@ void computeDiluDiagonal(T* reorderedMat, int startIdx, int rowsInLevelSet, T* dInv, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -200,7 +205,8 @@ void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiagonal, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); } // namespace Opm::gpuistl::detail::DILU #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 46183efd237..92f3a6998c6 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -215,9 +215,9 @@ namespace
         // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
         // and sometimes also the diagonal.
         if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-            // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-            // if not then we just use the double diagonal that is already now stored in srcDiagonal
-            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+            // if we want to store the entire matrix as a float, then we must also move the diagonal block from
+            // double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                 moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]);
             }
 
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int startIdx,
                    int rowsInLevelSet,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }
 
@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
@@ -436,12 +442,13 @@
LUFactorization(T* srcMatrix, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuLUFactorization<<>>( + cuLUFactorization<<>>( srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); } @@ -461,51 +468,52 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - int thrBlockSize) + int thrBlockSize, + cudaStream_t stream) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuLUFactorizationSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); cuLUFactorizationSplit - <<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - reorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - dstReorderedLowerMat, - dstReorderedUpperMat, - dstDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet); + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + reorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + dstReorderedLowerMat, + dstReorderedUpperMat, + dstDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet); } #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); - -#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ - INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ - INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \ + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ + template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); \ + template void 
LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t); + +#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ + INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ INSTANTIATE_KERNEL_WRAPPERS(T, 6); INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float) @@ -514,32 +522,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double) #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ /* double preconditioner */ \ template void solveLowerLevelSetSplit( \ - double*, int*, int*, int*, int, int, const double*, double*, int); \ + double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ template void solveLowerLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, double*, int); \ + float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float preconditioner */ \ template void solveLowerLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, float*, int); \ + float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \ \ /* double preconditioner */ \ - template void solveUpperLevelSetSplit( \ - double*, int*, int*, int*, int, int, const double*, double*, int); \ + template void solveUpperLevelSetSplit( \ + double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, double*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \ /* float preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const double*, float*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \ /* double preconditioner */ \ - template void solveUpperLevelSetSplit( \ - double*, int*, int*, int*, int, int, const float*, double*, int); \ + template void solveUpperLevelSetSplit( \ + double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \ /* float matrix, double compute preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, double*, int); \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \ /* float preconditioner */ \ - template void solveUpperLevelSetSplit( \ - float*, int*, int*, int*, int, int, const float*, float*, int); + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp index cdd78ec9105..91cf89d6376 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat, int startIdx, int rowsInLevelSet, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel @@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat, int rowsInLevelSet, const T* d, T* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel @@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int rowsInLevelSet, const DiagonalScalar* dInv, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel @@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat, int rowsInLevelSet, const LinearSolverScalar* d, LinearSolverScalar* v, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal @@ -153,7 +157,8 @@ void LUFactorization(T* reorderedMat, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); /** * TODO: update this doucmentation @@ -192,7 +197,8 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, int startIdx, int rowsInLevelSet, - int threadBlockSize); + int threadBlockSize, + cudaStream_t stream); } // namespace Opm::gpuistl::detail::ILU0 #endif From 1ae1e0531a6bed452515e4e2f69b4a84cd314171 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 20 Jan 2025 09:48:02 +0100 Subject: [PATCH 2/5] resimplify update and format --- opm/simulators/linalg/gpuistl/GpuBuffer.cpp | 20 ----- opm/simulators/linalg/gpuistl/GpuBuffer.hpp | 18 ----- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 22 ++---- .../linalg/gpuistl/GpuSparseMatrix.cpp | 13 ---- .../linalg/gpuistl/GpuSparseMatrix.hpp | 12 --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 20 ++--- .../preconditionerKernels/DILUKernels.cu | 76 ++++++++----------- .../preconditionerKernels/DILUKernels.hpp | 6 +- .../preconditionerKernels/ILU0Kernels.cu | 50 ++++++------ .../preconditionerKernels/ILU0Kernels.hpp | 6 +- 10 files changed, 75 insertions(+), 168 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp index dba271d68d7..49ce121e1fe 100644 --- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp +++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp @@ -174,19 +174,6 @@ GpuBuffer::copyFromHost(const T* dataPointer, size_t numberOfElements) OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); } -template -void -GpuBuffer::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream) -{ - if (numberOfElements > size()) { - OPM_THROW(std::runtime_error, - fmt::format("Requesting to copy too many elements. 
buffer has {} elements, while {} was requested.", - size(), - numberOfElements)); - } - OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream)); -} - template void GpuBuffer::copyToHost(T* dataPointer, size_t numberOfElements) const @@ -202,13 +189,6 @@ GpuBuffer::copyFromHost(const std::vector& data) copyFromHost(data.data(), data.size()); } -template -void -GpuBuffer::copyFromHost(const std::vector& data, cudaStream_t stream) -{ - copyFromHost(data.data(), data.size(), stream); -} - template void GpuBuffer::copyToHost(std::vector& data) const diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp index 1865d7a71d3..1ba5cb86124 100644 --- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp +++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp @@ -175,15 +175,6 @@ class GpuBuffer */ void copyFromHost(const T* dataPointer, size_t numberOfElements); - /** - * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer - * @param dataPointer raw pointer to CPU memory - * @param numberOfElements number of elements to copy - * @note This does synchronous transfer. - * @note assumes that this buffer has numberOfElements elements - */ - void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream); - /** * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer * @param dataPointer raw pointer to CPU memory @@ -202,15 +193,6 @@ class GpuBuffer */ void copyFromHost(const std::vector& data); - /** - * @brief copyToHost copies data from an std::vector - * @param data the vector to copy from - * - * @note This does synchronous transfer. - * @note This assumes that the size of this buffer is equal to the size of the input vector. 
- */ - void copyFromHost(const std::vector& data, cudaStream_t stream); - /** * @brief copyToHost copies data to an std::vector * @param data the vector to copy to diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index b76bfe1ed61..2b616214d06 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -279,7 +279,7 @@ GpuDILU::update() { OPM_TIMEBLOCK(prec_update); { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(m_moveThreadBlockSize); computeDiagonal(m_DILUFactorizationThreadBlockSize); } @@ -289,7 +289,7 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); computeDiagonal(factorizationBlockSize); } @@ -310,8 +310,7 @@ GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReorderedDiag->data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -319,8 +318,7 @@ GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReordered->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReordered->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } } @@ -349,8 +347,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) m_gpuDInvFloat->data(), m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -368,8 +365,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) nullptr, m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } else { // TODO: should this be field type twice or field type then float in the template? 
detail::DILU::computeDiluDiagonalSplit( @@ -388,8 +384,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) nullptr, nullptr, nullptr, - factorizationBlockSize, - m_stream); + factorizationBlockSize); } } else { detail::DILU::computeDiluDiagonal( @@ -401,8 +396,7 @@ GpuDILU::computeDiagonal(int factorizationBlockSize) levelStartIdx, numOfRowsInLevel, m_gpuDInv.data(), - factorizationBlockSize, - m_stream); + factorizationBlockSize); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index 6277118e8ae..c2b49cb71d2 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -183,19 +183,6 @@ GpuSparseMatrix::updateNonzeroValues(const MatrixType& matrix, bool copyNonZe } } -// template -// template -// void -// GpuSparseMatrix::updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) -// { -// OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); -// OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); -// OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); - -// const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); -// m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); -// } - template void GpuSparseMatrix::setUpperTriangular() diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index 5aca19f5b85..b2d4a954c50 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -293,18 +293,6 @@ class GpuSparseMatrix template void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false); - - template - void updateNonzeroValuesDirectlyInStream(const MatrixType& matrix, cudaStream_t stream) - { - OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements."); - OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize."); - OPM_ERROR_IF(matrix.N() != N(), "Matrix does not have the same number of rows."); - - const T* newNonZeroElements = static_cast(&((matrix[0][0][0][0]))); - m_nonZeroElements.copyFromHost(newNonZeroElements, nonzeroes() * blockSize() * blockSize(), stream); - } - private: GpuVector m_nonZeroElements; GpuVector m_columnIndices; diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 394f83edb9b..41abb6a6030 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -267,7 +267,7 @@ template void OpmGpuILU0::update() { - m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(m_moveThreadBlockSize); LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize); } @@ -300,8 +300,7 @@ OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuMatrixReorderedDiag.value().data(), m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedLower->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } else { detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), m_gpuMatrix.getRowIndices().data(), @@ -309,8 +308,7 @@ 
OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize) m_gpuReorderedLU->getRowIndices().data(), m_gpuNaturalToReorder.data(), m_gpuReorderedLU->N(), - moveThreadBlockSize, - m_stream); + moveThreadBlockSize); } } @@ -339,8 +337,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::ILU0::LUFactorizationSplit( @@ -358,8 +355,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } else{ detail::ILU0::LUFactorizationSplit( @@ -377,8 +373,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } } else { @@ -389,8 +384,7 @@ OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize) m_gpuReorderToNatural.data(), numOfRowsInLevel, levelStartIdx, - factorizationThreadBlockSize, - m_stream); + factorizationThreadBlockSize); } levelStartIdx += numOfRowsInLevel; } diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index d2ad2f1cd6e..8af6765df7e 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -383,21 +383,20 @@ computeDiluDiagonal(T* reorderedMat, const int startIdx, int rowsInLevelSet, T* dInv, - int thrBlockSize, - cudaStream_t stream) + int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonal, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonal<<>>(reorderedMat, - rowIndices, - colIndices, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv); + cuComputeDiluDiagonal<<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } @@ -420,37 +419,35 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiag, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int thrBlockSize, - cudaStream_t stream) + int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); cuComputeDiluDiagonalSplit - <<>>(srcReorderedLowerMat, - lowerRowIndices, - lowerColIndices, - srcReorderedUpperMat, - upperRowIndices, - upperColIndices, - srcDiagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv, - dstDiag, - dstLowerMat, - dstUpperMat); + <<>>(srcReorderedLowerMat, + lowerRowIndices, + lowerColIndices, + srcReorderedUpperMat, + upperRowIndices, + upperColIndices, + srcDiagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } 
else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void computeDiluDiagonal( \ - T*, int*, int*, int*, int*, const int, int, T*, int, cudaStream_t); \ + template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -467,8 +464,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -485,8 +481,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -503,8 +498,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -521,8 +515,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -539,8 +532,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, float*, \ float*, \ float*, \ - int, \ - cudaStream_t); \ + int); \ template void computeDiluDiagonalSplit( \ const T*, \ int*, \ @@ -557,14 +549,10 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, double*, \ double*, \ double*, \ - int, \ - cudaStream_t); \ + int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ template void solveLowerLevelSet( \ T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); -// template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ - // template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \ - // template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index dbc8d5d83fd..77a9cae6997 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -165,8 +165,7 @@ void computeDiluDiagonal(T* reorderedMat, int startIdx, int rowsInLevelSet, T* dInv, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -205,8 +204,7 @@ void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, OutputScalar* dstDiagonal, OutputScalar* dstLowerMat, OutputScalar* dstUpperMat, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); } // namespace Opm::gpuistl::detail::DILU #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 92f3a6998c6..e66c42f8a56 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ 
b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -442,13 +442,12 @@ LUFactorization(T* srcMatrix,
                 int* reorderedToNatual,
                 size_t rowsInLevelSet,
                 int startIdx,
-               int thrBlockSize,
-               cudaStream_t stream)
+               int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuLUFactorization<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+    cuLUFactorization<<<nThreadBlocks, threadBlockSize>>>(
         srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }
@@ -468,45 +467,44 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                      int* naturalToReordered,
                      const int startIdx,
                      int rowsInLevelSet,
-                     int thrBlockSize,
-                     cudaStream_t stream)
+                     int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuLUFactorizationSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
     cuLUFactorizationSplit
-        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(srcReorderedLowerMat,
-                                                        lowerRowIndices,
-                                                        lowerColIndices,
-                                                        reorderedUpperMat,
-                                                        upperRowIndices,
-                                                        upperColIndices,
-                                                        srcDiagonal,
-                                                        dstReorderedLowerMat,
-                                                        dstReorderedUpperMat,
-                                                        dstDiagonal,
-                                                        reorderedToNatural,
-                                                        naturalToReordered,
-                                                        startIdx,
-                                                        rowsInLevelSet);
+        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+                                             lowerRowIndices,
+                                             lowerColIndices,
+                                             reorderedUpperMat,
+                                             upperRowIndices,
+                                             upperColIndices,
+                                             srcDiagonal,
+                                             dstReorderedLowerMat,
+                                             dstReorderedUpperMat,
+                                             dstDiagonal,
+                                             reorderedToNatural,
+                                             naturalToReordered,
+                                             startIdx,
+                                             rowsInLevelSet);
 }

 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                  \
     template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t);                       \
     template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t);             \
-    template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int, cudaStream_t);                     \
+    template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int);                                   \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);\
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);           \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);\
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);           \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int, cudaStream_t);   \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);              \
     template void LUFactorizationSplit(                                                                            \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int, cudaStream_t);
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp index 91cf89d6376..04390ff77c2 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -157,8 +157,7 @@ void LUFactorization(T* reorderedMat, int* reorderedToNatual, size_t rowsInLevelSet, int startIdx, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); /** * TODO: update this doucmentation @@ -197,8 +196,7 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* naturalToReordered, int startIdx, int rowsInLevelSet, - int threadBlockSize, - cudaStream_t stream); + int threadBlockSize); } // namespace Opm::gpuistl::detail::ILU0 #endif From 6a5d8baa6caeccb67f24b8e8ab284732b81f19ba Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 20 Jan 2025 09:52:07 +0100 Subject: [PATCH 3/5] reduce diff --- .../detail/gpusparse_matrix_operations.cu | 53 +------------------ .../detail/gpusparse_matrix_operations.hpp | 49 ----------------- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu index 3f4a095e3f9..52b8a166eca 100644 --- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu +++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu @@ -116,24 +116,6 @@ copyMatDataToReordered(T* srcMatrix, srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); } -template -void -copyMatDataToReordered(T* srcMatrix, - int* srcRowIndices, - T* dstMatrix, - int* dstRowIndices, - int* naturalToReordered, - size_t numberOfRows, - int thrBlockSize, - cudaStream_t stream) -{ - int threadBlockSize - = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered, thrBlockSize); - int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); - cuMoveDataToReordered<<>>( - srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); -} - template void copyMatDataToReorderedSplit(T* srcMatrix, @@ -163,42 +145,9 @@ copyMatDataToReorderedSplit(T* srcMatrix, numberOfRows); } -template -void -copyMatDataToReorderedSplit(T* srcMatrix, - int* srcRowIndices, - int* srcColumnIndices, - T* dstLowerMatrix, - int* dstLowerRowIndices, - T* dstUpperMatrix, - int* dstUpperRowIndices, - T* dstDiag, - int* naturalToReordered, - size_t numberOfRows, - int thrBlockSize, - cudaStream_t stream) -{ - int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuMoveDataToReorderedSplit, thrBlockSize); - int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); - cuMoveDataToReorderedSplit<<>>(srcMatrix, - srcRowIndices, - srcColumnIndices, - dstLowerMatrix, - dstLowerRowIndices, - dstUpperMatrix, - dstUpperRowIndices, - dstDiag, - naturalToReordered, - numberOfRows); -} - - #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int); \ - template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int, cudaStream_t); \ - template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);\ 
- template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int, cudaStream_t); + template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); diff --git a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp index 90ed59e8336..3e953a5056a 100644 --- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp @@ -41,27 +41,6 @@ void copyMatDataToReordered(T* srcMatrix, size_t numberOfRows, int threadBlockSize); -// TODO: document the stream -/** - * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list - * @param srcMatrix The source matrix we will copy data from - * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant wiht bsr format - * @param [out] dstMatrix The destination matrix that we copy data to - * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination matrix compliant wiht bsr - * format - * @param naturalToReordered Permuation list that converts indices in the src matrix to the indices in the dst matrix - * @param numberOfRows The number of rows in the matrices - */ -template -void copyMatDataToReordered(T* srcMatrix, - int* srcRowIndices, - T* dstMatrix, - int* dstRowIndices, - int* naturalToReordered, - size_t numberOfRows, - int threadBlockSize, - cudaStream_t stream); - /** * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list * @param srcMatrix The source matrix we will copy data from @@ -89,33 +68,5 @@ void copyMatDataToReorderedSplit(T* srcMatrix, size_t numberOfRows, int threadBlockSize); -/** - * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list - * @param srcMatrix The source matrix we will copy data from - * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant wiht bsr format - * @param [out] dstLowerMatrix The destination of entries that originates from the strictly lower triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing rww indices for the destination lower matrix compliant wiht - * bsr format - * @param [out] dstUpperMatrix The destination of entries that originates from the strictly upper triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination upper matrix compliant wiht - * bsr format - * @param [out] dstDiag The destination buffer for the diagonal part of the matrix - * @param naturalToReordered Permuation list that converts indices in the src matrix to the indices in the dst matrix - * @param numberOfRows The number of rows in the matrices - */ -template -void copyMatDataToReorderedSplit(T* srcMatrix, - int* srcRowIndices, - int* srcColumnIndices, - T* dstLowerMatrix, - int* dstLowerRowIndices, - T* dstUpperMatrix, - int* dstUpperRowIndices, - T* dstDiag, - int* naturalToReordered, - size_t numberOfRows, - int threadBlockSize, - cudaStream_t stream); - } // namespace Opm::gpuistl::detail #endif From bca83687ccca7da8b248ae28d5c700b6b36be283 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 5 Feb 2025 10:29:07 +0100 Subject: [PATCH 
4/5] use events to ensure correctness --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 38 +++++++++++++------- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 6 +++- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 38 +++++++++++++------- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 6 +++- 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 2b616214d06..2e3fbb7e7ec 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -55,8 +55,6 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int , m_tuneThreadBlockSizes(tuneKernels) , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { - OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream)); - // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -114,6 +112,10 @@ template void GpuDILU::apply(X& v, const Y& d) { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + OPM_TIMEBLOCK(prec_apply); { const auto ptrs = std::make_pair(v.data(), d.data()); @@ -123,16 +125,20 @@ GpuDILU::apply(X& v, const Y& d) if (it == m_apply_graphs.end()) { m_apply_graphs[ptrs] = cudaGraph_t(); m_executableGraphs[ptrs] = cudaGraphExec_t(); - OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal)); + OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs])); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); } OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); } + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } template @@ -155,7 +161,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), @@ -168,7 +174,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -181,7 +187,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::DILU::solveLowerLevelSet( @@ -195,7 +201,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } levelStartIdx += numOfRowsInLevel; } @@ -217,7 +223,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int 
m_gpuDInvFloat->data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), @@ -229,7 +235,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -241,7 +247,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::DILU::solveUpperLevelSet( @@ -254,7 +260,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInv.data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } } @@ -289,9 +295,17 @@ template void GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize) { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); computeDiagonal(factorizationBlockSize); + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } template diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index bde85aff6b3..a4ba988cf4f 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -163,7 +164,10 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::map, cudaGraphExec_t> m_executableGraphs; // Stream for the DILU operations on the GPU - cudaStream_t m_stream; + GPUStream m_stream{}; + // Events for synchronization with main stream + GPUEvent m_before{}; + GPUEvent m_after{}; }; } // end namespace Opm::gpuistl diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 41abb6a6030..f5972f52e43 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -59,9 +59,6 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_tuneThreadBlockSizes(tuneKernels) , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { - - OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream)); - // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? 
// Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -121,6 +118,10 @@ OpmGpuILU0::apply(X& v, const Y& d) { OPM_TIMEBLOCK(prec_apply); { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + const auto ptrs = std::make_pair(v.data(), d.data()); auto it = m_apply_graphs.find(ptrs); @@ -128,15 +129,20 @@ OpmGpuILU0::apply(X& v, const Y& d) if (it == m_apply_graphs.end()) { m_apply_graphs[ptrs] = cudaGraph_t(); m_executableGraphs[ptrs] = cudaGraphExec_t(); - OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal)); + OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs])); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); } OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } } @@ -162,7 +168,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } else{ detail::ILU0::solveLowerLevelSetSplit( @@ -175,7 +181,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -187,7 +193,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i d.data(), v.data(), lowerSolveThreadBlockSize, - m_stream); + m_stream.get()); } levelStartIdx += numOfRowsInLevel; } @@ -208,7 +214,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiagFloat.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { detail::ILU0::solveUpperLevelSetSplit( @@ -221,7 +227,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiag.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } else{ detail::ILU0::solveUpperLevelSetSplit( @@ -234,7 +240,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i m_gpuMatrixReorderedDiag.value().data(), v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), @@ -245,7 +251,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i numOfRowsInLevel, v.data(), upperSolveThreadBlockSize, - m_stream); + m_stream.get()); } } } @@ -278,9 +284,17 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB { OPM_TIMEBLOCK(prec_update); { + // ensure that this stream only starts doing work when main stream is completed up to this point + OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0)); + 
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0)); + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu reorderAndSplitMatrix(moveThreadBlockSize); LUFactorizeMatrix(factorizationThreadBlockSize); + + // ensure that main stream only continues after this stream is completed + OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get())); + OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0)); } } diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index 064818c68fc..9e899765d83 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -161,7 +162,10 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate std::map, cudaGraphExec_t> m_executableGraphs; // Stream for the DILU operations on the GPU - cudaStream_t m_stream; + GPUStream m_stream{}; + // Events for synchronization with main stream + GPUEvent m_before{}; + GPUEvent m_after{}; }; } // end namespace Opm::gpuistl From 41e022ba310495814bdab44ce4e01aad0e4bbf96 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 5 Feb 2025 10:45:53 +0100 Subject: [PATCH 5/5] Use GPUGraph and GPUGraphExec --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 8 +++----- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 4 ++-- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 8 +++----- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 4 ++-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 2e3fbb7e7ec..2011fb2a2c9 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -123,17 +123,15 @@ GpuDILU::apply(X& v, const Y& d) auto it = m_apply_graphs.find(ptrs); if (it == m_apply_graphs.end()) { - m_apply_graphs[ptrs] = cudaGraph_t(); - m_executableGraphs[ptrs] = cudaGraphExec_t(); OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); - OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get())); + OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0)); } - OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0)); } // ensure that main stream only continues after this stream is completed diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index a4ba988cf4f..a023c05d436 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -160,8 +160,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate int m_DILUFactorizationThreadBlockSize = -1; // Graphs for Apply - std::map, cudaGraph_t> m_apply_graphs; - std::map, cudaGraphExec_t> m_executableGraphs; + std::map, GPUGraph> m_apply_graphs; + std::map, GPUGraphExec> m_executableGraphs; // Stream for the DILU operations on the GPU GPUStream m_stream{}; diff --git 
a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index f5972f52e43..42b535b7664 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -127,17 +127,15 @@ OpmGpuILU0::apply(X& v, const Y& d) auto it = m_apply_graphs.find(ptrs); if (it == m_apply_graphs.end()) { - m_apply_graphs[ptrs] = cudaGraph_t(); - m_executableGraphs[ptrs] = cudaGraphExec_t(); OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal)); // The apply functions contains lots of small function calls which call a kernel each apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize); - OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs])); - OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0)); + OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get())); + OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0)); } - OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0)); + OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0)); // ensure that main stream only continues after this stream is completed diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index 9e899765d83..e4cc3f05914 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -158,8 +158,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate int m_ILU0FactorizationThreadBlockSize = -1; // Graphs for Apply - std::map, cudaGraph_t> m_apply_graphs; - std::map, cudaGraphExec_t> m_executableGraphs; + std::map, GPUGraph> m_apply_graphs; + std::map, GPUGraphExec> m_executableGraphs; // Stream for the DILU operations on the GPU GPUStream m_stream{};
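
---

A note for readers on the pattern that patches 4 and 5 settle on. The apply path records an event
on the default stream, makes the preconditioner's own stream wait on it, captures the many small
per-level kernel launches into a CUDA graph once per (v, d) pointer pair, replays the instantiated
graph on later applies, and finally fences the default stream behind the side stream with a second
event. The standalone sketch below condenses that pattern; it is illustrative only and not part of
the series. The scale kernel, the eight-sweep loop, and the GPU_SAFE_CALL macro are stand-ins for
the level-set solve kernels and OPM_GPU_SAFE_CALL, and the raw cudaStream_t, cudaEvent_t, cudaGraph_t
and cudaGraphExec_t handles stand in for the GPUStream, GPUEvent, GPUGraph and GPUGraphExec RAII
wrappers that the patches introduce.

// Standalone sketch of the stream-capture pattern used above. Everything here
// is illustrative: scale(), the eight-sweep loop and GPU_SAFE_CALL are
// stand-ins for the level-set kernels and OPM_GPU_SAFE_CALL, and the raw
// handles stand in for the GPUStream/GPUEvent/GPUGraph/GPUGraphExec wrappers.
#include <cuda_runtime.h>
#include <cstdio>

#define GPU_SAFE_CALL(call)                                                     \
    do {                                                                        \
        cudaError_t err_ = (call);                                              \
        if (err_ != cudaSuccess) {                                              \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
        }                                                                       \
    } while (0)

// Stand-in for one small level-set kernel; a real apply launches many of these.
__global__ void scale(double* v, int n, double a)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        v[i] *= a;
    }
}

int main()
{
    const int n = 1 << 20;
    double* v = nullptr;
    GPU_SAFE_CALL(cudaMalloc(&v, n * sizeof(double)));
    GPU_SAFE_CALL(cudaMemset(v, 0, n * sizeof(double)));

    cudaStream_t stream;
    cudaEvent_t before, after;
    GPU_SAFE_CALL(cudaStreamCreate(&stream));
    GPU_SAFE_CALL(cudaEventCreate(&before));
    GPU_SAFE_CALL(cudaEventCreate(&after));

    // Fence: the capture stream must not run ahead of work already queued on
    // the default stream (stream 0), exactly like m_before in the patches.
    GPU_SAFE_CALL(cudaEventRecord(before, 0));
    GPU_SAFE_CALL(cudaStreamWaitEvent(stream, before, 0));

    // Capture once: each launch on `stream` becomes a node of the graph
    // instead of being executed immediately.
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;
    GPU_SAFE_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int level = 0; level < 8; ++level) { // stand-in for the level-set sweeps
        scale<<<(n + 255) / 256, 256, 0, stream>>>(v, n, 0.5);
    }
    GPU_SAFE_CALL(cudaStreamEndCapture(stream, &graph));
    GPU_SAFE_CALL(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

    // Replay: a single launch submits all captured kernels at once. Later
    // applies skip the capture and go straight to this call.
    GPU_SAFE_CALL(cudaGraphLaunch(graphExec, stream));

    // Fence the other way: the default stream must not continue until the
    // captured work is done, exactly like m_after in the patches.
    GPU_SAFE_CALL(cudaEventRecord(after, stream));
    GPU_SAFE_CALL(cudaStreamWaitEvent(0, after, 0));
    GPU_SAFE_CALL(cudaDeviceSynchronize());

    GPU_SAFE_CALL(cudaGraphExecDestroy(graphExec));
    GPU_SAFE_CALL(cudaGraphDestroy(graph));
    GPU_SAFE_CALL(cudaEventDestroy(after));
    GPU_SAFE_CALL(cudaEventDestroy(before));
    GPU_SAFE_CALL(cudaStreamDestroy(stream));
    GPU_SAFE_CALL(cudaFree(v));
    return 0;
}

Replaying one pre-instantiated graph removes the per-kernel launch latency that otherwise dominates
when each level set contains only a handful of rows, which is consistent with the speedup reported
in the first patch of the series; the two events are what keep the side stream correctly ordered
against callers that still submit work on the default stream.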