diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
index b4132ce09f..49ce121e1f 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -188,6 +188,7 @@ GpuBuffer::copyFromHost(const std::vector& data)
 {
     copyFromHost(data.data(), data.size());
 }
+
 template
 void
 GpuBuffer::copyToHost(std::vector& data) const
diff --git a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
index 369ea3ba28..1ba5cb8612 100644
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 
 namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index d311386b00..2b616214d0 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 
 namespace Opm::gpuistl
 {
@@ -54,6 +55,8 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
     , m_tuneThreadBlockSizes(tuneKernels)
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }
 
-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
@@ -135,7 +154,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInvFloat->data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::DILU::solveLowerLevelSetSplit(
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveLowerLevelSetSplit(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         }
     } else {
         detail::DILU::solveLowerLevelSet(
@@ -172,7 +194,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             m_gpuDInv.data(),
             d.data(),
             v.data(),
-            lowerSolveThreadBlockSize);
+            lowerSolveThreadBlockSize,
+            m_stream);
         }
         levelStartIdx += numOfRowsInLevel;
     }
@@ -193,7 +216,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInvFloat->data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
             detail::DILU::solveUpperLevelSetSplit(
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +228,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveUpperLevelSetSplit(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +240,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         }
     } else {
         detail::DILU::solveUpperLevelSet(
@@ -227,7 +253,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             numOfRowsInLevel,
             m_gpuDInv.data(),
             v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream);
         }
     }
 }
@@ -252,7 +279,9 @@ GpuDILU::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+        reorderAndSplitMatrix(m_moveThreadBlockSize);
+        computeDiagonal(m_DILUFactorizationThreadBlockSize);
     }
 }
@@ -260,13 +289,14 @@ template
 void
 GpuDILU::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
-    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(moveThreadBlockSize);
+    computeDiagonal(factorizationBlockSize);
 }
 
 template
 void
-GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit(
@@ -290,7 +320,12 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
             m_gpuMatrixReordered->N(),
             moveThreadBlockSize);
     }
+}
 
+template
+void
+GpuDILU::computeDiagonal(int factorizationBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
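The change to GpuDILU::apply above uses the standard CUDA capture-and-replay idiom: every kernel launched on m_stream between cudaStreamBeginCapture and cudaStreamEndCapture is recorded into a graph instead of executed, and subsequent calls replay the instantiated graph with a single launch, amortizing the per-kernel launch overhead of the many small level-set kernels. A minimal standalone sketch of the idiom, outside OPM (the names myKernel, runCaptured, and CUDA_CHECK are illustrative, not part of this change; the five-argument cudaGraphInstantiate is the CUDA 11-style signature used by this PR):

    #include <cuda_runtime.h>
    #include <cstdio>

    #define CUDA_CHECK(call)                                              \
        do {                                                              \
            cudaError_t err = (call);                                     \
            if (err != cudaSuccess) {                                     \
                std::printf("CUDA error: %s\n", cudaGetErrorString(err)); \
            }                                                             \
        } while (0)

    __global__ void myKernel(double* x, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= 2.0;
        }
    }

    // Capture the launch sequence once, then replay the executable graph.
    void runCaptured(double* x, int n, cudaStream_t stream,
                     cudaGraphExec_t& exec, bool& captured)
    {
        if (!captured) {
            cudaGraph_t graph;
            CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
            // Launches on `stream` between Begin/EndCapture are recorded, not executed.
            myKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
            CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
            CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
            CUDA_CHECK(cudaGraphDestroy(graph)); // the executable graph keeps its own copy
            captured = true;
        }
        CUDA_CHECK(cudaGraphLaunch(exec, stream));
    }

Because a captured graph bakes in the device pointers seen during capture, the PR keys its cached graphs on the (v.data(), d.data()) pair and captures a fresh graph whenever apply is called with vectors it has not seen before.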
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
index 241eef3e98..bde85aff6b 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -25,7 +25,8 @@
 #include
 #include
 #include
-
+#include
+#include
 
 
 namespace Opm::gpuistl
@@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void computeDiagonal(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_DILUFactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map, cudaGraph_t> m_apply_graphs;
+    std::map, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the DILU operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.cpp b/opm/simulators/linalg/gpuistl/GpuVector.cpp
index ddafc1dbfd..af39e99419 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.cpp
@@ -266,6 +266,19 @@ GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements)
     OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }
 
+template
+void
+GpuVector::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > dim()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template
 void
 GpuVector::copyToHost(T* dataPointer, size_t numberOfElements) const
diff --git a/opm/simulators/linalg/gpuistl/GpuVector.hpp b/opm/simulators/linalg/gpuistl/GpuVector.hpp
index e369d1d9d1..66a1ebd65e 100644
--- a/opm/simulators/linalg/gpuistl/GpuVector.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
      * @note assumes that this vector has numberOfElements elements
      */
     void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
 
     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
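The stream-aware copyFromHost overload added above is essentially a bounds-checked cudaMemcpyAsync. A reduced sketch of the same idea, with the OPM-specific error handling replaced by an assert (copyToDeviceAsync is a hypothetical stand-in, not the GpuVector API):

    #include <cuda_runtime.h>
    #include <cassert>
    #include <cstddef>

    // Enqueue a host-to-device copy on `stream` instead of blocking the host.
    template <class T>
    void copyToDeviceAsync(T* devicePtr, const T* hostPtr, std::size_t count,
                           std::size_t capacity, cudaStream_t stream)
    {
        assert(count <= capacity && "requested copy exceeds buffer size");
        cudaMemcpyAsync(devicePtr, hostPtr, count * sizeof(T),
                        cudaMemcpyHostToDevice, stream);
        // Work queued later on `stream` is ordered after this copy; synchronize
        // the stream (or an event) before reusing or freeing hostPtr.
    }

Two caveats worth keeping in mind: the copy is only truly asynchronous when the host buffer is page-locked (cudaMallocHost/cudaHostRegister), and inside a stream-capture region the copy is recorded into the graph rather than executed immediately.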
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
index 686b8f09f4..41abb6a603 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+
 
 namespace Opm::gpuistl
 {
@@ -59,6 +60,8 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 
 {
+    OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +101,8 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         }
     }
 
-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -117,7 +121,22 @@ OpmGpuILU0::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
     }
 }
@@ -142,7 +161,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
             } else{
                 detail::ILU0::solveLowerLevelSetSplit(
@@ -154,7 +174,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
             }
         } else {
             detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +186,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         }
         levelStartIdx += numOfRowsInLevel;
     }
@@ -185,7 +207,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiagFloat.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
                 detail::ILU0::solveUpperLevelSetSplit(
@@ -197,7 +220,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             } else{
                 detail::ILU0::solveUpperLevelSetSplit(
@@ -209,7 +233,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
             }
         } else {
             detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -219,7 +244,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 levelStartIdx,
                 numOfRowsInLevel,
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         }
     }
 }
@@ -241,10 +267,9 @@ template
 void
 OpmGpuILU0::update()
 {
-    OPM_TIMEBLOCK(prec_update);
-    {
-        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
-    }
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 }
 
 template
@@ -254,12 +279,14 @@ OpmGpuILU0::update(int moveThreadBlockSize, int factorizationThreadB
     OPM_TIMEBLOCK(prec_update);
     {
         m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+        reorderAndSplitMatrix(moveThreadBlockSize);
+        LUFactorizeMatrix(factorizationThreadBlockSize);
     }
 }
+
 template
 void
-OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
+OpmGpuILU0::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit(
@@ -283,6 +310,12 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
             m_gpuReorderedLU->N(),
             moveThreadBlockSize);
     }
+}
+
+template
+void
+OpmGpuILU0::LUFactorizeMatrix(int factorizationThreadBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index c163e04832..064818c68f 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -84,8 +84,11 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -152,6 +155,13 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_ILU0FactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map, cudaGraph_t> m_apply_graphs;
+    std::map, cudaGraphExec_t> m_executableGraphs;
+
+    // Stream for the ILU0 operations on the GPU
+    cudaStream_t m_stream;
 };
 } // end namespace Opm::gpuistl
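All kernel wrappers in the files that follow gain a trailing cudaStream_t parameter, and every launch moves off the default stream onto the stream given as the fourth launch-configuration argument; this is what makes the launches both overlappable and capturable into the per-preconditioner graph. A reduced example of the launch pattern (axpyKernel is a placeholder, not one of the OPM kernels):

    #include <cuda_runtime.h>

    __global__ void axpyKernel(int n, double a, const double* x, double* y)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] += a * x[i];
        }
    }

    // <<<grid, block, sharedMemBytes, stream>>>: passing the stream explicitly
    // keeps the launch off the default stream, which is required both for
    // overlapping independent work and for cudaGraph stream capture.
    void axpy(int n, double a, const double* x, double* y, cudaStream_t stream)
    {
        const int block = 256;
        const int grid = (n + block - 1) / block;
        axpyKernel<<<grid, block, 0, stream>>>(n, a, x, y);
    }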
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index b8f9f48ca5..8af6765df7 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -85,10 +85,12 @@ namespace
         // TODO: removce the first condition in the for loop
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            mmvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
@@ -137,10 +139,12 @@ namespace
         LinearSolverScalar rhs[blocksize] = {0};
         for (int block = nnzIdx; block < nnzIdxLim; ++block) {
             const int col = colIndices[block];
-            umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            umvMixedGeneral(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
         }
 
-        mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mmvMixedGeneral(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
     }
 }
@@ -211,8 +215,8 @@ namespace
         }
     }
 
-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TOOD: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results. TODO: The important part is to only cast after that is fully computed
     template
     __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                                int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
             InputScalar dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
                 for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j]
+                        = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                 }
             }
@@ -257,21 +262,26 @@ namespace
                 if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
-                    moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                    moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedLowerMat[block * blocksize * blocksize],
+                        &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock(
+                        &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                        &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
                 }
 
                 mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize],
-                                &dInv[col * blocksize * blocksize],
-                                &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
-                                dInvTmp);
+                    &dInv[col * blocksize * blocksize],
+                    &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                    dInvTmp);
             }
 
             invBlockInPlace(dInvTmp);
 
             moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
 
             if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
-                moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
+                moveBlock(
+                    dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
             }
         }
     }
@@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
                    const T* dInv,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 
@@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         const DiagonalScalar* dInv,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* dInv,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
@@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
 template
@@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuComputeDiluDiagonalSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuComputeDiluDiagonalSplit<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
-                                                                   lowerRowIndices,
-                                                                   lowerColIndices,
-                                                                   srcReorderedUpperMat,
-                                                                   upperRowIndices,
-                                                                   upperColIndices,
-                                                                   srcDiagonal,
-                                                                   reorderedToNatural,
-                                                                   naturalToReordered,
-                                                                   startIdx,
-                                                                   rowsInLevelSet,
-                                                                   dInv,
-                                                                   dstDiag,
-                                                                   dstLowerMat,
-                                                                   dstUpperMat);
+    cuComputeDiluDiagonalSplit
+        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+                                             lowerRowIndices,
+                                             lowerColIndices,
+                                             srcReorderedUpperMat,
+                                             upperRowIndices,
+                                             upperColIndices,
+                                             srcDiagonal,
+                                             reorderedToNatural,
+                                             naturalToReordered,
+                                             startIdx,
+                                             rowsInLevelSet,
+                                             dInv,
+                                             dstDiag,
+                                             dstLowerMat,
+                                             dstUpperMat);
     } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
     }
 }
-// TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
     template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
+    template void computeDiluDiagonalSplit( \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
+    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet( \
+        T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
 
 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
 INSTANTIATE_KERNEL_WRAPPERS(float, 3);
 INSTANTIATE_KERNEL_WRAPPERS(float, 4);
 INSTANTIATE_KERNEL_WRAPPERS(float, 5);
 INSTANTIATE_KERNEL_WRAPPERS(float, 6);
 INSTANTIATE_KERNEL_WRAPPERS(double, 1);
 INSTANTIATE_KERNEL_WRAPPERS(double, 2);
 INSTANTIATE_KERNEL_WRAPPERS(double, 3);
@@ -460,19 +567,29 @@
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 
-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
-    template void solveUpperLevelSetSplit( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
-    template void solveLowerLevelSetSplit( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
+    template void solveUpperLevelSetSplit( \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
+    template void solveLowerLevelSetSplit( \
+        MatrixScalar*, \
+        int*, \
+        int*, \
+        int*, \
+        int, \
+        int, \
+        const DiagonalScalar*, \
+        const LinearSolverScalar*, \
+        LinearSolverScalar*, \
+        int, \
+        cudaStream_t);
 
 // TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
-    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
 
 INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
index b7141834f1..77a9cae699 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
@@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         const T* dInv,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              const DiagonalScalar* dInv,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* dInv,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
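Most of the churn in the instantiation macros above comes from one mechanical fact: the kernel wrappers are templates defined in a .cu translation unit, so every (scalar-type, block-size) combination used by callers must be instantiated explicitly, and appending cudaStream_t to a signature means touching every instantiation line. A toy version of the technique (scaleBlocks and INSTANTIATE_SCALE are hypothetical, not OPM names):

    #include <cuda_runtime.h>

    // Template defined only in this .cu file; headers just declare it.
    template <class T, int blocksize>
    void scaleBlocks(T* data, int nBlocks, cudaStream_t stream)
    {
        // ... launch a kernel on `stream` ...
    }

    // Explicit instantiation emits object code for every combination callers
    // are allowed to use; adding a parameter means updating every line here.
    #define INSTANTIATE_SCALE(T)                                \
        template void scaleBlocks<T, 1>(T*, int, cudaStream_t); \
        template void scaleBlocks<T, 2>(T*, int, cudaStream_t); \
        template void scaleBlocks<T, 3>(T*, int, cudaStream_t);

    INSTANTIATE_SCALE(float)
    INSTANTIATE_SCALE(double)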
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
index 46183efd23..e66c42f8a5 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -215,9 +215,9 @@ namespace
         // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
         // and sometimes also the diagonal.
         if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-            // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-            // if not then we just use the double diagonal that is already now stored in srcDiagonal
-            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+            // if we want to store the entire matrix as a float, then we must also move the diagonal block from
+            // double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                 moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]);
             }
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int startIdx,
                    int rowsInLevelSet,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }
 
@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
 
 template
@@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 }
 
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \
-    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
+    template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
     template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
+    template void LUFactorizationSplit( \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
+    template void LUFactorizationSplit( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
-    template void LUFactorizationSplit( \
-        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
-
-#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
-    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
+    template void LUFactorizationSplit( \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
+
+#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 6);
 
 INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
 INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
 
 #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
     /* double preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
     template void solveLowerLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, float*, int); \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
     \
     /* double preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const double*, float*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
     /* double preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        double*, int*, int*, int*, int, int, const float*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, double*, int); \
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
-    template void solveUpperLevelSetSplit( \
-        float*, int*, int*, int*, int, int, const float*, float*, int);
+    template void solveUpperLevelSetSplit( \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
 
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index cdd78ec910..04390ff77c 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int startIdx,
                         int rowsInLevelSet,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);
 
 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                              int rowsInLevelSet,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);
 
 /**
  * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal
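For context on what these solve routines implement: the rows of the triangular factors are grouped into level sets such that rows within one set have no dependencies on each other, so each set becomes a single kernel launch and the sets are processed in dependency order, exactly as in the levelStartIdx loops in GpuDILU.cpp and OpmGpuILU0.cpp above. A schematic host-side loop (solveOneLevel is a hypothetical stand-in for the per-level solve wrappers):

    #include <vector>
    #include <cuda_runtime.h>

    void solveOneLevel(int startIdx, int rowsInLevel, cudaStream_t stream)
    {
        // ... launch the triangular-solve kernel for this level on `stream` ...
    }

    // Rows inside one level set are independent and handled by one launch;
    // issuing every launch on the same stream preserves the ordering between
    // level sets without any host-side synchronization, which is also what
    // allows the whole sequence to be captured into a single CUDA graph.
    void lowerSolve(const std::vector<int>& levelSetSizes, cudaStream_t stream)
    {
        int levelStartIdx = 0;
        for (const int rowsInLevel : levelSetSizes) {
            solveOneLevel(levelStartIdx, rowsInLevel, stream);
            levelStartIdx += rowsInLevel;
        }
    }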