Improve gpuistl using cudaGraphs #5852

Open · wants to merge 3 commits into base: master
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}

template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>


namespace Opm::gpuistl
63 changes: 49 additions & 14 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>

namespace Opm::gpuistl
{

@@ -54,6 +55,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
OPM_GPU_SAFE_CALL(cudaStreamCreate(&m_stream));

// TODO: Should in some way verify that this matrix is symmetric; maybe only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -93,7 +96,8 @@
}
}

computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);

if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@@ -112,7 +116,22 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());

auto it = m_apply_graphs.find(ptrs);

if (it == m_apply_graphs.end()) {
m_apply_graphs[ptrs] = cudaGraph_t();
m_executableGraphs[ptrs] = cudaGraphExec_t();
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));

// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);

OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
}
}
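For readers unfamiliar with CUDA graphs, the hunk above follows the standard stream-capture pattern: the first call for a given vector pair records every kernel launched by the inner apply() into a graph, instantiates it once, and later calls replay the whole sequence with a single graph launch. A minimal, self-contained sketch of that pattern (placeholder scaleKernel and error checking elided; not the PR's actual kernels):

#include <cuda_runtime.h>

__global__ void scaleKernel(double* x, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0;
    }
}

void applyWithGraph(cudaStream_t stream, double* x, int n)
{
    static cudaGraph_t graph;
    static cudaGraphExec_t exec;
    static bool captured = false;

    if (!captured) {
        // Kernels launched on `stream` between BeginCapture and EndCapture
        // are recorded into the graph instead of being executed.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        captured = true;
    }
    // Replaying the instantiated graph costs a single launch, however many
    // kernels were recorded, which is the overhead saving this PR targets.
    cudaGraphLaunch(exec, stream);
}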

@@ -135,7 +154,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +167,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +180,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@@ -172,7 +194,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream);
}
levelStartIdx += numOfRowsInLevel;
}
@@ -193,7 +216,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +228,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +240,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@@ -227,7 +253,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream);
}
}
}
@@ -252,21 +279,24 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -290,7 +320,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}

template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
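A subtlety worth spelling out, as I read the code above (the PR itself does not state this): a captured graph bakes the device pointers of its kernel arguments into the executable, so update() can refresh the matrix values in place without invalidating the cached graphs, while any path that reallocated the underlying buffers would require a fresh capture. A commented sketch of that invariant, with hypothetical names:

// Hypothetical illustration of the pointer-stability invariant; not PR code.
void refreshAndReplay(cudaGraphExec_t exec, cudaStream_t stream,
                      double* deviceValues, const double* hostValues,
                      size_t count)
{
    // OK: writes new values into the same device allocation, so the
    // addresses recorded in the graph still point at current data.
    cudaMemcpyAsync(deviceValues, hostValues, count * sizeof(double),
                    cudaMemcpyHostToDevice, stream);

    // Replays the captured kernel sequence against the refreshed values.
    cudaGraphLaunch(exec, stream);

    // NOT OK (would require re-capture): cudaFree(deviceValues) followed
    // by a new cudaMalloc, since the old address is baked into `exec`.
}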
15 changes: 13 additions & 2 deletions opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -25,7 +25,8 @@
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <vector>

#include <map>
#include <utility>


namespace Opm::gpuistl
@@ -82,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
//! \brief Updates the matrix data.
void update() final;

//! \brief Perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);

//! \brief Compute the diagonal of the DILU factorization
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);

//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@@ -153,6 +157,13 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;

// Cached CUDA graphs for apply(), keyed on the (v.data(), d.data()) pointer pair
std::map<std::pair<field_type*, const field_type*>, cudaGraph_t> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, cudaGraphExec_t> m_executableGraphs;

// Stream for the DILU operations on the GPU
cudaStream_t m_stream;
};
} // end namespace Opm::gpuistl
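One design note on the members above: since a graph replays against the exact device addresses it was captured with, the cache holds one executable graph per distinct (v.data(), d.data()) pair, and a miss triggers a fresh capture in apply(). A reduced sketch of that lookup, mirroring the key shape used here (standalone illustration, not PR code):

#include <cuda_runtime.h>
#include <map>
#include <utility>

using GraphKey = std::pair<double*, const double*>; // (v.data(), d.data())

struct ApplyGraphCache {
    std::map<GraphKey, cudaGraphExec_t> executables;

    // True when apply() must capture a new graph for this vector pair.
    bool needsCapture(double* v, const double* d) const
    {
        return executables.find({v, d}) == executables.end();
    }
};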

13 changes: 13 additions & 0 deletions opm/simulators/linalg/gpuistl/GpuVector.cpp
@@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}

template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
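A usage sketch for the stream-aware overload added above (the element count and the GpuVector size constructor are assumptions for illustration): the copy is only enqueued on the stream, so the caller must synchronize before relying on the data, and being asynchronous is also what allows the copy to be recorded during stream capture, unlike the blocking cudaMemcpy variant.

std::vector<double> hostData(1000, 1.0);
Opm::gpuistl::GpuVector<double> deviceVec(1000);

cudaStream_t stream;
cudaStreamCreate(&stream);

// Enqueue the host-to-device copy; it may still be in flight on return.
deviceVec.copyFromHost(hostData.data(), hostData.size(), stream);

// ... enqueue kernels on `stream` that consume deviceVec ...

cudaStreamSynchronize(stream); // wait until the copy (and kernels) finish
cudaStreamDestroy(stream);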
1 change: 1 addition & 0 deletions opm/simulators/linalg/gpuistl/GpuVector.hpp
@@ -203,6 +203,7 @@ class GpuVector
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
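/**
* @brief Asynchronous variant of copyFromHost that enqueues the copy on the given stream
* @note the copy may still be in flight when this call returns; synchronize the stream before using the data
*/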
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);

/**
* @brief copyToHost copies numberOfElements to the CPU memory dataPointer