Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
experimenting with update
Browse files Browse the repository at this point in the history
multitalentloes committed Jan 13, 2025
1 parent f924361 commit d990b2e
Showing 1 changed file with 34 additions and 33 deletions.
67 changes: 34 additions & 33 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
Original file line number Diff line number Diff line change
@@ -52,7 +52,7 @@ class CumulativeScopeTimer {
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
total_time_spent += duration;

std::cout << "Average: " << total_time_spent / instance_count << "us/update. This update: " << duration << "us. Total time spent: " << total_time_spent / 1000.0 << " ms. " << " Applies: " << instance_count << std::endl;
std::cout << "Average: " << total_time_spent / instance_count << "us/update. This update: " << duration << "us. Total time spent: " << total_time_spent / 1000.0 << " ms. " << " Updates: " << instance_count << std::endl;
}

// Static method to report the cumulative time and instance count
@@ -145,29 +145,29 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
cudaDeviceSynchronize(); // only for timing
CumulativeScopeTimer timer; // only for timing
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// cudaDeviceSynchronize(); // only for timing
// CumulativeScopeTimer timer; // only for timing
// apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);

// const auto ptrs = std::make_pair(v.data(), d.data());
const auto ptrs = std::make_pair(v.data(), d.data());

// auto it = m_apply_graphs.find(ptrs);
auto it = m_apply_graphs.find(ptrs);

// if (it == m_apply_graphs.end()) {
// printf("\ncapturing apply\n");
// m_apply_graphs[ptrs] = cudaGraph_t();
// m_executableGraphs[ptrs] = cudaGraphExec_t();
// OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
if (it == m_apply_graphs.end()) {
printf("\ncapturing apply\n");
m_apply_graphs[ptrs] = cudaGraph_t();
m_executableGraphs[ptrs] = cudaGraphExec_t();
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));

// // The apply functions contains lots of small function calls which call a kernel each
// apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// The apply functions contains lots of small function calls which call a kernel each
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);

// OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
// OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
// }
// OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));

cudaDeviceSynchronize(); // only for timing
// cudaDeviceSynchronize(); // only for timing
}
}

@@ -291,21 +291,22 @@ GpuDILU<M, X, Y, l>::update()
OPM_TIMEBLOCK(prec_update);
{

// cudaDeviceSynchronize(); // only for timing
// CumulativeScopeTimer timer; // only for timing
// update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
if (!m_update_graph_captured)
{
m_update_graph = cudaGraph_t();
m_update_executable_graph = cudaGraphExec_t();
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
m_update_graph_captured = true;
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
// cudaDeviceSynchronize(); // only for timing
cudaDeviceSynchronize(); // only for timing
CumulativeScopeTimer timer; // only for timing
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
// m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu
// if (!m_update_graph_captured)
// {
// m_update_graph = cudaGraph_t();
// m_update_executable_graph = cudaGraphExec_t();
// OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
// computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
// OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
// OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
// m_update_graph_captured = true;
// }
// OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
cudaDeviceSynchronize(); // only for timing
}

}

0 comments on commit d990b2e

Please sign in to comment.