experimenting with update

OPM · Jan 13, 2025 · d990b2e · d990b2e
1 parent f924361
commit d990b2e
Showing 1 changed file with 34 additions and 33 deletions.
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -52,7 +52,7 @@ class CumulativeScopeTimer {
         auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
         total_time_spent += duration;
 
-        std::cout << "Average: " << total_time_spent / instance_count << "us/update. This update: " << duration << "us. Total time spent: " << total_time_spent / 1000.0 << " ms. " << " Applies: " << instance_count << std::endl;
+        std::cout << "Average: " << total_time_spent / instance_count << "us/update. This update: " << duration << "us. Total time spent: " << total_time_spent / 1000.0 << " ms. " << " Updates: " << instance_count << std::endl;
     }
 
     // Static method to report the cumulative time and instance count
@@ -145,29 +145,29 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        cudaDeviceSynchronize(); // only for timing
-        CumulativeScopeTimer timer; // only for timing
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        // cudaDeviceSynchronize(); // only for timing
+        // CumulativeScopeTimer timer; // only for timing
+        // apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
 
-        // const auto ptrs = std::make_pair(v.data(), d.data());
+        const auto ptrs = std::make_pair(v.data(), d.data());
 
-        // auto it = m_apply_graphs.find(ptrs);
+        auto it = m_apply_graphs.find(ptrs);
 
-        // if (it == m_apply_graphs.end()) {
-        //     printf("\ncapturing apply\n");
-        //     m_apply_graphs[ptrs] = cudaGraph_t();
-        //     m_executableGraphs[ptrs] = cudaGraphExec_t();
-        //     OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+        if (it == m_apply_graphs.end()) {
+            printf("\ncapturing apply\n");
+            m_apply_graphs[ptrs] = cudaGraph_t();
+            m_executableGraphs[ptrs] = cudaGraphExec_t();
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
 
-        //     // The apply functions contains lots of small function calls which call a kernel each
-        //     apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+            // The apply functions contains lots of small function calls which call a kernel each
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
 
-        //     OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
-        //     OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
-        // }
-        // OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_apply_graphs[ptrs]));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs], m_apply_graphs[ptrs], nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs], 0));
 
-        cudaDeviceSynchronize(); // only for timing
+        // cudaDeviceSynchronize(); // only for timing
     }
 }
 
@@ -291,21 +291,22 @@ GpuDILU<M, X, Y, l>::update()
     OPM_TIMEBLOCK(prec_update);
     {
 
-        // cudaDeviceSynchronize(); // only for timing
-        // CumulativeScopeTimer timer; // only for timing
-        // update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
-        if (!m_update_graph_captured)
-        {
-            m_update_graph = cudaGraph_t();
-            m_update_executable_graph = cudaGraphExec_t();
-            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
-            update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
-            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
-            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
-            m_update_graph_captured = true;
-        }
-        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
-        // cudaDeviceSynchronize(); // only for timing
+        cudaDeviceSynchronize(); // only for timing
+        CumulativeScopeTimer timer; // only for timing
+        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        // m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu
+        // if (!m_update_graph_captured)
+        // {
+        //     m_update_graph = cudaGraph_t();
+        //     m_update_executable_graph = cudaGraphExec_t();
+        //     OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+        //     computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        //     OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
+        //     OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
+        //     m_update_graph_captured = true;
+        // }
+        // OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
+        cudaDeviceSynchronize(); // only for timing
     }
 
 }