Merge pull request #46 from mhaseeb123/fix_1d

format 1d stencil codes and add openmp impl
mhaseeb123 · Oct 23, 2023 · 6711d32 · 6711d32
2 parents 1aef2b2 + 2a0ec55
commit 6711d32
Show file tree

Hide file tree

Showing 6 changed files with 228 additions and 159 deletions.
diff --git a/apps/1d_stencil/CMakeLists.txt b/apps/1d_stencil/CMakeLists.txt
@@ -6,20 +6,26 @@ target_include_directories(
   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 
+add_executable(stencil_omp stencil_omp.cpp)
+target_include_directories(
+  stencil_omp
+  PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
+          ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+
 add_executable(stencil_stdpar stencil_stdpar.cpp)
 target_link_libraries(stencil_stdpar stdexec)
 target_include_directories(
   stencil_stdpar
   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 
-  # TODO, fix cmake
-  add_executable(stencil_stdexec stencil_stdexec.cpp)
-  target_link_libraries(stencil_stdexec stdexec)
-  target_include_directories(
-    stencil_stdexec
-    PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
-            ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+# TODO, fix cmake
+add_executable(stencil_stdexec stencil_stdexec.cpp)
+target_link_libraries(stencil_stdexec stdexec)
+target_include_directories(
+  stencil_stdexec
+  PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
+          ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 
 if("${STDPAR}" STREQUAL "gpu")
   add_executable(stencil_cuda stencil_cuda.cpp)

diff --git a/apps/1d_stencil/stencil_cuda.cpp b/apps/1d_stencil/stencil_cuda.cpp
@@ -9,10 +9,8 @@
 struct args_params_t : public argparse::Args {
   bool& results = kwarg("results", "print generated results (default: false)")
                       .set_default(false);
-  std::uint64_t& nx =
-      kwarg("nx", "Local x dimension (of each partition)").set_default(10);
   std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
-  std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
+  std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
   bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
   double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
   double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
@@ -23,51 +21,51 @@ struct args_params_t : public argparse::Args {
   bool& time = kwarg("t, time", "print time").set_default(true);
 };
 
+using Real_t = double;
 ///////////////////////////////////////////////////////////////////////////////
 // Command-line variables
-bool header = true;        // print csv heading
-constexpr double k = 0.5;  // heat transfer coefficient
-constexpr double dt = 1.;  // time step
-constexpr double dx = 1.;  // grid spacing
+constexpr bool header = true;        // print csv heading
+constexpr Real_t k = 0.5;  // heat transfer coefficient
+constexpr Real_t dt = 1.;  // time step
+constexpr Real_t dx = 1.;  // grid spacing
 
 // Our operator
-__device__ double heat(double left, double middle, double right) {
+__device__ Real_t heat(const Real_t left, const Real_t middle, const Real_t right, 
+  const Real_t k, const Real_t dt, const Real_t dx) {
   return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
 }
 
-__global__ void heat_equation(double* current, double* next, std::size_t size) {
+__global__ void heat_equation(Real_t* current, Real_t* next, std::size_t size, 
+  const Real_t k, const Real_t dt, const Real_t dx) {
   std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (i < size) {
     std::size_t left = (i == 0) ? size - 1 : i - 1;
     std::size_t right = (i == size - 1) ? 0 : i + 1;
-    next[i] = heat(current[left], current[i], current[right]);
+    next[i] = heat(current[left], current[i], current[right], k, dt, dx);
   }
 }
 
 int benchmark(args_params_t const& args) {
-  // Parameters (for simplicity, some are hardcoded)
-  std::uint64_t np = args.np;  // Number of partitions.
-  std::uint64_t nx = args.nx;  // Number of grid points.
+  std::uint64_t size = args.size;  // Number of elements.
   std::uint64_t nt = args.nt;  // Number of steps.
-  std::size_t size = np * nx;
 
-  double* h_current = nullptr;
-  double* h_next = nullptr;
+  Real_t* h_current = nullptr;
+  Real_t* h_next = nullptr;
 
   // Measure execution time.
   Timer timer;
 
   // Memory allocation
   if (args.results) {
-    h_current = new double[size];
-    h_next = new double[size];
+    h_current = new Real_t[size];
+    h_next = new Real_t[size];
   }
 
-  double* d_current;
-  double* d_next;
-  cudaMalloc(&d_current, size * sizeof(double));
-  cudaMalloc(&d_next, size * sizeof(double));
+  Real_t* d_current;
+  Real_t* d_next;
+  cudaMalloc(&d_current, size * sizeof(Real_t));
+  cudaMalloc(&d_next, size * sizeof(Real_t));
   thrust::sequence(thrust::device, d_current, d_current + size, 0);
   thrust::sequence(thrust::device, d_next, d_next + size, 0);
 
@@ -77,25 +75,22 @@ int benchmark(args_params_t const& args) {
 
   // Actual time step loop
   for (std::size_t t = 0; t < nt; ++t) {
-    heat_equation<<<blocks, threadsPerBlock>>>(d_current, d_next, size);
+    heat_equation<<<blocks, threadsPerBlock>>>(d_current, d_next, size, k, dt, dx);
     std::swap(d_current, d_next);
   }
   cudaDeviceSynchronize();
   auto time = timer.stop();
 
   if (args.results) {
     // Copy result back to host
-    cudaMemcpy(h_current, d_current, size * sizeof(double),
+    cudaMemcpy(h_current, d_current, size * sizeof(Real_t),
                cudaMemcpyDeviceToHost);
 
     // Print results
-    for (std::size_t i = 0; i < np; ++i) {
-      std::cout << "U[" << i << "] = {";
-      for (std::size_t j = 0; j < nx; ++j) {
-        std::cout << h_current[i * nx + j] << " ";
-      }
-      std::cout << "}\n";
+    for (std::size_t i = 0; i != size; ++i) {
+      std::cout << h_current[i] << " ";
     }
+    std::cout << "\n";
     // Cleanup
     delete[] h_current;
     delete[] h_next;

diff --git a/apps/1d_stencil/stencil_omp.cpp b/apps/1d_stencil/stencil_omp.cpp
@@ -0,0 +1,130 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2023 Weile Wei 
+ * Copyright (c) 2023 The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of any
+ * required approvals from the U.S. Dept. of Energy).All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "argparse/argparse.hpp"
+#include "commons.hpp"
+#include <omp.h>
+
+// parameters
+struct args_params_t : public argparse::Args {
+  bool& results = kwarg("results", "print generated results (default: false)")
+                      .set_default(false);
+  std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
+  std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
+  int& nthreads = kwarg("nthreads", "Number of openmp threads").set_default(1);
+  bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
+  double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
+  double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
+  bool& help = flag("h, help", "print help");
+  bool& time = kwarg("t, time", "print time").set_default(true);
+};
+
+using Real_t = double;
+///////////////////////////////////////////////////////////////////////////////
+// Command-line variables
+constexpr Real_t k = 0.5;      // heat transfer coefficient
+constexpr Real_t dt = 1.;      // time step
+constexpr Real_t dx = 1.;      // grid spacing
+
+///////////////////////////////////////////////////////////////////////////////
+//[stepper_1
+struct stepper {
+  // Our operator
+  Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k,
+              const Real_t dt = ::dt, const Real_t dx = ::dx) {
+    return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
+  }
+
+  // do all the work on 'size' data points for 'nt' time steps 
+  [[nodiscard]] std::vector<Real_t> do_work(const std::size_t size, const std::size_t nt, const int nthreads) {
+    std::vector<Real_t> current(size);
+    std::vector<Real_t> next(size);
+
+   #pragma omp parallel for num_threads(nthreads)
+    for (std::size_t i = 0; i < size; ++i) {
+        current[i] = Real_t(i);
+    }
+
+    // Actual time step loop
+    for (std::size_t t = 0; t != nt; ++t) {
+      // OpenMP parallel for loop
+      #pragma omp parallel for num_threads(nthreads)
+      for (std::size_t i = 0; i < size; ++i) {
+        std::size_t left = (i == 0) ? size - 1 : i - 1;
+        std::size_t right = (i == size - 1) ? 0 : i + 1;
+        next[i] = heat(current[left], current[i], current[right], k, dt, dx);
+      }
+      std::swap(current, next);
+    }
+
+    return current;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+int benchmark(args_params_t const& args) {
+  std::uint64_t size = args.size;  // Number of elements.
+  std::uint64_t nt = args.nt;  // Number of steps.
+  int nthreads = args.nthreads;
+
+  // Create the stepper object
+  stepper step;
+
+  // Measure execution time.
+  Timer timer;
+
+  auto solution = step.do_work(size, nt, nthreads);
+  auto time = timer.stop();
+
+  // Print the final solution
+  if (args.results) {
+    for (const auto& ele: solution) {
+      std::cout << ele << " ";
+    }
+    std::cout << "\n";
+  }
+
+  if (args.time) {
+    std::cout << "Duration: " << time << " ms."
+              << "\n";
+  }
+
+  return 0;
+}
+
+int main(int argc, char* argv[]) {
+  // parse params
+  args_params_t args = argparse::parse<args_params_t>(argc, argv);
+  // see if help wanted
+  if (args.help) {
+    args.print();  // prints all variables
+    return 0;
+  }
+
+  benchmark(args);
+
+  return 0;
+}