This repository has been archived by the owner on Nov 2, 2023. It is now read-only.

format 1d stencil codes and add openmp impl #46

Merged
merged 4 commits on Oct 23, 2023
Changes from all commits
20 changes: 13 additions & 7 deletions apps/1d_stencil/CMakeLists.txt
@@ -6,20 +6,26 @@ target_include_directories(
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

add_executable(stencil_omp stencil_omp.cpp)
target_include_directories(
stencil_omp
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

add_executable(stencil_stdpar stencil_stdpar.cpp)
target_link_libraries(stencil_stdpar stdexec)
target_include_directories(
stencil_stdpar
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

# TODO, fix cmake
add_executable(stencil_stdexec stencil_stdexec.cpp)
target_link_libraries(stencil_stdexec stdexec)
target_include_directories(
stencil_stdexec
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

if("${STDPAR}" STREQUAL "gpu")
add_executable(stencil_cuda stencil_cuda.cpp)
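# Note (sketch, not part of this diff): the new stencil_omp target includes
# <omp.h> and uses OpenMP pragmas, but no OpenMP compile/link flags are added
# above, so the pragmas would likely be ignored. Assuming CMake's standard
# FindOpenMP module, something along these lines is probably needed:
find_package(OpenMP REQUIRED)
target_link_libraries(stencil_omp OpenMP::OpenMP_CXX)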
55 changes: 25 additions & 30 deletions apps/1d_stencil/stencil_cuda.cpp
@@ -9,10 +9,8 @@
struct args_params_t : public argparse::Args {
bool& results = kwarg("results", "print generated results (default: false)")
.set_default(false);
std::uint64_t& nx =
kwarg("nx", "Local x dimension (of each partition)").set_default(10);
std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
double& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
@@ -23,51 +21,51 @@ struct args_params_t : public argparse::Args {
bool& time = kwarg("t, time", "print time").set_default(true);
};

using Real_t = double;
///////////////////////////////////////////////////////////////////////////////
// Command-line variables
bool header = true; // print csv heading
constexpr double k = 0.5; // heat transfer coefficient
constexpr double dt = 1.; // time step
constexpr double dx = 1.; // grid spacing
constexpr bool header = true; // print csv heading
constexpr Real_t k = 0.5; // heat transfer coefficient
constexpr Real_t dt = 1.; // time step
constexpr Real_t dx = 1.; // grid spacing

// Our operator
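// Forward-time, central-space (FTCS) update for the 1D heat equation:
//   u_i(t+dt) = u_i(t) + k*dt/dx^2 * (u_{i-1}(t) - 2*u_i(t) + u_{i+1}(t))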
__device__ double heat(double left, double middle, double right) {
__device__ Real_t heat(const Real_t left, const Real_t middle, const Real_t right,
const Real_t k, const Real_t dt, const Real_t dx) {
return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
}

__global__ void heat_equation(double* current, double* next, std::size_t size) {
__global__ void heat_equation(Real_t* current, Real_t* next, std::size_t size,
const Real_t k, const Real_t dt, const Real_t dx) {
std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;

if (i < size) {
std::size_t left = (i == 0) ? size - 1 : i - 1;
std::size_t right = (i == size - 1) ? 0 : i + 1;
next[i] = heat(current[left], current[i], current[right]);
next[i] = heat(current[left], current[i], current[right], k, dt, dx);
}
}

int benchmark(args_params_t const& args) {
// Parameters (for simplicity, some are hardcoded)
std::uint64_t np = args.np; // Number of partitions.
std::uint64_t nx = args.nx; // Number of grid points.
std::uint64_t size = args.size; // Number of elements.
std::uint64_t nt = args.nt; // Number of steps.
std::size_t size = np * nx;

double* h_current = nullptr;
double* h_next = nullptr;
Real_t* h_current = nullptr;
Real_t* h_next = nullptr;

// Measure execution time.
Timer timer;

// Memory allocation
if (args.results) {
h_current = new double[size];
h_next = new double[size];
h_current = new Real_t[size];
h_next = new Real_t[size];
}

double* d_current;
double* d_next;
cudaMalloc(&d_current, size * sizeof(double));
cudaMalloc(&d_next, size * sizeof(double));
Real_t* d_current;
Real_t* d_next;
cudaMalloc(&d_current, size * sizeof(Real_t));
cudaMalloc(&d_next, size * sizeof(Real_t));
thrust::sequence(thrust::device, d_current, d_current + size, 0);
thrust::sequence(thrust::device, d_next, d_next + size, 0);

@@ -77,25 +75,22 @@ int benchmark(args_params_t const& args) {

// Actual time step loop
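// One kernel launch per step; the device buffers are swapped on the host
// between launches. Kernels on the default stream execute in order, so a
// single cudaDeviceSynchronize() after the loop is enough before stopping
// the timer and copying results back.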
for (std::size_t t = 0; t < nt; ++t) {
heat_equation<<<blocks, threadsPerBlock>>>(d_current, d_next, size);
heat_equation<<<blocks, threadsPerBlock>>>(d_current, d_next, size, k, dt, dx);
std::swap(d_current, d_next);
}
cudaDeviceSynchronize();
auto time = timer.stop();

if (args.results) {
// Copy result back to host
cudaMemcpy(h_current, d_current, size * sizeof(double),
cudaMemcpy(h_current, d_current, size * sizeof(Real_t),
cudaMemcpyDeviceToHost);

// Print results
for (std::size_t i = 0; i < np; ++i) {
std::cout << "U[" << i << "] = {";
for (std::size_t j = 0; j < nx; ++j) {
std::cout << h_current[i * nx + j] << " ";
}
std::cout << "}\n";
for (std::size_t i = 0; i != size; ++i) {
std::cout << h_current[i] << " ";
}
std::cout << "\n";
// Cleanup
delete[] h_current;
delete[] h_next;
130 changes: 130 additions & 0 deletions apps/1d_stencil/stencil_omp.cpp
@@ -0,0 +1,130 @@
/*
* MIT License
*
* Copyright (c) 2023 Weile Wei
* Copyright (c) 2023 The Regents of the University of California,
* through Lawrence Berkeley National Laboratory (subject to receipt of any
* required approvals from the U.S. Dept. of Energy).All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "argparse/argparse.hpp"
#include "commons.hpp"
#include <omp.h>

// parameters
struct args_params_t : public argparse::Args {
bool& results = kwarg("results", "print generated results (default: false)")
.set_default(false);
std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
int& nthreads = kwarg("nthreads", "Number of OpenMP threads").set_default(1);
double& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
};

using Real_t = double;
///////////////////////////////////////////////////////////////////////////////
// Command-line variables
constexpr Real_t k = 0.5; // heat transfer coefficient
constexpr Real_t dt = 1.; // time step
constexpr Real_t dx = 1.; // grid spacing

///////////////////////////////////////////////////////////////////////////////
//[stepper_1
struct stepper {
// Our operator
Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k,
const Real_t dt = ::dt, const Real_t dx = ::dx) {
return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
}

// do all the work on 'size' data points for 'nt' time steps
[[nodiscard]] std::vector<Real_t> do_work(const std::size_t size, const std::size_t nt, const int nthreads) {
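// Double-buffering: 'current' holds the state at step t, 'next' receives
// step t+1; the two vectors are swapped after every step.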
std::vector<Real_t> current(size);
std::vector<Real_t> next(size);

#pragma omp parallel for num_threads(nthreads)
for (std::size_t i = 0; i < size; ++i) {
current[i] = Real_t(i);
}

// Actual time step loop
for (std::size_t t = 0; t != nt; ++t) {
// OpenMP parallel for loop
#pragma omp parallel for num_threads(nthreads)
for (std::size_t i = 0; i < size; ++i) {
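// Periodic boundary conditions: the neighbour indices wrap around the ends
// of the domain, matching the CUDA kernel above.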
std::size_t left = (i == 0) ? size - 1 : i - 1;
std::size_t right = (i == size - 1) ? 0 : i + 1;
next[i] = heat(current[left], current[i], current[right], k, dt, dx);
}
std::swap(current, next);
}

return current;
}
};

///////////////////////////////////////////////////////////////////////////////
int benchmark(args_params_t const& args) {
std::uint64_t size = args.size; // Number of elements.
std::uint64_t nt = args.nt; // Number of steps.
int nthreads = args.nthreads;

// Create the stepper object
stepper step;

// Measure execution time.
Timer timer;

auto solution = step.do_work(size, nt, nthreads);
auto time = timer.stop();

// Print the final solution
if (args.results) {
for (const auto& ele: solution) {
std::cout << ele << " ";
}
std::cout << "\n";
}

if (args.time) {
std::cout << "Duration: " << time << " ms."
<< "\n";
}

return 0;
}

int main(int argc, char* argv[]) {
// parse params
args_params_t args = argparse::parse<args_params_t>(argc, argv);
// see if help wanted
if (args.help) {
args.print(); // prints all variables
return 0;
}

benchmark(args);

return 0;
}
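Usage note: the flag names come from the kwargs above; an invocation along the lines of `stencil_omp --size=100000 --nt=45 --nthreads=8` (exact flag syntax depends on the bundled argparse) exercises the parallel path, and adding `--results` prints the final grid.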