Skip to content

Commit

Permalink
Add device-based chain task benchmark
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph Schuchart <[email protected]>
  • Loading branch information
devreal committed Oct 1, 2023
1 parent 4f21ccd commit f91fa92
Show file tree
Hide file tree
Showing 5 changed files with 373 additions and 60 deletions.
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ if (TARGET tiledarray)
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")
endif()

add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
endif()

if (TARGET MADworld)
Expand Down Expand Up @@ -61,3 +63,4 @@ add_ttg_executable(sw sw/sw.cc)
if (TARGET MADworld)
add_ttg_executable(randomaccess randomaccess/randomaccess.cc RUNTIMES "mad")
endif (TARGET MADworld)

251 changes: 251 additions & 0 deletions examples/task-benchmarks/chain-ttg-dev.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
//#define TTG_USE_USER_TERMDET 1
#include "ttg.h"

#include "chrono.h"

#include <atomic>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <tuple>

// Pick the device execution space used by every device task in this file:
// CUDA if available, otherwise HIP. Building without either is a hard error.
#if defined(TTG_HAVE_CUDA)
#define ES ttg::ExecutionSpace::CUDA
#elif defined(TTG_HAVE_HIP)
#define ES ttg::ExecutionSpace::HIP
#else
#error "Either CUDA OR HIP is required to build this test!"
#endif // TTG_HAVE_CUDA / TTG_HAVE_HIP

// Chain bound: a task with key k forwards to key k+1 only while k < NUM_TASKS.
#define NUM_TASKS 10000

using namespace ttg;

// Global counter whose value is printed as "# of tasks" after each benchmark run.
std::atomic<int> task_counter = 0;

/// Value type passed along the benchmark chain: a single int together with a
/// ttg::buffer constructed over that int (pointer to `v`, element count 1).
struct A : public ttg::TTValue<A> {
  // TODO: allocate pinned memory
  int v = 0;           ///< host-side payload; must be declared before `b`, which is built from &v
  ttg::buffer<int> b;  ///< buffer constructed from (&v, 1)

  /// Default construction: zero payload, buffer built over this object's own `v`.
  A() : b(&v, 1) {}

  /// Copy construction: copies the payload and builds a fresh buffer over this
  /// object's own `v` (the source's buffer is not copied).
  A(const A& other) : v(other.v), b(&v, 1) {}

  // NOTE(review): the defaulted move constructor move-constructs `b` from the
  // source, whose buffer was built over the *source's* `v`. Whether the moved
  // buffer then still refers to the source's storage depends on ttg::buffer's
  // move semantics -- confirm this is intended.
  A(A&&) = default;

  /// Serialization is not supported for this type; any attempt aborts.
  template <typename Archive>
  void serialize(Archive&) {
    ttg_abort();
  }

  /// Versioned serialization overload; likewise aborts.
  template <typename Archive>
  void serialize(Archive&, const unsigned int) {
    ttg_abort();
  }
};

// Primary template: builds the (init, next) task pair for a chain benchmark
// with `num_flows` flows. It is only declared here; the explicit
// specializations in this file (0, 1, 2, 4 and 8 flows) provide the definitions.
// `do_move` selects whether values are moved or copied when forwarded.
template <int num_flows>
auto make_ttg(bool do_move);

// flows task ids via values
/// Builds the single-flow chain.
///
/// `init` bumps task_counter, prints a marker line and sends one A to key 0.
/// `next` is a device task (execution space ES) that awaits ttg::to_device on
/// the value's buffer and ttg::wait_kernel (no kernel is launched), then, while
/// key < NUM_TASKS, forwards the value to key+1 on its own input.
///
/// @param do_move if true the value is forwarded via std::move, otherwise it is
///                passed as an lvalue.
/// @return std::tuple of (init, next).
template <>
auto make_ttg<1>(bool do_move) {
  // I2N: init -> next, N2N: next -> next; both are fused into next's single input.
  Edge<int, A> I2N, N2N;

  auto init = make_tt<void>(
      []() {
        ++task_counter;
        std::cout << "init 1 " << std::endl;
        send<0>(0, A{});
      },
      edges(), edges(I2N));

  auto next = make_tt<ES, int>(
      [=](const int &key, auto &&value) -> ttg::device_task {
        // task_counter is not incremented here (the increment is left disabled),
        // so only the init task is counted.
        //++task_counter;
        co_await ttg::to_device(value.b);
        co_await ttg::wait_kernel();  // empty kernel
        if (key < NUM_TASKS) {
          if (do_move) {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, std::move(value)));
          } else {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, value));
          }
        }
      },
      edges(fuse(I2N, N2N)), edges(N2N));

  return std::make_tuple(std::move(init), std::move(next));
}

/// Builds the two-flow chain.
///
/// `init` sends one A to key 0 on each of its two outputs. `next` is a device
/// task (execution space ES) that awaits ttg::to_device on both buffers and
/// ttg::wait_kernel (no kernel is launched), then, while key < NUM_TASKS,
/// forwards both values to key+1.
///
/// @param do_move if true the values are forwarded via std::move, otherwise
///                they are passed as lvalues.
/// @return std::tuple of (init, next).
template <>
auto make_ttg<2>(bool do_move) {
  // I2N*: init -> next, N2N*: next -> next; fused pairwise into next's inputs.
  Edge<int, A> I2N1, I2N2;
  Edge<int, A> N2N1, N2N2;

  auto init = make_tt<void>(
      []() {
        send<0>(0, A{});
        send<1>(0, A{});
      },
      edges(), edges(I2N1, I2N2));

  auto next = make_tt<ES, int>(
      [=](const int &key, A &&v1, A &&v2) -> ttg::device_task {
        co_await ttg::to_device(v1.b, v2.b);
        co_await ttg::wait_kernel();  // empty kernel
        if (key < NUM_TASKS) {
          if (do_move) {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, std::move(v1)),
                                          ttg::device::send<1>(key + 1, std::move(v2)));
          } else {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, v1),
                                          ttg::device::send<1>(key + 1, v2));
          }
        }
      },
      edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2));

  return std::make_tuple(std::move(init), std::move(next));
}

/// Builds the four-flow chain.
///
/// `init` sends one A to key 0 on each of its four outputs. `next` is a device
/// task (execution space ES) that awaits ttg::to_device on all four buffers and
/// ttg::wait_kernel (no kernel is launched), then, while key < NUM_TASKS,
/// forwards all four values to key+1.
///
/// @param do_move if true the values are forwarded via std::move, otherwise
///                they are passed as lvalues.
/// @return std::tuple of (init, next).
template <>
auto make_ttg<4>(bool do_move) {
  // I2N*: init -> next, N2N*: next -> next; fused pairwise into next's inputs.
  Edge<int, A> I2N1, I2N2, I2N3, I2N4;
  Edge<int, A> N2N1, N2N2, N2N3, N2N4;

  auto init = make_tt<void>(
      []() {
        send<0>(0, A{});
        send<1>(0, A{});
        send<2>(0, A{});
        send<3>(0, A{});
      },
      edges(), edges(I2N1, I2N2, I2N3, I2N4));

  auto next = make_tt<ES, int>(
      [=](const int &key, A &&v1, A &&v2, A &&v3, A &&v4) -> ttg::device_task {
        co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b);
        co_await ttg::wait_kernel();  // empty kernel
        if (key < NUM_TASKS) {
          if (do_move) {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, std::move(v1)),
                                          ttg::device::send<1>(key + 1, std::move(v2)),
                                          ttg::device::send<2>(key + 1, std::move(v3)),
                                          ttg::device::send<3>(key + 1, std::move(v4)));
          } else {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, v1),
                                          ttg::device::send<1>(key + 1, v2),
                                          ttg::device::send<2>(key + 1, v3),
                                          ttg::device::send<3>(key + 1, v4));
          }
        }
      },
      edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4)),
      edges(N2N1, N2N2, N2N3, N2N4));

  return std::make_tuple(std::move(init), std::move(next));
}

/// Builds the eight-flow chain.
///
/// `init` sends one A to key 0 on each of its eight outputs. `next` is a device
/// task (execution space ES) that awaits ttg::to_device on all eight buffers
/// and ttg::wait_kernel (no kernel is launched), then, while key < NUM_TASKS,
/// forwards all eight values to key+1.
///
/// @param do_move if true the values are forwarded via std::move, otherwise
///                they are passed as lvalues.
/// @return std::tuple of (init, next).
template <>
auto make_ttg<8>(bool do_move) {
  // I2N*: init -> next, N2N*: next -> next; fused pairwise into next's inputs.
  Edge<int, A> I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8;
  Edge<int, A> N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8;

  auto init = make_tt<void>(
      []() {
        send<0>(0, A{});
        send<1>(0, A{});
        send<2>(0, A{});
        send<3>(0, A{});
        send<4>(0, A{});
        send<5>(0, A{});
        send<6>(0, A{});
        send<7>(0, A{});
      },
      edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));

  auto next = make_tt<ES, int>(
      [=](const int &key, auto &&v1, auto &&v2, auto &&v3, auto &&v4,
          auto &&v5, auto &&v6, auto &&v7, auto &&v8) -> ttg::device_task {
        co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b);
        co_await ttg::wait_kernel();  // empty kernel
        if (key < NUM_TASKS) {
          if (do_move) {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, std::move(v1)),
                                          ttg::device::send<1>(key + 1, std::move(v2)),
                                          ttg::device::send<2>(key + 1, std::move(v3)),
                                          ttg::device::send<3>(key + 1, std::move(v4)),
                                          ttg::device::send<4>(key + 1, std::move(v5)),
                                          ttg::device::send<5>(key + 1, std::move(v6)),
                                          ttg::device::send<6>(key + 1, std::move(v7)),
                                          ttg::device::send<7>(key + 1, std::move(v8)));
          } else {
            co_await ttg::device::forward(ttg::device::send<0>(key + 1, v1),
                                          ttg::device::send<1>(key + 1, v2),
                                          ttg::device::send<2>(key + 1, v3),
                                          ttg::device::send<3>(key + 1, v4),
                                          ttg::device::send<4>(key + 1, v5),
                                          ttg::device::send<5>(key + 1, v6),
                                          ttg::device::send<6>(key + 1, v7),
                                          ttg::device::send<7>(key + 1, v8));
          }
        }
      },
      edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4),
            fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)),
      edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8));

  return std::make_tuple(std::move(init), std::move(next));
}

// flows task ids via keys
/// Builds the value-free chain: only keys travel along the edges.
///
/// `init` sends key 0 on its single output. `next` is a device task (execution
/// space ES) that awaits ttg::to_device with no buffers and ttg::wait_kernel,
/// then, while key < NUM_TASKS, forwards key+1 to itself.
///
/// The bool parameter (do_move in the primary template) is unused here because
/// no values are transferred; it is left unnamed to make that explicit.
///
/// @return std::tuple of (init, next).
template <>
auto make_ttg<0>(bool /*do_move*/) {
  // I2N: init -> next, N2N: next -> next; both are fused into next's single input.
  Edge<int, void> I2N, N2N;

  auto init = make_tt<void>(
      [](std::tuple<Out<int, void>> &outs) { sendk<0>(0, outs); },
      edges(), edges(I2N));

  auto next = make_tt<ES>(
      [](const int &key) -> ttg::device_task {
        co_await ttg::to_device();
        co_await ttg::wait_kernel();
        if (key < NUM_TASKS) {
          co_await ttg::device::forward(ttg::device::sendk<0>(key + 1));
        }
      },
      edges(fuse(I2N, N2N)), edges(N2N));

  return std::make_tuple(std::move(init), std::move(next));
}

template<int num_flows>
void run_bench(bool do_move)
{
auto [init, next] = make_ttg<num_flows>(do_move);

auto connected = make_graph_executable(init.get());
assert(connected);
std::cout << "Graph " << num_flows << " is connected.\n";

auto t0 = now();
if (ttg::default_execution_context().rank() == 0) init->invoke();

ttg_execute(ttg_default_execution_context());
ttg_fence(ttg_default_execution_context());
auto t1 = now();

std::cout << "# of tasks = " << task_counter.load() << std::endl;
std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << std::endl;
}

/// Benchmark driver.
///
/// Usage: chain-ttg-dev [num_flows [do_move]]
///   num_flows  number of value flows; one of 0, 1, 2, 4, 8 (default 0)
///   do_move    non-zero to move values along the chain, 0 to copy (default 1)
///
/// Both arguments are parsed with std::atoi, so non-numeric input reads as 0.
///
/// @return EXIT_SUCCESS after a benchmark ran, EXIT_FAILURE if num_flows is not
///         one of the supported values.
int main(int argc, char* argv[]) {
  int num_flows = 0;
  int do_move = 1;
  ttg_initialize(argc, argv, -1);

  if (argc > 1) {
    num_flows = std::atoi(argv[1]);
  }

  if (argc > 2) {
    do_move = std::atoi(argv[2]);
  }

  // Make the int -> bool conversion explicit: any non-zero value means "move".
  const bool move_values = (do_move != 0);

  int status = EXIT_SUCCESS;
  switch (num_flows) {
    case 0: run_bench<0>(move_values); break;
    case 1: run_bench<1>(move_values); break;
    case 2: run_bench<2>(move_values); break;
    case 4: run_bench<4>(move_values); break;
    case 8: run_bench<8>(move_values); break;
    default:
      std::cout << "Unsupported number of flows: " << num_flows << std::endl;
      // Report the failure to the caller instead of exiting with status 0.
      status = EXIT_FAILURE;
      break;
  }

  // Always finalize the runtime, including on the error path.
  ttg_finalize();
  return status;
}

22 changes: 22 additions & 0 deletions examples/task-benchmarks/chrono.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// Created by Eduard Valeyev on 10/24/21.
//

#ifndef TEST_BENCHMARKS_CHRONO_H
#define TEST_BENCHMARKS_CHRONO_H

#include <chrono>
#include <cstdint>  // std::int64_t, used by duration_in_mus

/// Time point type used by the benchmark timing helpers below.
// NOTE(review): std::chrono::high_resolution_clock is not guaranteed to be
// steady; consider std::chrono::steady_clock if monotonic intervals are needed.
using time_point = std::chrono::high_resolution_clock::time_point;

/// @return the current time according to std::chrono::high_resolution_clock.
inline time_point now() { return std::chrono::high_resolution_clock::now(); }

/// @return the current time according to std::chrono::system_clock.
inline std::chrono::system_clock::time_point system_now() {
  return std::chrono::system_clock::now();
}

/// @param t0 start of the interval
/// @param t1 end of the interval
/// @return t1 - t0 in whole microseconds (duration_cast truncates toward zero);
///         negative if t1 precedes t0.
inline std::int64_t duration_in_mus(time_point const &t0, time_point const &t1) {
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

#endif // TEST_BENCHMARKS_CHRONO_H
Loading

0 comments on commit f91fa92

Please sign in to comment.