diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b2e398ddf..24b6f5897 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,6 +22,8 @@ if (TARGET tiledarray) COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") endif() + + add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec") endif() if (TARGET MADworld) @@ -61,3 +63,4 @@ add_ttg_executable(sw sw/sw.cc) if (TARGET MADworld) add_ttg_executable(randomaccess randomaccess/randomaccess.cc RUNTIMES "mad") endif (TARGET MADworld) + diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc new file mode 100644 index 000000000..5ccf1297d --- /dev/null +++ b/examples/task-benchmarks/chain-ttg-dev.cc @@ -0,0 +1,251 @@ +//#define TTG_USE_USER_TERMDET 1 +#include "ttg.h" + +#include "chrono.h" + +#if defined(TTG_HAVE_CUDA) +#define ES ttg::ExecutionSpace::CUDA +#elif defined(TTG_HAVE_HIP) +#define ES ttg::ExecutionSpace::HIP +#else +#error "Either CUDA OR HIP is required to build this test!" 
+#endif // TTG_HAVE_CUDA || TTG_HAVE_HIP + +#define NUM_TASKS 10000 + +using namespace ttg; + +std::atomic task_counter = 0; + +struct A : public ttg::TTValue { + // TODO: allocate pinned memory + int v = 0; + ttg::buffer b; + A() : b(&v, 1) { } // buffer wraps this object's own v (one element) + + A(A&& a) = default; // NOTE(review): the moved-in b was built over a.v, not this->v; confirm ttg::buffer move semantics keep it valid + A(const A& a) : v(a.v), b(&v, 1) { } // copy the value, then wrap this object's own v + + template + void serialize(Archive& ar) { + ttg_abort(); // serializing an A aborts + } + template + void serialize(Archive& ar, const unsigned int) { + ttg_abort(); // serializing an A aborts + } + +}; + +template +auto make_ttg(bool do_move); + +// flows task ids via values +template <> +auto make_ttg<1>(bool do_move) { + Edge I2N, N2N; + Edge N2S; + + auto init = make_tt( + []() { + ++task_counter; + std::cout << "init 1 " << std::endl; + send<0>(0, A{}); + }, edges(), edges(I2N)); + + auto next = make_tt([=](const int &key, auto&& value) -> ttg::device_task { + //++task_counter; + co_await ttg::to_device(value.b); + co_await ttg::wait_kernel(); // empty kernel + if (key < NUM_TASKS) { + if (do_move) { + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value))); + } else { + co_await ttg::device::forward(ttg::device::send<0>(key+1, value)); + } + } else { + } + } , edges(fuse(I2N, N2N)), edges(N2N)); + + return std::make_tuple(std::move(init), std::move(next)); +} + +template <> +auto make_ttg<2>(bool do_move) { + Edge I2N1, I2N2; + Edge N2N1, N2N2; + Edge N2S1, N2S2; + + auto init = make_tt([]() { + send<0>(0, A{}); + send<1>(0, A{}); + }, edges(), edges(I2N1, I2N2)); + + auto next = make_tt([=](const int &key, A&& v1, A&& v2) -> ttg::device_task { + co_await ttg::to_device(v1.b, v2.b); + co_await ttg::wait_kernel(); // empty kernel + if (key < NUM_TASKS) { + if (do_move) { + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2))); + } else { + co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), + ttg::device::send<1>(key+1, v2)); + } + } + } , edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2)); + + return 
std::make_tuple(std::move(init), std::move(next)); +} + +template <> +auto make_ttg<4>(bool do_move) { + Edge I2N1, I2N2, I2N3, I2N4; + Edge N2N1, N2N2, N2N3, N2N4; + Edge N2S1, N2S2, N2S3, N2S4; + + auto init = make_tt( + []() { + send<0>(0, A{}); + send<1>(0, A{}); + send<2>(0, A{}); + send<3>(0, A{}); + }, edges(), edges(I2N1, I2N2, I2N3, I2N4)); + + auto next = make_tt([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device_task { + co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b); + co_await ttg::wait_kernel(); // empty kernel + if (key < NUM_TASKS) { + if (do_move) { + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2)), + ttg::device::send<2>(key+1, std::move(v3)), + ttg::device::send<3>(key+1, std::move(v4))); + } else { + co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), + ttg::device::send<1>(key+1, v2), + ttg::device::send<2>(key+1, v3), + ttg::device::send<3>(key+1, v4)); + } + } + }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), + fuse(I2N3, N2N3), fuse(I2N4, N2N4)), + edges(N2N1, N2N2, N2N3, N2N4)); + + return std::make_tuple(std::move(init), std::move(next)); +} + +template <> +auto make_ttg<8>(bool do_move) { + Edge I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8; + Edge N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8; + Edge N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8; + + auto init = make_tt( + []() { + send<0>(0, A{}); + send<1>(0, A{}); + send<2>(0, A{}); + send<3>(0, A{}); + send<4>(0, A{}); + send<5>(0, A{}); + send<6>(0, A{}); + send<7>(0, A{}); + }, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8)); + + auto next = make_tt([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device_task { + co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b); + co_await ttg::wait_kernel(); // empty kernel + if (key < NUM_TASKS) { + if (do_move) { + co_await 
ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2)), + ttg::device::send<2>(key+1, std::move(v3)), + ttg::device::send<3>(key+1, std::move(v4)), + ttg::device::send<4>(key+1, std::move(v5)), + ttg::device::send<5>(key+1, std::move(v6)), + ttg::device::send<6>(key+1, std::move(v7)), + ttg::device::send<7>(key+1, std::move(v8))); + } else { + co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), + ttg::device::send<1>(key+1, v2), + ttg::device::send<2>(key+1, v3), + ttg::device::send<3>(key+1, v4), + ttg::device::send<4>(key+1, v5), + ttg::device::send<5>(key+1, v6), + ttg::device::send<6>(key+1, v7), + ttg::device::send<7>(key+1, v8)); + } + } + }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4), fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)), + edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8)); + + return std::make_tuple(std::move(init), std::move(next)); +} + +// flows task ids via keys +template <> +auto make_ttg<0>(bool do_move) { + Edge I2N, N2N; + Edge N2S; + + auto init = make_tt([](std::tuple> &outs) { sendk<0>(0, outs); }, edges(), edges(I2N)); + + auto next = make_tt([](const int& key) -> ttg::device_task { + co_await ttg::to_device(); + co_await ttg::wait_kernel(); + if (key < NUM_TASKS) { + co_await ttg::device::forward(ttg::device::sendk<0>(key+1)); + } + }, edges(fuse(I2N, N2N)), edges(N2N)); + + return std::make_tuple(std::move(init), std::move(next)); +} + +template +void run_bench(bool do_move) +{ + auto [init, next] = make_ttg(do_move); + + auto connected = make_graph_executable(init.get()); + assert(connected); + std::cout << "Graph " << num_flows << " is connected.\n"; + + auto t0 = now(); + if (ttg::default_execution_context().rank() == 0) init->invoke(); + + ttg_execute(ttg_default_execution_context()); + ttg_fence(ttg_default_execution_context()); + auto t1 = now(); + + std::cout << "# of tasks = " << 
task_counter.load() << std::endl; + std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << std::endl; +} + +int main(int argc, char* argv[]) { + + int num_flows = 0; + int do_move = 1; + ttg_initialize(argc, argv, -1); + + if (argc > 1) { + num_flows = std::atoi(argv[1]); + } + + if (argc > 2) { + do_move = std::atoi(argv[2]); + } + + switch(num_flows) { + case 0: run_bench<0>(do_move); break; + case 1: run_bench<1>(do_move); break; + case 2: run_bench<2>(do_move); break; + case 4: run_bench<4>(do_move); break; + case 8: run_bench<8>(do_move); break; + default: std::cout << "Unsupported number of flows: " << num_flows << std::endl; + } + + ttg_finalize(); + return 0; +} + diff --git a/examples/task-benchmarks/chrono.h b/examples/task-benchmarks/chrono.h new file mode 100644 index 000000000..358d6dcc4 --- /dev/null +++ b/examples/task-benchmarks/chrono.h @@ -0,0 +1,22 @@ +// +// Created by Eduard Valeyev on 10/24/21. +// + +#ifndef TEST_BENCHMARKS_CHRONO_H +#define TEST_BENCHMARKS_CHRONO_H + +#include + +using time_point = std::chrono::high_resolution_clock::time_point; + +inline time_point now() { return std::chrono::high_resolution_clock::now(); } + +inline std::chrono::system_clock::time_point system_now() { + return std::chrono::system_clock::now(); +} + +inline int64_t duration_in_mus(time_point const &t0, time_point const &t1) { + return std::chrono::duration_cast(t1 - t0).count(); +} + +#endif // TEST_BENCHMARKS_CHRONO_H diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index ef52228b8..5dd18402d 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -165,49 +165,50 @@ class CallableWrapTTArgs protected: - /// @return coroutine handle<> (if funcT is a coroutine), else void - template - auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { - using func_args_t = ttg::meta::tuple_concat_t, input_refs_tuple_type, output_edges_type>; - - auto process_return = [&out](auto &&ret) { - 
static_assert(std::is_same_v, returnT>, - "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); - if constexpr (!std::is_void_v) { // protect from compiling for void returnT + template + auto process_return(ReturnT&& ret, output_terminalsT &out) { + static_assert(std::is_same_v, returnT>, + "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); + if constexpr (!std::is_void_v) { // protect from compiling for void returnT #ifdef TTG_HAS_COROUTINE - if constexpr (std::is_same_v) { - ttg::coroutine_handle<> coro_handle; - // if task completed destroy it - if (ret.completed()) { - ret.destroy(); - } else { // if task is suspended return the coroutine promise ptr - coro_handle = ret; - } - return coro_handle; - } else if constexpr (std::is_same_v) { - ttg::device_task::base_type coro_handle = ret; - return coro_handle; + if constexpr (std::is_same_v) { + ttg::coroutine_handle<> coro_handle; + // if task completed destroy it + if (ret.completed()) { + ret.destroy(); + } else { // if task is suspended return the coroutine promise ptr + coro_handle = ret; } - if constexpr (!(std::is_same_v || std::is_same_v)) + return coro_handle; + } else if constexpr (std::is_same_v) { + ttg::device_task::base_type coro_handle = ret; + return coro_handle; + } + if constexpr (!(std::is_same_v || std::is_same_v)) #endif - { - static_assert(std::tuple_size_v> == 1, - "CallableWrapTTArgs <= 2, - "CallableWrapTTArgs == 0) - std::get<0>(out).sendv(std::move(ret)); - else if constexpr (std::tuple_size_v == 1) - std::get<0>(out).sendk(std::move(std::get<0>(ret))); - else if constexpr (std::tuple_size_v == 2) - std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret))); - return; - } + { + static_assert(std::tuple_size_v> == 1, + "CallableWrapTTArgs <= 2, + "CallableWrapTTArgs == 0) + std::get<0>(out).sendv(std::move(ret)); + else if constexpr (std::tuple_size_v == 1) + std::get<0>(out).sendk(std::move(std::get<0>(ret))); + 
else if constexpr (std::tuple_size_v == 2) + std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret))); + return; } - }; + } + } + + /// @return coroutine handle<> (if funcT is a coroutine), else void + template + auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + using func_args_t = ttg::meta::tuple_concat_t, input_refs_tuple_type, output_edges_type>; if constexpr (funcT_receives_outterm_tuple) { if constexpr (std::is_void_v) { @@ -219,7 +220,7 @@ class CallableWrapTTArgs std::forward(key), baseT::template get>(std::forward(args_tuple))..., out); - return process_return(std::move(ret)); + return process_return(std::move(ret), out); } } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); @@ -234,45 +235,78 @@ class CallableWrapTTArgs func(std::forward(key), baseT::template get>(std::forward(args_tuple))...); this->set_outputs_tls_ptr(old_output_tls_ptr); - return process_return(std::move(ret)); + return process_return(std::move(ret), out); } } } template - void call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + auto call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { using func_args_t = ttg::meta::tuple_concat_t; - if constexpr (funcT_receives_outterm_tuple) - func(baseT::template get>(std::forward(args_tuple))..., out); - else { + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(baseT::template get>(std::forward(args_tuple))..., out); + } else { + auto ret = func(baseT::template get>(std::forward(args_tuple))..., out); + return process_return(std::move(ret), out); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(baseT::template get>(std::forward(args_tuple))...); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(baseT::template get>(std::forward(args_tuple))...); + 
this->set_outputs_tls_ptr(old_output_tls_ptr); + } else { + auto ret = func(baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret), out); + } } } template - void call_func(Key &&key, output_terminalsT &out) { - if constexpr (funcT_receives_outterm_tuple) - func(std::forward(key), out); - else { + auto call_func(Key &&key, output_terminalsT &out) { + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(std::forward(key), out); + } else { + auto ret = func(std::forward(key), out); + return process_return(std::move(ret), out); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(std::forward(key)); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(std::forward(key)); + this->set_outputs_tls_ptr(old_output_tls_ptr); + } else { + auto ret = func(std::forward(key)); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret), out); + } } } template - void call_func(OutputTerminals &out) { - if constexpr (funcT_receives_outterm_tuple) - func(out); - else { + auto call_func(OutputTerminals &out) { + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(out); + } else { + auto ret = func(out); + return process_return(std::move(ret), out); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(); + this->set_outputs_tls_ptr(old_output_tls_ptr); + } else { + auto ret = func(); /* func does not take the output-terminal tuple in this branch (the void sibling above calls func() too), so it must be invoked without out */ + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret), out); + } } } diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 773218e4d..d250e060e 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ 
-78,6 +78,7 @@ namespace ttg_parsec { * is current on the target device, false if transfers are required. */ template inline bool register_device_memory(std::tuple &views) { + bool is_current = true; if (nullptr == detail::parsec_ttg_caller) { throw std::runtime_error("register_device_memory may only be invoked from inside a task!"); } @@ -86,7 +87,9 @@ namespace ttg_parsec { throw std::runtime_error("register_device_memory called inside a non-gpu task!"); } - bool is_current = detail::register_device_memory(views, std::index_sequence_for{}); + if constexpr (sizeof...(Views) > 0) { + is_current = detail::register_device_memory(views, std::index_sequence_for{}); + } /* reset all entries in the current task */ for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {