diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b2e398ddf..24b6f5897 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -22,6 +22,8 @@ if (TARGET tiledarray)
                         COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
                         RUNTIMES "parsec")
     endif()
+
+    add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
 endif()
 
 if (TARGET MADworld)
@@ -61,3 +63,4 @@ add_ttg_executable(sw sw/sw.cc)
 if (TARGET MADworld)
   add_ttg_executable(randomaccess randomaccess/randomaccess.cc RUNTIMES "mad")
 endif (TARGET MADworld)
+
diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc
new file mode 100644
index 000000000..5ccf1297d
--- /dev/null
+++ b/examples/task-benchmarks/chain-ttg-dev.cc
@@ -0,0 +1,251 @@
+//#define TTG_USE_USER_TERMDET 1
+#include "ttg.h"
+
+#include "chrono.h"
+
+#if defined(TTG_HAVE_CUDA)
+#define ES ttg::ExecutionSpace::CUDA
+#elif defined(TTG_HAVE_HIP)
+#define ES ttg::ExecutionSpace::HIP
+#else
+#error "Either CUDA OR HIP is required to build this test!"
+#endif // TTG_HAVE_CUDA / TTG_HAVE_HIP
+
+#define NUM_TASKS 10000
+
+using namespace ttg;
+
+std::atomic<int> task_counter = 0;
+
+struct A : public ttg::TTValue<A> {  // value type passed from task to task in this benchmark
+  // TODO: allocate pinned memory
+  int v = 0;           // the single int of payload; `b` is constructed over its address
+  ttg::buffer<int> b;  // built from (&v, 1) by the default and copy constructors
+  A() : b(&v, 1) { }
+
+  A(A&& a) = default;  // NOTE(review): memberwise move of `b`; confirm it does not keep referring to a.v
+  A(const A& a) : v(a.v), b(&v, 1) { }  // copies the int and builds a fresh `b` over this object's own v
+
+  template <typename Archive>
+  void serialize(Archive& ar) {  // both serialize overloads do nothing but call ttg_abort()
+    ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg_abort();
+  }
+
+};
+
+template <int num_flows>  // num_flows: how many A values each task receives and forwards (0 = keys only)
+auto make_ttg(bool do_move);  // do_move: true forwards values with std::move, false passes them as lvalues
+
+// flows task ids via values
+template <>
+auto make_ttg<1>(bool do_move) {  // one-flow chain: returns the tuple (init, next)
+  Edge<int, A> I2N, N2N;  // I2N: init -> next, N2N: next -> next
+  Edge<void, A> N2S;  // NOTE(review): never used below
+
+  auto init = make_tt<void>(
+    []() {  // keyless seed task: sends one A with key 0
+      ++task_counter;  // the only live increment of task_counter in this file
+      std::cout << "init 1 " << std::endl;
+      send<0>(0, A{});
+    }, edges(), edges(I2N));
+
+  auto next = make_tt<ES, int>([=](const int &key, auto&& value) -> ttg::device_task {
+    //++task_counter;
+    co_await ttg::to_device(value.b);
+    co_await ttg::wait_kernel(); // empty kernel
+    if (key < NUM_TASKS) {  // keys 0..NUM_TASKS-1 send to key+1; key == NUM_TASKS sends nothing
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, value));
+      }
+    } else {
+    }
+  } , edges(fuse(I2N, N2N)), edges(N2N));  // input is fuse(I2N, N2N); output goes back onto N2N
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<2>(bool do_move) {  // two-flow chain: returns the tuple (init, next)
+  Edge<int, A> I2N1, I2N2;  // init -> next, one edge per flow
+  Edge<int, A> N2N1, N2N2;  // next -> next, one edge per flow
+  Edge<void, A> N2S1, N2S2;  // NOTE(review): never used below
+
+  auto init = make_tt<void>([]() {  // keyless seed task: one A per flow, both with key 0
+    send<0>(0, A{});
+    send<1>(0, A{});
+  }, edges(), edges(I2N1, I2N2));
+
+  auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2) -> ttg::device_task {
+    co_await ttg::to_device(v1.b, v2.b);
+    co_await ttg::wait_kernel(); // empty kernel
+    if (key < NUM_TASKS) {  // key == NUM_TASKS sends nothing, which ends the chain
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2));
+      }
+    }
+  } , edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2));  // each input is fuse(I2Nk, N2Nk)
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<4>(bool do_move) {  // four-flow chain: returns the tuple (init, next)
+  Edge<int, A> I2N1, I2N2, I2N3, I2N4;  // init -> next, one edge per flow
+  Edge<int, A> N2N1, N2N2, N2N3, N2N4;  // next -> next, one edge per flow
+  Edge<void, A> N2S1, N2S2, N2S3, N2S4;  // NOTE(review): never used below
+
+  auto init = make_tt<void>(
+    []() {  // keyless seed task: one A per flow, all with key 0
+      send<0>(0, A{});
+      send<1>(0, A{});
+      send<2>(0, A{});
+      send<3>(0, A{});
+    }, edges(), edges(I2N1, I2N2, I2N3, I2N4));
+
+  auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device_task {
+    co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b);
+    co_await ttg::wait_kernel(); // empty kernel
+    if (key < NUM_TASKS) {  // key == NUM_TASKS sends nothing, which ends the chain
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)),
+                                      ttg::device::send<2>(key+1, std::move(v3)),
+                                      ttg::device::send<3>(key+1, std::move(v4)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2),
+                                      ttg::device::send<2>(key+1, v3),
+                                      ttg::device::send<3>(key+1, v4));
+      }
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2),  // each input is fuse(I2Nk, N2Nk)
+           fuse(I2N3, N2N3), fuse(I2N4, N2N4)),
+     edges(N2N1, N2N2, N2N3, N2N4));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<8>(bool do_move) {  // eight-flow chain: returns the tuple (init, next)
+  Edge<int, A> I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8;  // init -> next, one edge per flow
+  Edge<int, A> N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8;  // next -> next, one edge per flow
+  Edge<void, A> N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8;  // NOTE(review): never used below
+
+  auto init = make_tt<void>(
+    []() {  // keyless seed task: one A per flow, all with key 0
+      send<0>(0, A{});
+      send<1>(0, A{});
+      send<2>(0, A{});
+      send<3>(0, A{});
+      send<4>(0, A{});
+      send<5>(0, A{});
+      send<6>(0, A{});
+      send<7>(0, A{});
+    }, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));
+
+  auto next = make_tt<ES, int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device_task {
+    co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b);  // NOTE(review): takes auto&& here but A&& in the 2- and 4-flow variants
+    co_await ttg::wait_kernel(); // empty kernel
+    if (key < NUM_TASKS) {  // key == NUM_TASKS sends nothing, which ends the chain
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)),
+                                      ttg::device::send<2>(key+1, std::move(v3)),
+                                      ttg::device::send<3>(key+1, std::move(v4)),
+                                      ttg::device::send<4>(key+1, std::move(v5)),
+                                      ttg::device::send<5>(key+1, std::move(v6)),
+                                      ttg::device::send<6>(key+1, std::move(v7)),
+                                      ttg::device::send<7>(key+1, std::move(v8)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2),
+                                      ttg::device::send<2>(key+1, v3),
+                                      ttg::device::send<3>(key+1, v4),
+                                      ttg::device::send<4>(key+1, v5),
+                                      ttg::device::send<5>(key+1, v6),
+                                      ttg::device::send<6>(key+1, v7),
+                                      ttg::device::send<7>(key+1, v8));
+      }
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4), fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)),
+     edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+// flows task ids via keys
+template <>
+auto make_ttg<0>(bool do_move) {  // key-only chain (no values); do_move is not used by this variant
+  Edge<int, void> I2N, N2N;  // I2N: init -> next, N2N: next -> next
+  Edge<void, int> N2S;  // NOTE(review): never used below
+
+  auto init = make_tt<void>([](std::tuple<Out<int, void>> &outs) { sendk<0>(0, outs); }, edges(), edges(I2N));
+
+  auto next = make_tt<ES>([](const int& key) -> ttg::device_task {
+    co_await ttg::to_device();  // called with no buffers: this variant carries no data
+    co_await ttg::wait_kernel();
+    if (key < NUM_TASKS) {  // key == NUM_TASKS sends nothing, which ends the chain
+      co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
+    }
+  }, edges(fuse(I2N, N2N)), edges(N2N));  // input is fuse(I2N, N2N); output goes back onto N2N
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template<int num_flows>
+void run_bench(bool do_move)  // builds the num_flows graph, runs it once and prints the elapsed time
+{
+  auto [init, next] = make_ttg<num_flows>(do_move);
+
+  auto connected = make_graph_executable(init.get());
+  assert(connected);  // NOTE(review): compiled out under NDEBUG, yet the message below is printed regardless
+  std::cout << "Graph " << num_flows << " is connected.\n";
+
+  auto t0 = now();  // timed region: seeding on rank 0, execute and fence
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+  auto t1 = now();
+
+  std::cout << "# of tasks = " << task_counter.load() << std::endl;  // NOTE(review): only make_ttg<1>'s init increments task_counter
+  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << std::endl;
+}
+
+int main(int argc, char* argv[]) {  // usage: <exe> [num_flows] [do_move]; defaults: 0 flows, do_move = 1
+  int status = 0;  // exit code; becomes 1 when num_flows is not one of 0, 1, 2, 4, 8
+  int num_flows = 0;
+  int do_move = 1;
+  ttg_initialize(argc, argv, -1);
+
+  if (argc > 1) {
+    num_flows = std::atoi(argv[1]);  // non-numeric text parses as 0, i.e. the key-only run
+  }
+
+  if (argc > 2) {
+    do_move = std::atoi(argv[2]);  // any non-zero value converts to do_move == true
+  }
+
+  switch(num_flows) {
+  case 0: run_bench<0>(do_move); break;
+  case 1: run_bench<1>(do_move); break;
+  case 2: run_bench<2>(do_move); break;
+  case 4: run_bench<4>(do_move); break;
+  case 8: run_bench<8>(do_move); break;
+  default: std::cout << "Unsupported number of flows: " << num_flows << std::endl; status = 1;
+  }
+
+  ttg_finalize();  // reached on the error path as well
+  return status;
+}
+
diff --git a/examples/task-benchmarks/chrono.h b/examples/task-benchmarks/chrono.h
new file mode 100644
index 000000000..358d6dcc4
--- /dev/null
+++ b/examples/task-benchmarks/chrono.h
@@ -0,0 +1,22 @@
+//
+// Created by Eduard Valeyev on 10/24/21.
+//
+
+#ifndef TEST_BENCHMARKS_CHRONO_H
+#define TEST_BENCHMARKS_CHRONO_H
+
+#include <chrono>
+
+using time_point = std::chrono::high_resolution_clock::time_point;
+
+inline time_point now() { return std::chrono::high_resolution_clock::now(); }  // current high_resolution_clock reading
+
+inline std::chrono::system_clock::time_point system_now() {  // current system_clock reading
+  return std::chrono::system_clock::now();
+}
+
+inline int64_t duration_in_mus(time_point const &t0, time_point const &t1) {  // t1 - t0 as a whole number of microseconds
+  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();  // NOTE(review): int64_t is used without including <cstdint>
+}
+
+#endif // TEST_BENCHMARKS_CHRONO_H
diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h
index ef52228b8..5dd18402d 100644
--- a/ttg/ttg/make_tt.h
+++ b/ttg/ttg/make_tt.h
@@ -165,49 +165,50 @@ class CallableWrapTTArgs
 
 protected:
 
-  /// @return coroutine handle<> (if funcT is a coroutine), else void
-  template <typename Key, typename Tuple, std::size_t... S>
-  auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence<S...>) {
-    using func_args_t = ttg::meta::tuple_concat_t<std::tuple<const Key &>, input_refs_tuple_type, output_edges_type>;
-
-    auto process_return = [&out](auto &&ret) {
-      static_assert(std::is_same_v<std::remove_reference_t<decltype(ret)>, returnT>,
-                    "CallableWrapTTArgs<funcT,returnT,...>: returnT does not match the actual return type of funcT");
-      if constexpr (!std::is_void_v<returnT>) {  // protect from compiling for void returnT
+  template<typename ReturnT>
+  auto process_return(ReturnT&& ret, output_terminalsT &out) {  // turns funcT's return value into a coroutine handle or sends it via out
+    static_assert(std::is_same_v<std::remove_reference_t<decltype(ret)>, returnT>,
+                  "CallableWrapTTArgs<funcT,returnT,...>: returnT does not match the actual return type of funcT");
+    if constexpr (!std::is_void_v<returnT>) {  // protect from compiling for void returnT
 #ifdef TTG_HAS_COROUTINE
-        if constexpr (std::is_same_v<returnT, ttg::resumable_task>) {
-          ttg::coroutine_handle<> coro_handle;
-          // if task completed destroy it
-          if (ret.completed()) {
-            ret.destroy();
-          } else {  // if task is suspended return the coroutine promise ptr
-            coro_handle = ret;
-          }
-          return coro_handle;
-        } else if constexpr (std::is_same_v<returnT, ttg::device_task>) {
-          ttg::device_task::base_type coro_handle = ret;
-          return coro_handle;
+      if constexpr (std::is_same_v<returnT, ttg::resumable_task>) {
+        ttg::coroutine_handle<> coro_handle;
+        // if task completed destroy it
+        if (ret.completed()) {
+          ret.destroy();
+        } else {  // if task is suspended return the coroutine promise ptr
+          coro_handle = ret;
         }
-        if constexpr (!(std::is_same_v<returnT, ttg::resumable_task> || std::is_same_v<returnT, ttg::device_task>))
+        return coro_handle;  // left default-constructed when the task had already completed
+      } else if constexpr (std::is_same_v<returnT, ttg::device_task>) {
+        ttg::device_task::base_type coro_handle = ret;  // device tasks are handed back without a completion check
+        return coro_handle;
+      }
+      if constexpr (!(std::is_same_v<returnT, ttg::resumable_task> || std::is_same_v<returnT, ttg::device_task>))
 #endif
-        {
-          static_assert(std::tuple_size_v<std::remove_reference_t<decltype(out)>> == 1,
-                        "CallableWrapTTArgs<funcT,returnT,funcT_receives_outterm_tuple=true,...): funcT can return a "
-                        "value only if there is only 1 out terminal");
-          static_assert(std::tuple_size_v<returnT> <= 2,
-                        "CallableWrapTTArgs<funcT,returnT,funcT_receives_outterm_tuple=true,...): funcT can return a "
-                        "value only if it is a plain value (then sent with null key), a tuple-like containing a single "
-                        "key (hence value is void), or a tuple-like containing a key and a value");
-          if constexpr (std::tuple_size_v<returnT> == 0)
-            std::get<0>(out).sendv(std::move(ret));
-          else if constexpr (std::tuple_size_v<returnT> == 1)
-            std::get<0>(out).sendk(std::move(std::get<0>(ret)));
-          else if constexpr (std::tuple_size_v<returnT> == 2)
-            std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret)));
-          return;
-        }
+      {  // non-coroutine return value: sent through the first (and only) output terminal
+        static_assert(std::tuple_size_v<std::remove_reference_t<decltype(out)>> == 1,
+                      "CallableWrapTTArgs<funcT,returnT,funcT_receives_outterm_tuple=true,...): funcT can return a "
+                      "value only if there is only 1 out terminal");
+        static_assert(std::tuple_size_v<returnT> <= 2,
+                      "CallableWrapTTArgs<funcT,returnT,funcT_receives_outterm_tuple=true,...): funcT can return a "
+                      "value only if it is a plain value (then sent with null key), a tuple-like containing a single "
+                      "key (hence value is void), or a tuple-like containing a key and a value");
+        if constexpr (std::tuple_size_v<returnT> == 0)
+          std::get<0>(out).sendv(std::move(ret));
+        else if constexpr (std::tuple_size_v<returnT> == 1)
+          std::get<0>(out).sendk(std::move(std::get<0>(ret)));
+        else if constexpr (std::tuple_size_v<returnT> == 2)
+          std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret)));
+        return;
       }
-    };
+    }
+  }
+
+  /// @return coroutine handle<> (if funcT is a coroutine), else void
+  template <typename Key, typename Tuple, std::size_t... S>
+  auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence<S...>) {
+    using func_args_t = ttg::meta::tuple_concat_t<std::tuple<const Key &>, input_refs_tuple_type, output_edges_type>;
 
     if constexpr (funcT_receives_outterm_tuple) {
       if constexpr (std::is_void_v<returnT>) {
@@ -219,7 +220,7 @@ class CallableWrapTTArgs
             std::forward<Key>(key),
             baseT::template get<S, std::tuple_element_t<S + 1, func_args_t>>(std::forward<Tuple>(args_tuple))..., out);
 
-        return process_return(std::move(ret));
+        return process_return(std::move(ret), out);
       }
     } else {
       auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
@@ -234,45 +235,78 @@ class CallableWrapTTArgs
             func(std::forward<Key>(key),
                  baseT::template get<S, std::tuple_element_t<S + 1, func_args_t>>(std::forward<Tuple>(args_tuple))...);
         this->set_outputs_tls_ptr(old_output_tls_ptr);
-        return process_return(std::move(ret));
+        return process_return(std::move(ret), out);
       }
     }
   }
 
   template <typename Tuple, std::size_t... S>
-  void call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence<S...>) {
+  auto call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence<S...>) {  // keyless call with unpacked args_tuple
     using func_args_t = ttg::meta::tuple_concat_t<input_refs_tuple_type, output_edges_type>;
-    if constexpr (funcT_receives_outterm_tuple)
-      func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))..., out);
-    else {
+    if constexpr (funcT_receives_outterm_tuple) {
+      if constexpr (std::is_void_v<returnT>) {
+        func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))..., out);
+      } else {
+        auto ret = func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))..., out);
+        return process_return(std::move(ret), out);  // non-void result is routed through process_return
+      }
+    } else {
       auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
       this->set_outputs_tls_ptr();
-      func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))...);
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v<returnT>) {
+        func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))...);
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func(baseT::template get<S, std::tuple_element_t<S, func_args_t>>(std::forward<Tuple>(args_tuple))...);
+        this->set_outputs_tls_ptr(old_output_tls_ptr);  // saved pointer is put back before the result is processed
+        return process_return(std::move(ret), out);
+      }
     }
   }
 
   template <typename Key>
-  void call_func(Key &&key, output_terminalsT &out) {
-    if constexpr (funcT_receives_outterm_tuple)
-      func(std::forward<Key>(key), out);
-    else {
+  auto call_func(Key &&key, output_terminalsT &out) {  // keyed call without input values
+    if constexpr (funcT_receives_outterm_tuple) {
+      if constexpr (std::is_void_v<returnT>) {
+        func(std::forward<Key>(key), out);
+      } else {
+        auto ret = func(std::forward<Key>(key), out);
+        return process_return(std::move(ret), out);  // non-void result is routed through process_return
+      }
+    } else {
       auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
       this->set_outputs_tls_ptr();
-      func(std::forward<Key>(key));
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v<returnT>) {
+        func(std::forward<Key>(key));
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func(std::forward<Key>(key));
+        this->set_outputs_tls_ptr(old_output_tls_ptr);  // saved pointer is put back before the result is processed
+        return process_return(std::move(ret), out);
+      }
     }
   }
 
   template <typename OutputTerminals>
-  void call_func(OutputTerminals &out) {
-    if constexpr (funcT_receives_outterm_tuple)
-      func(out);
-    else {
+  auto call_func(OutputTerminals &out) {  // keyless call without input values
+    if constexpr (funcT_receives_outterm_tuple) {
+      if constexpr (std::is_void_v<returnT>) {
+        func(out);
+      } else {
+        auto ret = func(out);
+        return process_return(std::move(ret), out);  // non-void result is routed through process_return
+      }
+    } else {
       auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
       this->set_outputs_tls_ptr();
-      func();
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v<returnT>) {
+        func();
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func();  // funcT takes no output-terminal tuple in this branch, so it is called without out
+        this->set_outputs_tls_ptr(old_output_tls_ptr);  // saved pointer is put back before the result is processed
+        return process_return(std::move(ret), out);
+      }
     }
   }
 
diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h
index 773218e4d..d250e060e 100644
--- a/ttg/ttg/parsec/devicefunc.h
+++ b/ttg/ttg/parsec/devicefunc.h
@@ -78,6 +78,7 @@ namespace ttg_parsec {
    * is current on the target device, false if transfers are required. */
   template<typename... Views>
   inline bool register_device_memory(std::tuple<Views&...> &views) {
+    bool is_current = true;
     if (nullptr == detail::parsec_ttg_caller) {
       throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
     }
@@ -86,7 +87,9 @@ namespace ttg_parsec {
       throw std::runtime_error("register_device_memory called inside a non-gpu task!");
     }
 
-    bool is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
+    if constexpr (sizeof...(Views) > 0) {
+      is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
+    }
 
     /* reset all entries in the current task */
     for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {