From f9971f5b66c7a7095f17dcfa7df83fa90dcb6657 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 5 Sep 2024 09:43:59 -0400
Subject: [PATCH] First dips at moving execution space selection to
 ttg::device::Task

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/CMakeLists.txt                   |  20 +-
 examples/potrf/potrf.h                    | 107 +++++----
 examples/task-benchmarks/chain-ttg-dev.cc |  25 +-
 examples/task-benchmarks/chain-ttg.cc     | 267 ++++++++++++++++++++++
 tests/unit/cuda_kernel.cu                 |   5 +-
 tests/unit/cuda_kernel.h                  |   3 +-
 tests/unit/device_coro.cc                 | 143 ++++++------
 ttg/ttg/coroutine.h                       |   1 +
 ttg/ttg/device/task.h                     | 186 ++++++---------
 ttg/ttg/madness/ttg.h                     |  10 +-
 ttg/ttg/make_tt.h                         |  91 ++++----
 ttg/ttg/parsec/ttg.h                      | 118 +++++-----
 ttg/ttg/tt.h                              |  46 ++--
 13 files changed, 648 insertions(+), 374 deletions(-)
 create mode 100644 examples/task-benchmarks/chain-ttg.cc

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c34e3c07e..a6c97183a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,6 +15,9 @@ if (TARGET tiledarray)
                                           COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2)
 
     add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES tiledarray lapackpp)
+    add_ttg_executable(testing_dpotrf_host potrf/testing_dpotrf.cc
+                       LINK_LIBRARIES tiledarray lapackpp
+                       COMPILE_DEFINITIONS TTG_ENABLE_DEV_HOST=1)
     add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES tiledarray lapackpp)
     add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES tiledarray lapackpp)
     add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)
@@ -50,14 +53,27 @@ if (TARGET tiledarray)
     endif()
 
     if (TTG_HAVE_CUDA)
-      add_ttg_executable(chain-ttg-cuda task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
+      add_ttg_executable(chain-ttg-dev-cuda task-benchmarks/chain-ttg-dev.cc
+                         COMPILE_DEFINITIONS CHAIN_CUDA=1
+                         LINK_LIBRARIES tiledarray
+                         RUNTIMES "parsec")
     endif(TTG_HAVE_CUDA)
 
     if (TTG_HAVE_HIP)
-      add_ttg_executable(chain-ttg-hip task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
+      add_ttg_executable(chain-ttg-dev-hip task-benchmarks/chain-ttg-dev.cc
+                         COMPILE_DEFINITIONS CHAIN_HIP=1
+                         LINK_LIBRARIES tiledarray
+                         RUNTIMES "parsec")
     endif(TTG_HAVE_HIP)
 endif()
 
+add_ttg_executable(chain-ttg-host task-benchmarks/chain-ttg.cc)
+
+add_ttg_executable(chain-ttg-dev-host task-benchmarks/chain-ttg-dev.cc
+                   COMPILE_DEFINITIONS CHAIN_HOST=1
+                   LINK_LIBRARIES tiledarray
+                   RUNTIMES "parsec")
+                    
 if (TARGET MADworld)
   add_ttg_executable(madness-1d madness/madness-1d/madness-1d.cc RUNTIMES "mad")
   if (TARGET blaspp) #(CBLAS_FOUND AND MKL_FOUND)
diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index d3a92cb2f..894064b54 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -7,22 +7,19 @@
 #include "util.h"
 #include "../devblas_helper.h"
 
-#if (defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP))
+#if (defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP) || defined(TTG_ENABLE_DEV_HOST))
 #define ENABLE_DEVICE_KERNEL 1
 #endif
 
 #if defined(TTG_HAVE_CUDART)
 #define ES ttg::ExecutionSpace::CUDA
-#define TASKRET -> ttg::device::Task
 #include <cusolverDn.h>
 #elif defined(TTG_ENABLE_HIP)
 #define ES ttg::ExecutionSpace::HIP
-#define TASKRET -> ttg::device::Task
 #include <hipsolver/hipsolver.h>
 #include <hipblas/hipblas.h>
 #else
 #define ES ttg::ExecutionSpace::Host
-#define TASKRET -> void
 #endif
 
 namespace potrf {
@@ -35,21 +32,21 @@ namespace potrf {
 #if defined(ENABLE_DEVICE_KERNEL)
   static int device_potrf_workspace_size(MatrixTile<double> &A) {
     int Lwork;
-    #if defined(TTG_ENABLE_CUDA)
+#if defined(TTG_ENABLE_CUDA)
       cusolverDnDpotrf_bufferSize(cusolver_handle(),
                                   CUBLAS_FILL_MODE_LOWER, A.cols(),
                                   nullptr, A.lda(),
                                   &Lwork);
       return Lwork;
-    #elif defined(TTG_ENABLE_HIP)
+#elif defined(TTG_ENABLE_HIP)
       hipsolverDnDpotrf_bufferSize(hipsolver_handle(),
                                    HIPSOLVER_FILL_MODE_LOWER, A.cols(),
                                    nullptr, A.lda(),
                                    &Lwork);
       return Lwork;
-    #else
+#else
       return 0;
-    #endif
+#endif
   }
 
   static void device_potrf(MatrixTile<double> &A, double *workspace, int Lwork, int *devInfo) {
@@ -64,13 +61,16 @@ namespace potrf {
                       A.buffer().current_device_ptr(), A.lda(),
                       workspace, Lwork,
                       devInfo);
-  #elif defined(TTG_ENABLE_HIP)
+#elif defined(TTG_ENABLE_HIP)
     hipsolverDpotrf(hipsolver_handle(),
                       HIPSOLVER_FILL_MODE_LOWER, A.cols(),
                       A.buffer().current_device_ptr(), A.lda(),
                       workspace, Lwork,
                       devInfo);
-  #endif
+#else
+    auto info = lapack::potrf(lapack::Uplo::Lower, A.rows(), A.buffer().current_device_ptr(), A.lda());
+    assert(info == 0);
+#endif
   }
 
   static void device_norm(const MatrixTile<double> &A, double *norm) {
@@ -81,9 +81,11 @@ namespace potrf {
     auto handle = cublas_handle();
     //double n = 1.0;
     cublasDnrm2(handle, size, buffer, 1, norm);
-  #elif defined(TTG_ENABLE_HIP)
+#elif defined(TTG_ENABLE_HIP)
     hipblasDnrm2(hipblas_handle(), size, buffer, 1, norm);
-  #endif
+#else
+    *norm = blas::nrm2(size, buffer, 1);
+#endif
   }
 #endif // ENABLE_DEVICE_KERNEL
 
@@ -99,7 +101,8 @@ namespace potrf {
     //std::cout << "Creating CUDA POTRF task " << std::endl;
     auto f_dev = [=, iallocator = std::move(iallocator)]
                     (const Key1& key, MatrixTile<T>&& tile_kk,
-                     std::tuple<ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>>& out) TASKRET {
+                     std::tuple<ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>>& out)
+                  -> ttg::device::Task<ES> {
       const auto K = key[0];
 
       /* compute successors before submitting the kernel
@@ -186,7 +189,7 @@ namespace potrf {
         ttg::abort();
       }
     };
-    return ttg::make_tt<ES>(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF",
+    return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF",
                         {"tile_kk/dispatcher"}, {"output_result", "output_trsm"});
 #else /* defined(ENABLE_DEVICE_KERNEL) */
     auto f = [=](const Key1& key, MatrixTile<T>&& tile_kk,
@@ -234,7 +237,7 @@ namespace potrf {
 #if defined(ENABLE_DEVICE_KERNEL)
     auto f = [=](const Key2& key, const MatrixTile<T>& tile_kk, MatrixTile<T>&& tile_mk,
                  std::tuple<ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key3, MatrixTile<T>>,
-                            ttg::Out<Key3, MatrixTile<T>>>& out) TASKRET {
+                            ttg::Out<Key3, MatrixTile<T>>>& out) -> ttg::device::Task<ES> {
       const int M = key[0];
       const int K = key[1];  // the column equals the outer most look K (same as PO)
 
@@ -302,6 +305,9 @@ namespace potrf {
                    mb, nb, &alpha,
                    tile_kk.buffer().current_device_ptr(), tile_kk.lda(),
                    tile_mk.buffer().current_device_ptr(), tile_mk.lda());
+#else
+      blas::trsm(blas::Layout::ColMajor, blas::Side::Right, lapack::Uplo::Lower, blas::Op::Trans, blas::Diag::NonUnit,
+                 mb, nb, 1.0, tile_kk.data(), tile_kk.lda(), tile_mk.data(), tile_mk.lda());
 
 #endif
 
@@ -320,7 +326,7 @@ namespace potrf {
       co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col),
                                                                        std::move(tile_mk), out));
     };
-    return ttg::make_tt<ES>(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)),
+    return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)),
                             ttg::edges(output_result, output_diag, output_row, output_col), "TRSM",
                             {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"});
 #else // defined(ENABLE_DEVICE_KERNEL)
@@ -386,8 +392,8 @@ namespace potrf {
                  ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output_syrk) {
     using T = typename MatrixT::element_type;
 #if defined(ENABLE_DEVICE_KERNEL)
-    auto f = [=](const Key2& key, const MatrixTile<T>& tile_mk, MatrixTile<T>&& tile_kk,
-                 std::tuple<ttg::Out<Key1, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>>& out) TASKRET {
+    auto f = [=](const Key2& key, const MatrixTile<T>& tile_mk, MatrixTile<T>&& tile_kk)
+            -> ttg::device::Task<ES> {
       const int K = key[0];
       const int M = key[1];
 
@@ -432,6 +438,9 @@ namespace potrf {
                    mb, nb, &alpha,
                    tile_mk.buffer().current_device_ptr(), tile_mk.lda(), &beta,
                    tile_kk.buffer().current_device_ptr(), tile_kk.lda());
+#else
+      blas::syrk(blas::Layout::ColMajor, lapack::Uplo::Lower, blas::Op::NoTrans, mb, nb, -1.0, tile_mk.data(),
+                 tile_mk.lda(), 1.0, tile_kk.data(), tile_kk.lda());
 #endif
 
 #ifdef DEBUG_TILES_VALUES
@@ -449,18 +458,17 @@ namespace potrf {
       if (M == K + 1) {
         /* send the tile to potrf */
         if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to POTRF(", Key1{K + 1}, ")");
-        co_await ttg::device::send<0>(Key1(K + 1), std::move(tile_kk), out);
+        co_await ttg::device::send<0>(Key1(K + 1), std::move(tile_kk));
       } else {
         /* send output to next syrk */
         if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to SYRK(", Key2{K + 1, M}, ")");
-        co_await ttg::device::send<1>(Key2(K + 1, M), std::move(tile_kk), out);
+        co_await ttg::device::send<1>(Key2(K + 1, M), std::move(tile_kk));
       }
     };
-    return ttg::make_tt<ES>(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk),
+    return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk),
                             "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"});
 #else // defined(ENABLE_DEVICE_KERNEL)
-    auto f = [=](const Key2& key, const MatrixTile<T>& tile_mk, MatrixTile<T>&& tile_kk,
-                 std::tuple<ttg::Out<Key1, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>>& out) {
+    auto f = [=](const Key2& key, const MatrixTile<T>& tile_mk, MatrixTile<T>&& tile_kk) {
       const int K = key[0];
       const int M = key[1];
 
@@ -487,11 +495,11 @@ namespace potrf {
       if (M == K + 1) {
         /* send the tile to potrf */
         if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to POTRF(", Key1{K + 1}, ")");
-        ttg::send<0>(Key1(K + 1), std::move(tile_kk), out);
+        ttg::send<0>(Key1(K + 1), std::move(tile_kk));
       } else {
         /* send output to next syrk */
         if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to SYRK(", Key2{K + 1, M}, ")");
-        ttg::send<1>(Key2(K + 1, M), std::move(tile_kk), out);
+        ttg::send<1>(Key2(K + 1, M), std::move(tile_kk));
       }
     };
     return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk),
@@ -509,8 +517,8 @@ namespace potrf {
                  ttg::Edge<Key3, MatrixTile<typename MatrixT::element_type>>& output_gemm) {
     using T = typename MatrixT::element_type;
 #if defined(ENABLE_DEVICE_KERNEL)
-    auto f = [=](const Key3& key, const MatrixTile<T>& tile_mk, const MatrixTile<T>& tile_nk, MatrixTile<T>&& tile_mn,
-                 std::tuple<ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key3, MatrixTile<T>>>& out) TASKRET {
+    auto f = [=](const Key3& key, const MatrixTile<T>& tile_mk, const MatrixTile<T>& tile_nk, MatrixTile<T>&& tile_mn)
+            -> ttg::device::Task<ES> {
       const int M = key[0];
       const int N = key[1];
       const int K = key[2];
@@ -559,6 +567,10 @@ namespace potrf {
                    tile_mk.buffer().current_device_ptr(), tile_mk.lda(),
                    tile_nk.buffer().current_device_ptr(), tile_nk.lda(), &beta,
                    tile_mn.buffer().current_device_ptr(), tile_mn.lda());
+#else
+      blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans, tile_mk.rows(), tile_nk.rows(),
+                 tile_nk.cols(), -1.0, tile_mk.data(), tile_mk.lda(), tile_nk.data(), tile_nk.lda(), 1.0,
+                 tile_mn.data(), tile_mn.lda());
 #endif
 
 
@@ -578,19 +590,18 @@ namespace potrf {
       if (N == K + 1) {
         /* send the tile to trsm */
         if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")");
-        co_await ttg::device::send<0>(Key2(M, N), std::move(tile_mn), out);
+        co_await ttg::device::send<0>(Key2(M, N), std::move(tile_mn));
       } else {
         /* send the tile to the next gemm */
         if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to GEMM(", Key3{M, N, K + 1}, ")");
-        co_await ttg::device::send<1>(Key3(M, N, K + 1), std::move(tile_mn), out);
+        co_await ttg::device::send<1>(Key3(M, N, K + 1), std::move(tile_mn));
       }
     };
-    return ttg::make_tt<ES>(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)),
+    return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)),
                             ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"},
                             {"output_trsm", "outout_gemm"});
 #else // defined(ENABLE_DEVICE_KERNEL)
-    auto f = [=](const Key3& key, const MatrixTile<T>& tile_mk, const MatrixTile<T>& tile_nk, MatrixTile<T>&& tile_mn,
-                 std::tuple<ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key3, MatrixTile<T>>>& out) {
+    auto f = [=](const Key3& key, const MatrixTile<T>& tile_mk, const MatrixTile<T>& tile_nk, MatrixTile<T>&& tile_mn) {
       const int M = key[0];
       const int N = key[1];
       const int K = key[2];
@@ -617,11 +628,11 @@ namespace potrf {
       if (N == K + 1) {
         /* send the tile to trsm */
         if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")");
-        ttg::send<0>(Key2(M, N), std::move(tile_mn), out);
+        ttg::send<0>(Key2(M, N), std::move(tile_mn));
       } else {
         /* send the tile to the next gemm */
         if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to GEMM(", Key3{M, N, K + 1}, ")");
-        ttg::send<1>(Key3(M, N, K + 1), std::move(tile_mn), out);
+        ttg::send<1>(Key3(M, N, K + 1), std::move(tile_mn));
       }
     };
     return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)),
@@ -634,20 +645,18 @@ namespace potrf {
   auto make_dispatcher(ttg::Edge<Key2, MatrixTile<T>>& input, ttg::Edge<Key1, MatrixTile<T>>& to_potrf,
                        ttg::Edge<Key2, MatrixTile<T>>& to_trsm, ttg::Edge<Key2, MatrixTile<T>>& to_syrk,
                        ttg::Edge<Key3, MatrixTile<T>>& to_gemm) {
-    auto f = [=](const Key2& key, const MatrixTile<T>& tile,
-                 std::tuple<ttg::Out<Key1, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>, ttg::Out<Key2, MatrixTile<T>>,
-                            ttg::Out<Key3, MatrixTile<T>>>& out) {
+    auto f = [=](const Key2& key, const MatrixTile<T>& tile) {
       if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ")");
       if (0 == key[0] && 0 == key[1]) {
         // First element goes to POTRF
         if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to POTRF(", Key1{key[0]}, ")");
-        ttg::send<0>(Key1{key[0]}, tile, out);
+        ttg::send<0>(Key1{key[0]}, tile);
         return;
       }
       if (key[0] == key[1]) {
         // Other diagonal elements go to SYRK
         if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to SYRK(", Key2{0, key[0]}, ")");
-        ttg::send<2>(Key2{0, key[0]}, tile, out);
+        ttg::send<2>(Key2{0, key[0]}, tile);
         return;
       }
       // We only consider the lower triangular
@@ -655,12 +664,12 @@ namespace potrf {
       if (0 == key[1]) {
         // First column goes to TRSM
         if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to TRSM(", key, ")");
-        ttg::send<1>(key, tile, out);
+        ttg::send<1>(key, tile);
         return;
       }
       // Rest goes to GEMM
       if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to GEMM(", Key3{key[0], key[1], 0}, ")");
-      ttg::send<3>(Key3{key[0], key[1], 0}, tile, out);
+      ttg::send<3>(Key3{key[0], key[1], 0}, tile);
     };
 
     return ttg::make_tt(f, ttg::edges(input), ttg::edges(to_potrf, to_trsm, to_syrk, to_gemm), "POTRF Dispatch",
@@ -705,28 +714,36 @@ namespace potrf {
     tt_potrf->set_keymap(keymap1);
     tt_potrf->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_potrf->set_devicemap(devmap1);
+    if constexpr (ES != ttg::ExecutionSpace::Host) {
+      tt_potrf->set_devicemap(devmap1);
+    }
 #endif // 0
 
     auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
     tt_trsm->set_keymap(keymap2a);
     tt_trsm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_trsm->set_devicemap(devmap2a);
+    if constexpr (ES != ttg::ExecutionSpace::Host) {
+      tt_trsm->set_devicemap(devmap2a);
+    }
 #endif // 0
 
     auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
     tt_syrk->set_keymap(keymap2b);
     tt_syrk->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_syrk->set_devicemap(devmap2b);
+    if constexpr (ES != ttg::ExecutionSpace::Host) {
+      tt_syrk->set_devicemap(devmap2b);
+    }
 #endif // 0
 
     auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
     tt_gemm->set_keymap(keymap3);
     tt_gemm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_gemm->set_devicemap(devmap3);
+    if constexpr (ES != ttg::ExecutionSpace::Host) {
+      tt_gemm->set_devicemap(devmap3);
+    }
 #endif // 0
 
     /* Priorities taken from DPLASMA */
diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc
index 80f14bff4..559e32870 100644
--- a/examples/task-benchmarks/chain-ttg-dev.cc
+++ b/examples/task-benchmarks/chain-ttg-dev.cc
@@ -3,13 +3,19 @@
 
 #include "chrono.h"
 
-#if defined(TTG_HAVE_CUDA)
+#if defined(CHAIN_CUDA)
+#ifndef TTG_HAVE_CUDA
+#error Cannot build CUDA chain benchmark against TTG that does not support CUDA!
+#endif
 #define ES ttg::ExecutionSpace::CUDA
-#elif defined(TTG_HAVE_HIP)
+#elif defined(CHAIN_HIP)
 #define ES ttg::ExecutionSpace::HIP
+#ifndef TTG_HAVE_HIP
+#error Cannot build HIP chain benchmark against TTG that does not support HIP!
+#endif
 #else
-#error "Either CUDA OR HIP is required to build this test!"
-#endif // 0
+#define ES ttg::ExecutionSpace::Host
+#endif
 
 #define NUM_TASKS 100000
 
@@ -53,7 +59,7 @@ auto make_ttg<1>(bool do_move) {
       send<0>(0, A{});
     }, edges(), edges(I2N));
 
-  auto next = make_tt<ES, int>([=](const int &key, auto&& value) -> ttg::device::Task {
+  auto next = make_tt<int>([=](const int &key, auto&& value) -> ttg::device::Task<ES> {
     //++task_counter;
     co_await ttg::device::select(value.b);
     if (key < NUM_TASKS) {
@@ -62,7 +68,6 @@ auto make_ttg<1>(bool do_move) {
       } else {
         co_await ttg::device::forward(ttg::device::send<0>(key+1, value));
       }
-    } else {
     }
   } , edges(fuse(I2N, N2N)), edges(N2N));
 
@@ -80,7 +85,7 @@ auto make_ttg<2>(bool do_move) {
     send<1>(0, A{});
   }, edges(), edges(I2N1, I2N2));
 
-  auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2) -> ttg::device::Task {
+  auto next = make_tt<int>([=](const int &key, A&& v1, A&& v2) -> ttg::device::Task<ES> {
     co_await ttg::device::select(v1.b, v2.b);
     if (key < NUM_TASKS) {
       if (do_move) {
@@ -110,7 +115,7 @@ auto make_ttg<4>(bool do_move) {
       send<3>(0, A{});
     }, edges(), edges(I2N1, I2N2, I2N3, I2N4));
 
-  auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device::Task {
+  auto next = make_tt<int>([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device::Task<ES> {
     co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b);
     if (key < NUM_TASKS) {
       if (do_move) {
@@ -150,7 +155,7 @@ auto make_ttg<8>(bool do_move) {
       send<7>(0, A{});
     }, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));
 
-  auto next = make_tt<ES, int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device::Task {
+  auto next = make_tt<int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device::Task<ES> {
     co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b);
     if (key < NUM_TASKS) {
       if (do_move) {
@@ -187,7 +192,7 @@ auto make_ttg<0>(bool do_move) {
 
   auto init = make_tt<void>([](std::tuple<Out<int, void>> &outs) { sendk<0>(0, outs); }, edges(), edges(I2N));
 
-  auto next = make_tt<ES>([](const int& key) -> ttg::device::Task {
+  auto next = make_tt([](const int& key) -> ttg::device::Task<ES> {
     co_await ttg::device::select();
     if (key < NUM_TASKS) {
       co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
diff --git a/examples/task-benchmarks/chain-ttg.cc b/examples/task-benchmarks/chain-ttg.cc
new file mode 100644
index 000000000..285d57aca
--- /dev/null
+++ b/examples/task-benchmarks/chain-ttg.cc
@@ -0,0 +1,267 @@
+//#define TTG_USE_USER_TERMDET 1
+#include "ttg.h"
+
+#include "chrono.h"
+
+#define NUM_TASKS 100000
+
+using namespace ttg;
+
+template <int num_flows>
+auto make_ttg(bool do_move);
+
+// flows task ids via values
+template <>
+auto make_ttg<1>(bool do_move) {
+  Edge<int, int> I2N, N2N;
+  Edge<void, int> N2S;
+
+  auto init = make_tt<void>([]() { send<0>(0, 0); }, edges(), edges(I2N));
+
+  auto next = make_tt<int>([=](const int &key, auto&& value) {
+    if (key < NUM_TASKS) {
+    //std::cout << &value << " -> " << value << std::endl;
+    //if (key < 10) {
+      //value++;
+      if (do_move) {
+        send<0>(key+1, std::move(value));
+        //send<0>(key+1, value);
+      } else {
+        send<0>(key+1, value);
+      }
+    }
+    else {
+      sendv<1>(std::move(value));
+    }
+  } , edges(fuse(I2N, N2N)), edges(N2N, N2S));
+
+  auto stop = make_tt<void>([](const int& v) {
+     //std::cout << "last task received v=" << v << std::endl;
+     ttg::default_execution_context().impl().final_task();
+  }, edges(N2S), edges());
+
+  return std::make_tuple(std::move(init), std::move(next), std::move(stop));
+}
+
+template <>
+auto make_ttg<2>(bool do_move) {
+  Edge<int, int> I2N1, I2N2;
+  Edge<int, int> N2N1, N2N2;
+  Edge<void, int> N2S1, N2S2;
+
+  auto init = make_tt<void>([]() {
+    send<0>(0, 0);
+    send<1>(0, 0);
+  }, edges(), edges(I2N1, I2N2));
+
+  auto next = make_tt<int>([=](const int &key, int&& v1, int&& v2) {
+    if (key < NUM_TASKS) {
+      v1++; v2++;
+      if (do_move) {
+        send<0>(key+1, std::move(v1));
+        send<1>(key+1, std::move(v2));
+      } else {
+        send<0>(key+1, v1);
+        send<1>(key+1, v2);
+      }
+    }
+    else {
+      sendv<2>(std::move(v1));
+      sendv<3>(std::move(v2));
+    }
+  } , edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2, N2S1, N2S2));
+
+  auto stop = make_tt<void>([](const int &v1, const int &v2) {
+     //std::cout << "last task received v=" << v1 << std::endl;
+     ttg::default_execution_context().impl().final_task();
+  }, edges(N2S1, N2S2), edges());
+
+  return std::make_tuple(std::move(init), std::move(next), std::move(stop));
+}
+
+template <>
+auto make_ttg<4>(bool do_move) {
+  Edge<int, int> I2N1, I2N2, I2N3, I2N4;
+  Edge<int, int> N2N1, N2N2, N2N3, N2N4;
+  Edge<void, int> N2S1, N2S2, N2S3, N2S4;
+
+  auto init = make_tt<void>([]() {
+    send<0>(0, 0);
+    send<1>(0, 0);
+    send<2>(0, 0);
+    send<3>(0, 0);
+  }, edges(), edges(I2N1, I2N2, I2N3, I2N4));
+
+  auto next = make_tt<int>([=](const int &key, int&& v1, int&& v2, int&& v3, int&& v4) {
+    if (key < NUM_TASKS) {
+      v1++; v2++;
+      v3++; v4++;
+      if (do_move) {
+        send<0>(key+1, std::move(v1));
+        send<1>(key+1, std::move(v2));
+        send<2>(key+1, std::move(v3));
+        send<3>(key+1, std::move(v4));
+      } else {
+        send<0>(key+1, v1);
+        send<1>(key+1, v2);
+        send<2>(key+1, v3);
+        send<3>(key+1, v4);
+      }
+    }
+    else {
+      sendv<4>(std::move(v1));
+      sendv<5>(std::move(v2));
+      sendv<6>(std::move(v3));
+      sendv<7>(std::move(v4));
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2),
+           fuse(I2N3, N2N3), fuse(I2N4, N2N4)),
+     edges(N2N1, N2N2, N2N3, N2N4, N2S1, N2S2, N2S3, N2S4));
+
+  auto stop = make_tt<void>([](const int& v1, const int& v2, const int& v3, const int& v4){
+     //std::cout << "last task received v=" << v1 << std::endl;
+     ttg::default_execution_context().impl().final_task();
+  }, edges(N2S1, N2S2, N2S3, N2S4), edges());
+
+  return std::make_tuple(std::move(init), std::move(next), std::move(stop));
+}
+
+template <>
+auto make_ttg<8>(bool do_move) {
+  Edge<int, int> I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8;
+  Edge<int, int> N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8;
+  Edge<void, int> N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8;
+
+  auto init = make_tt<void>([]() {
+    send<0>(0, 0);
+    send<1>(0, 0);
+    send<2>(0, 0);
+    send<3>(0, 0);
+    send<4>(0, 0);
+    send<5>(0, 0);
+    send<6>(0, 0);
+    send<7>(0, 0);
+  }, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));
+
+  auto next = make_tt<int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3,
+                               auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) {
+    if (key < NUM_TASKS) {
+    //if (key < 1000) {
+      v1++; v2++;
+      v3++; v4++;
+      v5++; v6++;
+      v6++; v8++;
+      if (do_move) {
+        send<0>(key+1, std::move(v1));
+        send<1>(key+1, std::move(v2));
+        send<2>(key+1, std::move(v3));
+        send<3>(key+1, std::move(v4));
+        send<4>(key+1, std::move(v5));
+        send<5>(key+1, std::move(v6));
+        send<6>(key+1, std::move(v7));
+        send<7>(key+1, std::move(v8));
+      } else {
+        send<0>(key+1, v1);
+        send<1>(key+1, v2);
+        send<2>(key+1, v3);
+        send<3>(key+1, v4);
+        send<4>(key+1, v5);
+        send<5>(key+1, v6);
+        send<6>(key+1, v7);
+        send<7>(key+1, v8);
+      }
+    }
+    else {
+      sendv<8>(std::move(v1));
+      sendv<9>(std::move(v2));
+      sendv<10>(std::move(v3));
+      sendv<11>(std::move(v4));
+      sendv<12>(std::move(v5));
+      sendv<13>(std::move(v6));
+      sendv<14>(std::move(v7));
+      sendv<15>(std::move(v8));
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4), fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)),
+     edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8, N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8));
+
+  auto stop = make_tt<void>([](const int &v1, const int &v2, const int &v3, const int &v4, const int &v5, const int &v6, const int &v7, const int &v8) {
+     //std::cout << "last task received v=" << v1 << std::endl;
+     ttg::default_execution_context().impl().final_task();
+  }, edges(N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8), edges());
+
+  return std::make_tuple(std::move(init), std::move(next), std::move(stop));
+}
+
+// flows task ids via keys
+template <>
+auto make_ttg<0>(bool do_move) {
+  Edge<int, void> I2N, N2N;
+  Edge<void, int> N2S;
+
+  auto init = make_tt<void>([]() { sendk<0>(0); }, edges(), edges(I2N));
+
+  auto next = make_tt([](const int& key) {
+    if (key < NUM_TASKS) {
+      ::sendk<0>(key+1);
+    }
+    else {
+      ::sendv<1>(key);
+    }
+  }, edges(fuse(I2N, N2N)), edges(N2N, N2S));
+
+  auto stop = make_tt<void>([](const int &v) {
+     //std::cout << "last task received v=" << v << std::endl;
+     ttg::default_execution_context().impl().final_task();
+  }, edges(N2S), edges());
+
+  return std::make_tuple(std::move(init), std::move(next), std::move(stop));
+}
+
+template<int num_flows>
+void run_bench(bool do_move)
+{
+  auto [init, next, stop] = make_ttg<num_flows>(do_move);
+
+  auto connected = make_graph_executable(init.get());
+  assert(connected);
+  std::cout << "Graph is connected.\n";
+
+  auto t0 = now();
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+  auto t1 = now();
+
+  std::cout << "# of tasks = " << NUM_TASKS << std::endl;
+  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1)
+            << ", avg " << duration_in_mus(t0, t1) / (double)NUM_TASKS << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+
+  int num_flows = 0;
+  int do_move = 1;
+  ttg_initialize(argc, argv, -1);
+
+  if (argc > 1) {
+    num_flows = std::atoi(argv[1]);
+  }
+
+  if (argc > 2) {
+    do_move = std::atoi(argv[2]);
+  }
+
+  switch(num_flows) {
+  case 0: run_bench<0>(do_move); break;
+  case 1: run_bench<1>(do_move); break;
+  case 2: run_bench<2>(do_move); break;
+  case 4: run_bench<4>(do_move); break;
+  case 8: run_bench<8>(do_move); break;
+  default: std::cout << "Unsupported number of flows: " << num_flows << std::endl;
+  }
+
+  ttg_finalize();
+  return 0;
+}
+
diff --git a/tests/unit/cuda_kernel.cu b/tests/unit/cuda_kernel.cu
index f6f00d172..f280f9a97 100644
--- a/tests/unit/cuda_kernel.cu
+++ b/tests/unit/cuda_kernel.cu
@@ -13,7 +13,10 @@ static __global__ void cu_increment_buffer(double* buffer, double* scratch) {
   }
 }
 
-void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size) {
+void increment_buffer_cuda(
+  double* buffer, std::size_t buffer_size,
+  double* scratch, std::size_t scratch_size)
+{
 
   cu_increment_buffer<<<1, buffer_size>>>(buffer, scratch);
 
diff --git a/tests/unit/cuda_kernel.h b/tests/unit/cuda_kernel.h
index 4fec87a99..a0d3181ce 100644
--- a/tests/unit/cuda_kernel.h
+++ b/tests/unit/cuda_kernel.h
@@ -1,4 +1,5 @@
 #include "ttg/config.h"
 #include <cinttypes>
 
-void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size);
\ No newline at end of file
+void increment_buffer_cuda(
+    double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size);
\ No newline at end of file
diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc
index 60581d232..489ad8ffb 100644
--- a/tests/unit/device_coro.cc
+++ b/tests/unit/device_coro.cc
@@ -6,10 +6,24 @@
 
 #include "cuda_kernel.h"
 
+#if defined(TTG_HAVE_CUDA)
+#define SPACE ttg::ExecutionSpace::CUDA
+#else
+#define SPACE ttg::ExecutionSpace::Host
+#endif // 0
+
 struct value_t {
   ttg::Buffer<double> db; // TODO: rename
   int quark;
 
+  value_t()
+  {}
+
+  value_t(std::size_t c)
+  : db(c)
+  , quark(c)
+  { }
+
   template<typename Archive>
   void serialize(Archive& ar, const unsigned int version) {
     ar& quark;
@@ -17,6 +31,22 @@ struct value_t {
   }
 };
 
+static void increment_buffer(
+  double* buffer, std::size_t buffer_size,
+  double* scratch, std::size_t scratch_size)
+{
+#if defined(TTG_HAVE_CUDA)
+  increment_buffer(buffer, buffer_size, scratch, scratch_size);
+#else  // TTG_HAVE_CUDA
+  for (std::size_t i = 0 ; i < buffer_size; ++i) {
+    buffer[i] += 1.0;
+  }
+  if (scratch != nullptr) {
+    *scratch += 1.0;
+  }
+#endif // TTG_HAVE_CUDA
+}
+
 #ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS
 /* devicebuf is non-POD so provide serialization
  * information for members not a devicebuf */
@@ -28,14 +58,14 @@ namespace madness::archive {
 }  // namespace madness::archive
 #endif  // TTG_SERIALIZATION_SUPPORTS_MADNESS
 
-#if defined(TTG_HAVE_DEVICE) && defined(TTG_IMPL_DEVICE_SUPPORT)
+#if defined(TTG_IMPL_DEVICE_SUPPORT)
 
 TEST_CASE("Device", "coro") {
 
   SECTION("devicebuf") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       //ttg::print("device_task key ", key);
 
       /* wait for the view to be available on the device */
@@ -58,10 +88,10 @@ TEST_CASE("Device", "coro") {
 
     //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     std::cout << "Entering fence" << std::endl;
     ttg::ttg_fence(ttg::default_execution_context());
   }
@@ -69,7 +99,7 @@ TEST_CASE("Device", "coro") {
   SECTION("devicebuf-inc") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       //ttg::print("device_task key ", key);
 
       /* wait for the view to be available on the device */
@@ -80,19 +110,15 @@ TEST_CASE("Device", "coro") {
       std::cout << "KEY " << key << " VAL IN DEV " << *val.db.current_device_ptr() << " VAL IN HOST " << *val.db.host_ptr() << std::endl;
 
       /* call a kernel */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), nullptr, 0);
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel to complete */
       co_await ttg::device::wait(val.db);
 
       std::cout << "KEY " << key << " VAL OUT DEV " << *val.db.current_device_ptr() << " VAL OUT HOST " << *val.db.host_ptr() << std::endl;
 
-#ifdef TTG_HAVE_CUDA
       /* buffer is increment once per task, so it should be the same as key */
       CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
-#endif // TTG_HAVE_CUDA
 
       /* we're back, the kernel executed and we can send */
       if (key < 10) {
@@ -104,10 +130,10 @@ TEST_CASE("Device", "coro") {
 
     //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    value_t v;
+    value_t v{1};
     *v.db.host_ptr() = 2.0; // start from non-zero value
     if (ttg::default_execution_context().rank() == 0) tt->invoke(2, std::move(v));
     std::cout << "Entering fence" << std::endl;
@@ -117,7 +143,7 @@ TEST_CASE("Device", "coro") {
   SECTION("scratch") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = 0.0;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
 
@@ -127,17 +153,13 @@ TEST_CASE("Device", "coro") {
       CHECK(ds.device_ptr()  != nullptr);
 
       /* call a kernel */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel to complete */
       co_await ttg::device::wait(ds);
 
-#ifdef TTG_HAVE_CUDA
       /* the scratch is allocated but no data is transferred in; it's incremented once */
       CHECK((static_cast<int>(scratch)-1) == 0);
-#endif // 0
 
       /* we're back, the kernel executed and we can send */
       if (key < 10) {
@@ -149,17 +171,17 @@ TEST_CASE("Device", "coro") {
       }
     };
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
   SECTION("scratch-syncin") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = key;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
 
@@ -169,17 +191,13 @@ TEST_CASE("Device", "coro") {
       CHECK(ds.device_ptr()  != nullptr);
 
       /* call a kernel */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel to complete */
       co_await ttg::device::wait(ds);
 
-#ifdef TTG_HAVE_CUDA
       /* scratch is increment once per task, so it should be the same as key */
       CHECK((static_cast<int>(scratch))-1 == key);
-#endif // 0
 
       /* we're back, the kernel executed and we can send */
       if (key < 10) {
@@ -191,17 +209,17 @@ TEST_CASE("Device", "coro") {
       }
     };
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
   SECTION("scratch-value-out") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = 0.0;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
 
@@ -211,17 +229,13 @@ TEST_CASE("Device", "coro") {
       CHECK(ds.device_ptr()  != nullptr);
 
       /* call a kernel */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel to complete */
       co_await ttg::device::wait(ds, val.db);
 
-#ifdef TTG_HAVE_CUDA
       /* buffer is increment once per task, so it should be 1 */
       CHECK((static_cast<int>(scratch)-1) == 0);
-#endif // 0
 
       /* we're back, the kernel executed and we can send */
       if (key < 10) {
@@ -233,20 +247,23 @@ TEST_CASE("Device", "coro") {
       }
     };
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
+
+#if 0
+  /* TODO: Ptr seems broken atm, fix or remove! */
   SECTION("ptr") {
 
     ttg::Edge<int, value_t> edge;
     ttg::Ptr<value_t> ptr;
     int last_key = 0;
     constexpr const int num_iter = 10;
-    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = key;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
 
@@ -256,18 +273,14 @@ TEST_CASE("Device", "coro") {
       CHECK(ds.device_ptr()  != nullptr);
 
       /* KERNEL */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel and the out-transfer to complete */
       co_await ttg::device::wait(val.db, ds);
 
-#ifdef TTG_HAVE_CUDA
       /* buffer is increment once per task, so it should be the same as key */
       CHECK(static_cast<int>(scratch) == key+1);
       CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
-#endif // TTG_HAVE_CUDA
 
       /* we're back, the kernel executed and we can send */
       if (key < num_iter) {
@@ -283,10 +296,10 @@ TEST_CASE("Device", "coro") {
 
     //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
     if (num_iter == last_key) {
       CHECK(ptr.is_valid());
@@ -300,6 +313,8 @@ TEST_CASE("Device", "coro") {
     ptr.reset();
   }
 
+#endif // 0
+
   /* TODO: enabel this test once we control the PaRSEC state machine! */
   SECTION("device-host-tasks") {
 
@@ -307,15 +322,11 @@ TEST_CASE("Device", "coro") {
 
     auto host_fn = [&](const int& key, value_t&& val) {
       /* check that the data has been synced back */
-#ifdef TTG_HAVE_CUDA
       CHECK(static_cast<int>(*val.db.host_ptr()) == key);
-#endif // TTG_HAVE_CUDA
 
       /* modify the data */
       *val.db.host_ptr() += 1.0;
-#ifdef TTG_HAVE_CUDA
       CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
-#endif // TTG_HAVE_CUDA
 
       /* send back to the device */
       ttg::send<0>(key+1, std::move(val));
@@ -323,15 +334,13 @@ TEST_CASE("Device", "coro") {
     auto htt = ttg::make_tt(host_fn, ttg::edges(d2h), ttg::edges(h2d),
                             "host_task", {"d2h"}, {"h2d"});
 
-    auto device_fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+    auto device_fn = [&](const int& key, value_t&& val) -> ttg::device::Task<SPACE> {
 
       /* wait for the view to be available on the device */
       co_await ttg::device::select(val.db);
 
       /* call a kernel */
-#ifdef TTG_HAVE_CUDA
       increment_buffer(val.db.current_device_ptr(), val.db.size(), nullptr, 0);
-#endif // TTG_HAVE_CUDA
 
       /* here we suspend to wait for a kernel to complete */
       //co_await ttg::device::wait(val.db);
@@ -345,17 +354,17 @@ TEST_CASE("Device", "coro") {
       }
     };
 
-    auto dtt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(device_fn, ttg::edges(h2d), ttg::edges(d2h),
-                                                      "device_task", {"h2d"}, {"d2h"});
+    auto dtt = ttg::make_tt(device_fn, ttg::edges(h2d), ttg::edges(d2h),
+                            "device_task", {"h2d"}, {"d2h"});
     ttg::make_graph_executable(dtt);
-    if (ttg::default_execution_context().rank() == 0) htt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) htt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
   SECTION("loop") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](int key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](int key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = 1.0;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
 
@@ -370,33 +379,29 @@ TEST_CASE("Device", "coro") {
         CHECK(val.db.current_device_ptr() != nullptr);
 
         /* KERNEL */
-#ifdef TTG_HAVE_CUDA
         increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
         //increment_buffer(val.db.current_device_ptr(), val.db.size(), 0, 0);
-#endif // TTG_HAVE_CUDA
 
         /* here we suspend to wait for a kernel and the out-transfer to complete */
         co_await ttg::device::wait(val.db);
 
-#ifdef TTG_HAVE_CUDA
         /* buffer is increment once per task, so it should be the same as key */
         //CHECK(static_cast<int>(scratch) == i);
         CHECK(static_cast<int>(*val.db.host_ptr()) == i+1);
-#endif // TTG_HAVE_CUDA
       }
     };
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
   SECTION("loop-scratchout") {
 
     ttg::Edge<int, value_t> edge;
-    auto fn = [&](int key, value_t&& val) -> ttg::device::Task {
+    auto fn = [&](int key, value_t&& val) -> ttg::device::Task<SPACE> {
       double scratch = -10.0;
       ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
 
@@ -411,26 +416,22 @@ TEST_CASE("Device", "coro") {
         CHECK(val.db.current_device_ptr() != nullptr);
 
         /* KERNEL */
-#ifdef TTG_HAVE_CUDA
         increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
         //increment_buffer(val.db.current_device_ptr(), val.db.size(), 0, 0);
-#endif // TTG_HAVE_CUDA
 
         /* here we suspend to wait for a kernel and the out-transfer to complete */
         co_await ttg::device::wait(val.db, ds);
 
-#ifdef TTG_HAVE_CUDA
         /* buffer is increment once per task, so it should be the same as key */
         CHECK(static_cast<int>(scratch) == (-10+i+1));
         CHECK(static_cast<int>(*val.db.host_ptr()) == i+1);
-#endif // TTG_HAVE_CUDA
       }
     };
 
-    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
-                                                      "device_task", {"edge_in"}, {"edge_out"});
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
     ttg::make_graph_executable(tt);
-    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{1});
     ttg::ttg_fence(ttg::default_execution_context());
   }
 }
diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h
index dc1ed8e5b..333490a22 100644
--- a/ttg/ttg/coroutine.h
+++ b/ttg/ttg/coroutine.h
@@ -214,6 +214,7 @@ namespace ttg {
 
   // fwd declare all coro promise types that have not been declared yet
   namespace device::detail {
+    template<ttg::ExecutionSpace Space>
     struct device_task_promise_type;
   }  // namespace device::detail
 
diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
index 8e2d14cfc..a772ad73d 100644
--- a/ttg/ttg/device/task.h
+++ b/ttg/ttg/device/task.h
@@ -402,26 +402,31 @@ namespace ttg::device {
 
   namespace detail {
     // fwd-decl
+    template<ttg::ExecutionSpace ES>
     struct device_task_promise_type;
     // base type for ttg::device::Task
-    using device_task_handle_type = ttg::coroutine_handle<device_task_promise_type>;
+    template<ttg::ExecutionSpace ES>
+    using device_task_handle_type = ttg::coroutine_handle<device_task_promise_type<ES>>;
   } // namespace detail
 
   /// A device::Task is a coroutine (a callable that can be suspended and resumed).
 
-  /// Since task execution in TTG is not preempable, tasks should not block.
+  /// Since task execution in TTG is not preemptable, tasks should not block.
   /// The purpose of suspending a task is to yield control back to the runtime until some events occur;
   /// in the meantime its executor (e.g., a user-space thread) can perform other work.
   /// Once the task function reaches a point where further progress is pending completion of one or more asynchronous
   /// actions the function needs to be suspended via a coroutine await (`co_await`).
   /// Resumption will be handled by the runtime.
-  struct Task : public detail::device_task_handle_type {
-    using base_type = detail::device_task_handle_type;
+  template<ttg::ExecutionSpace ES = available_execution_space>
+  struct Task : public detail::device_task_handle_type<ES> {
+    using base_type = detail::device_task_handle_type<ES>;
+
+    static constexpr const ttg::ExecutionSpace space = ES;
 
     /// these are members mandated by the promise_type concept
     ///@{
 
-    using promise_type = detail::device_task_promise_type;
+    using promise_type = detail::device_task_promise_type<ES>;
 
     ///@}
 
@@ -444,8 +449,11 @@ namespace ttg::device {
     * application task coroutine on the first co_yield. It subsequently
     * tracks the state of the task when it moves from waiting for transfers
     * to waiting for the submitted kernel to complete. */
+    template<ttg::ExecutionSpace ES>
     struct device_task_promise_type {
 
+      static constexpr const ttg::ExecutionSpace space = ES;
+
       /* do not suspend the coroutine on first invocation, we want to run
       * the coroutine immediately and suspend when we get the device transfers.
       */
@@ -463,41 +471,58 @@ namespace ttg::device {
         return {};
       }
 
-      /* Allow co_await on a tuple */
-      template<typename... Views>
-      ttg::suspend_always await_transform(std::tuple<Views&...> &views) {
-        return yield_value(views);
-      }
-
       template<typename... Ts>
-      ttg::suspend_always await_transform(detail::to_device_t<Ts...>&& a) {
-        bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
-        /* TODO: are we allowed to not suspend here and launch the kernel directly? */
-        m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
-        return {};
+      auto await_transform(detail::to_device_t<Ts...>&& a) {
+        if constexpr (space != ttg::ExecutionSpace::Host) {
+          bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
+          /* TODO: are we allowed to not suspend here and launch the kernel directly? */
+          m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
+          return ttg::suspend_always{};
+        } else {
+          return ttg::suspend_never{}; // host never suspends
+        }
       }
 
       template<typename... Ts>
       auto await_transform(detail::wait_kernel_t<Ts...>&& a) {
-        //std::cout << "yield_value: wait_kernel_t" << std::endl;
-        if constexpr (sizeof...(Ts) > 0) {
-          TTG_IMPL_NS::mark_device_out(a.ties);
+        if constexpr (space != ttg::ExecutionSpace::Host) {
+          if constexpr (sizeof...(Ts) > 0) {
+            TTG_IMPL_NS::mark_device_out(a.ties);
+          }
+          m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL;
+          return ttg::suspend_always{};
+        } else {
+          return ttg::suspend_never{}; // host never suspends
         }
-        m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL;
-        return a;
       }
 
       ttg::suspend_always await_transform(std::vector<device::detail::send_t>&& v) {
-        m_sends = std::forward<std::vector<device::detail::send_t>>(v);
-        m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT;
+        if constexpr (space != ttg::ExecutionSpace::Host) {
+          m_sends = std::move(v);
+          m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT;
+        } else {
+          /* execute second part of sends immediately and never suspend */
+          for (auto& send : v) {
+            send.coro();
+          }
+          v.clear();
+        }
         return {};
+        // unreachable
+        throw std::runtime_error("Returned after sending!");
       }
 
       ttg::suspend_always await_transform(device::detail::send_t&& v) {
-        m_sends.clear();
-        m_sends.push_back(std::forward<device::detail::send_t>(v));
-        m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT;
+        if constexpr (space != ttg::ExecutionSpace::Host) {
+          m_sends.clear();
+          m_sends.push_back(std::move(v));
+          m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT;
+        } else {
+          v.coro();
+        }
         return {};
+        // unreachable
+        throw std::runtime_error("Returned after sending!");
       }
 
       void return_void() {
@@ -508,7 +533,9 @@ namespace ttg::device {
         return m_state == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
       }
 
-      ttg::device::Task get_return_object() { return {detail::device_task_handle_type::from_promise(*this)}; }
+      ttg::device::Task<ES> get_return_object() {
+        return {detail::device_task_handle_type<ES>::from_promise(*this)};
+      }
 
       void unhandled_exception() {
         std::cerr << "Task coroutine caught an unhandled exception!" << std::endl;
@@ -532,108 +559,25 @@ namespace ttg::device {
     private:
       std::vector<device::detail::send_t> m_sends;
       ttg_device_coro_state m_state = ttg::device::detail::TTG_DEVICE_CORO_STATE_NONE;
-
     };
 
+    template<typename T>
+    struct is_device_task : std::false_type { };
+    template<ttg::ExecutionSpace ES>
+    struct is_device_task<Task<ES>> : std::true_type { };
+    template<typename T>
+    constexpr bool is_device_task_v = is_device_task<T>::value;
+
   } // namespace detail
 
-  bool Task::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; }
+  template<ttg::ExecutionSpace ES>
+  bool Task<ES>::completed() {
+    return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
+  }
 
   struct device_wait_kernel
   { };
 
-
-  /* NOTE: below is preliminary for reductions on the device, which is not available yet */
-#if 0
-  /**************************
-   * Device reduction coros *
-   **************************/
-
-  struct device_reducer_promise_type;
-
-  using device_reducer_handle_type = ttg::coroutine_handle<device_reducer_promise_type>;
-
-  /// task that can be resumed after some events occur
-  struct device_reducer : public device_reducer_handle_type {
-    using base_type = device_reducer_handle_type;
-
-    /// these are members mandated by the promise_type concept
-    ///@{
-
-    using promise_type = device_reducer_promise_type;
-
-    ///@}
-
-    device_reducer(base_type base) : base_type(std::move(base)) {}
-
-    base_type& handle() { return *this; }
-
-    /// @return true if ready to resume
-    inline bool ready() {
-      return true;
-    }
-
-    /// @return true if task completed and can be destroyed
-    inline bool completed();
-  };
-
-
-  /* The promise type that stores the views provided by the
-  * application task coroutine on the first co_yield. It subsequently
-  * tracks the state of the task when it moves from waiting for transfers
-  * to waiting for the submitted kernel to complete. */
-  struct device_reducer_promise_type {
-
-    /* do not suspend the coroutine on first invocation, we want to run
-    * the coroutine immediately and suspend when we get the device transfers.
-    */
-    ttg::suspend_never initial_suspend() {
-      m_state = ttg::device::detail::TTG_DEVICE_CORO_INIT;
-      return {};
-    }
-
-    /* suspend the coroutine at the end of the execution
-    * so we can access the promise.
-    * TODO: necessary? maybe we can save one suspend here
-    */
-    ttg::suspend_always final_suspend() noexcept {
-      m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
-      return {};
-    }
-
-    template<typename... Ts>
-    ttg::suspend_always await_transform(detail::to_device_t<Ts...>&& a) {
-      bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
-      /* TODO: are we allowed to not suspend here and launch the kernel directly? */
-      m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
-      return {};
-    }
-
-    void return_void() {
-      m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
-    }
-
-    bool complete() const {
-      return m_state == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
-    }
-
-    device_reducer get_return_object() { return device_reducer{device_reducer_handle_type::from_promise(*this)}; }
-
-    void unhandled_exception() { }
-
-    auto state() {
-      return m_state;
-    }
-
-
-  private:
-    ttg::device::detail::ttg_device_coro_state m_state = ttg::device::detail::TTG_DEVICE_CORO_STATE_NONE;
-
-  };
-
-  bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; }
-#endif // 0
-
 }  // namespace ttg::device
 
 #endif // TTG_HAVE_COROUTINE
diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index 5d2360cfb..bf572e04d 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -343,20 +343,26 @@ namespace ttg_madness {
           // ttg::print("starting task");
           if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
             TTG_PROCESS_TT_OP_RETURN(
+                ttg::ExecutionSpace::Host,
                 suspended_task_address,
                 coroutine_id,
                 derived->op(key, this->make_input_refs(),
                             derived->output_terminals));  // !!! NOTE converting input values to refs
           } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
-            TTG_PROCESS_TT_OP_RETURN(suspended_task_address, coroutine_id, derived->op(key, derived->output_terminals));
+            TTG_PROCESS_TT_OP_RETURN(
+                ttg::ExecutionSpace::Host, suspended_task_address,
+                coroutine_id, derived->op(key, derived->output_terminals));
           } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
             TTG_PROCESS_TT_OP_RETURN(
+                ttg::ExecutionSpace::Host,
                 suspended_task_address,
                 coroutine_id,
                 derived->op(this->make_input_refs(),
                             derived->output_terminals));  // !!! NOTE converting input values to refs
           } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
-            TTG_PROCESS_TT_OP_RETURN(suspended_task_address, coroutine_id, derived->op(derived->output_terminals));
+            TTG_PROCESS_TT_OP_RETURN(
+                ttg::ExecutionSpace::Host, suspended_task_address,
+                coroutine_id, derived->op(derived->output_terminals));
           } else  // unreachable
             ttg::abort();
         } else {  // resume suspended coroutine
diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h
index 81897b816..161fa8725 100644
--- a/ttg/ttg/make_tt.h
+++ b/ttg/ttg/make_tt.h
@@ -3,6 +3,41 @@
 #ifndef TTG_MAKE_TT_H
 #define TTG_MAKE_TT_H
 
+namespace detail {
+  template<typename T>
+  struct op_return_type {
+    using type = void;
+  };
+
+#ifdef TTG_HAVE_COROUTINE
+  template<>
+  struct op_return_type<ttg::resumable_task> {
+    using type = ttg::coroutine_handle<ttg::resumable_task_state>;
+  };
+
+  template<ttg::ExecutionSpace ES>
+  struct op_return_type<ttg::device::Task<ES>> {
+    using type = typename ttg::device::Task<ES>::base_type;
+  };
+#endif // TTG_HAVE_COROUTINE
+
+  template<typename T>
+  using op_return_type_t = typename op_return_type<T>::type;
+
+  template<typename T>
+  struct op_execution_space : std::integral_constant<ttg::ExecutionSpace, ttg::ExecutionSpace::Host>
+  { };
+
+  template<ttg::ExecutionSpace ES>
+  struct op_execution_space<ttg::device::Task<ES>> : std::integral_constant<ttg::ExecutionSpace, ES>
+  { };
+
+  template<typename T>
+  constexpr const ttg::ExecutionSpace op_execution_space_v = op_execution_space<T>::value;
+
+} // namespace detail
+
+
 // Class to wrap a callable with signature
 //
 // case 1 (keyT != void): void op(auto&& key, std::tuple<input_valuesT...>&&, std::tuple<output_terminalsT...>&)
@@ -131,12 +166,12 @@ struct CallableWrapTTUnwrapTypelist<funcT, funcT_receives_outterm_tuple, keyT, o
 // case 2 (keyT == void): returnT op(input_valuesT&&..., std::tuple<output_terminalsT...>&)
 //
 // returnT is void for funcT = synchronous (ordinary) function and the appropriate return type for funcT=coroutine
-template <typename funcT, typename returnT, bool funcT_receives_outterm_tuple, ttg::ExecutionSpace space,
+template <typename funcT, typename returnT, bool funcT_receives_outterm_tuple, ttg::ExecutionSpace Space,
           typename keyT, typename output_terminalsT, typename... input_valuesT>
 class CallableWrapTTArgs
     : public TT<
           keyT, output_terminalsT,
-          CallableWrapTTArgs<funcT, returnT, funcT_receives_outterm_tuple, space, keyT, output_terminalsT, input_valuesT...>,
+          CallableWrapTTArgs<funcT, returnT, funcT_receives_outterm_tuple, Space, keyT, output_terminalsT, input_valuesT...>,
           ttg::typelist<input_valuesT...>> {
   using baseT = typename CallableWrapTTArgs::ttT;
 
@@ -147,27 +182,13 @@ class CallableWrapTTArgs
 
   using noref_funcT = std::remove_reference_t<funcT>;
   std::conditional_t<std::is_function_v<noref_funcT>, std::add_pointer_t<noref_funcT>, noref_funcT> func;
-
-  using op_return_type =
-#ifdef TTG_HAVE_COROUTINE
-      std::conditional_t<std::is_same_v<returnT, ttg::resumable_task>,
-                         ttg::coroutine_handle<ttg::resumable_task_state>,
-#ifdef TTG_HAVE_DEVICE
-                         std::conditional_t<std::is_same_v<returnT, ttg::device::Task>,
-                                            ttg::device::Task::base_type,
-                                            void>
-#else  // TTG_HAVE_DEVICE
-                           void
-#endif  // TTG_HAVE_DEVICE
-                         >;
-#else   // TTG_HAVE_COROUTINE
-      void;
-#endif  // TTG_HAVE_COROUTINE
+  static_assert(!ttg::device::detail::is_device_task_v<void>);
+  using op_return_type = detail::op_return_type_t<returnT>;
 
 public:
-  static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA);
-  static constexpr bool have_hip_op  = (space == ttg::ExecutionSpace::HIP);
-  static constexpr bool have_level_zero_op = (space == ttg::ExecutionSpace::L0);
+  static constexpr bool have_cuda_op = (Space == ttg::ExecutionSpace::CUDA);
+  static constexpr bool have_hip_op  = (Space == ttg::ExecutionSpace::HIP);
+  static constexpr bool have_level_zero_op = (Space == ttg::ExecutionSpace::L0);
 
 protected:
 
@@ -186,20 +207,12 @@ class CallableWrapTTArgs
           coro_handle = ret;
         }
         return coro_handle;
-      } else
-#ifdef TTG_HAVE_DEVICE
-          if constexpr (std::is_same_v<returnT, ttg::device::Task>) {
-        ttg::device::Task::base_type coro_handle = ret;
+      } else if constexpr (ttg::device::detail::is_device_task_v<returnT>) {
+        typename returnT::base_type coro_handle = ret;
         return coro_handle;
       }
-#else  // TTG_HAVE_DEVICE
-        ttg::abort();  // should not happen
-#endif  // TTG_HAVE_DEVICE
       if constexpr (!(std::is_same_v<returnT, ttg::resumable_task>
-#ifdef TTG_HAVE_DEVICE
-          || std::is_same_v<returnT, ttg::device::Task>
-#endif  // TTG_HAVE_DEVICE
-              ))
+                   || ttg::device::detail::is_device_task_v<returnT>))
 #endif
       {
         static_assert(std::tuple_size_v<std::remove_reference_t<decltype(out)>> == 1,
@@ -554,8 +567,7 @@ auto make_tt_tpl(funcT &&func, const std::tuple<ttg::Edge<keyT, input_edge_value
 /// @warning Although generic arguments annotated by `const auto&` are also permitted, their use is discouraged to avoid confusion;
 ///          namely, `const auto&` denotes a _consumable_ argument, NOT read-only, despite the `const`.
 // clang-format on
-template <ttg::ExecutionSpace space,
-          typename keyT = void, typename funcT,
+template <typename keyT = void, typename funcT,
           typename... input_edge_valuesT, typename... output_edgesT>
 auto make_tt(funcT &&func, const std::tuple<ttg::Edge<keyT, input_edge_valuesT>...> &inedges = std::tuple<>{},
              const std::tuple<output_edgesT...> &outedges = std::tuple<>{}, const std::string &name = "wrapper",
@@ -593,6 +605,8 @@ auto make_tt(funcT &&func, const std::tuple<ttg::Edge<keyT, input_edge_valuesT>.
                 "signature of func "
                 "is faulty, or inedges does match the expected list of types, or both");
 
+  constexpr const ttg::ExecutionSpace space = detail::op_execution_space_v<func_return_t>;
+
   // net argument typelist
   using func_args_t = ttg::meta::drop_void_t<gross_func_args_t>;
   constexpr auto num_args = std::tuple_size_v<func_args_t>;
@@ -637,15 +651,6 @@ auto make_tt(funcT &&func, const std::tuple<ttg::Edge<keyT, input_edge_valuesT>.
   return std::make_unique<wrapT>(std::forward<funcT>(func), inedges, outedges, name, innames, outnames);
 }
 
-template <typename keyT = void, typename funcT,
-          typename... input_edge_valuesT, typename... output_edgesT>
-auto make_tt(funcT &&func, const std::tuple<ttg::Edge<keyT, input_edge_valuesT>...> &inedges = std::tuple<>{},
-             const std::tuple<output_edgesT...> &outedges = std::tuple<>{}, const std::string &name = "wrapper",
-             const std::vector<std::string> &innames = std::vector<std::string>(sizeof...(input_edge_valuesT), "input"),
-             const std::vector<std::string> &outnames = std::vector<std::string>(sizeof...(output_edgesT), "output")) {
-  return make_tt<ttg::ExecutionSpace::Host, keyT>(std::forward<funcT>(func), inedges, outedges, name, innames, outnames);
-}
-
 template <typename keyT, typename funcT, typename... input_valuesT, typename... output_edgesT>
 [[deprecated("use make_tt_tpl instead")]] inline auto wrapt(
     funcT &&func, const std::tuple<ttg::Edge<keyT, input_valuesT>...> &inedges,
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 0a0ddefcb..0c2be52ad 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -1397,7 +1397,7 @@ namespace ttg_parsec {
 
       task_t *task = (task_t*)gpu_task->ec;
       // get the device task from the coroutine handle
-      ttg::device::Task dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address);
+      auto dev_task = ttg::device::detail::device_task_handle_type<Space>::from_address(task->suspended_task_address);
 
       task->dev_ptr->stream = gpu_stream;
 
@@ -1444,7 +1444,7 @@ namespace ttg_parsec {
       int rc = PARSEC_HOOK_RETURN_DONE;
       if (nullptr != task->suspended_task_address) {
         /* Get a new handle for the promise*/
-        dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address);
+        dev_task = ttg::device::detail::device_task_handle_type<Space>::from_address(task->suspended_task_address);
         dev_data = dev_task.promise();
 
         assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL ||
@@ -1593,10 +1593,10 @@ namespace ttg_parsec {
       }
 
       // get the device task from the coroutine handle
-      auto dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address);
+      auto dev_task = ttg::device::detail::device_task_handle_type<Space>::from_address(task->suspended_task_address);
 
       // get the promise which contains the views
-      ttg::device::detail::device_task_promise_type& dev_data = dev_task.promise();
+      ttg::device::detail::device_task_promise_type<Space>& dev_data = dev_task.promise();
 
       /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
       assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER);
@@ -1674,14 +1674,14 @@ namespace ttg_parsec {
 
         if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
           auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numinvals>{});
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals));
         } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, obj->output_terminals));
         } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
           auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numinvals>{});
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(std::move(input), obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(std::move(input), obj->output_terminals));
         } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_values_tuple_type>) {
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(obj->output_terminals));
         } else {
           ttg::abort();
         }
@@ -1692,50 +1692,47 @@ namespace ttg_parsec {
 #ifdef TTG_HAVE_COROUTINE
         assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid);
 
-#ifdef TTG_HAVE_DEVICE
         if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) {
-          ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address);
+          auto coro = ttg::device::detail::device_task_handle_type<Space>::from_address(suspended_task_address);
           assert(detail::parsec_ttg_caller == nullptr);
           detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
           // TODO: unify the outputs tls handling
           auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
           task->tt->set_outputs_tls_ptr();
           coro.resume();
-          if (coro.completed()) {
+          if (coro.done()) {
             coro.destroy();
             suspended_task_address = nullptr;
           }
           task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
           detail::parsec_ttg_caller = nullptr;
-        } else
-#endif  // TTG_HAVE_DEVICE
-      if (task->coroutine_id == ttg::TaskCoroutineID::ResumableTask) {
-        auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address));
-        assert(ret.ready());
-        auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
-        task->tt->set_outputs_tls_ptr();
-        ret.resume();
-        if (ret.completed()) {
-          ret.destroy();
-          suspended_task_address = nullptr;
-        }
-        else { // not yet completed
-          // leave suspended_task_address as is
-
-          // right now can events are not properly implemented, we are only testing the workflow with dummy events
-          // so mark the events finished manually, parsec will rerun this task again and it should complete the second time
-          auto events = static_cast<ttg::resumable_task>(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address)).events();
-          for (auto &event_ptr : events) {
-            event_ptr->finish();
+        } else if (task->coroutine_id == ttg::TaskCoroutineID::ResumableTask) {
+          auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address));
+          assert(ret.ready());
+          auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
+          task->tt->set_outputs_tls_ptr();
+          ret.resume();
+          if (ret.done()) {
+            ret.destroy();
+            suspended_task_address = nullptr;
           }
-          assert(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address).promise().ready());
+          else { // not yet completed
+            // leave suspended_task_address as is
+
+            // right now can events are not properly implemented, we are only testing the workflow with dummy events
+            // so mark the events finished manually, parsec will rerun this task again and it should complete the second time
+            auto events = static_cast<ttg::resumable_task>(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address)).events();
+            for (auto &event_ptr : events) {
+              event_ptr->finish();
+            }
+            assert(ttg::coroutine_handle<ttg::resumable_task_state>::from_address(suspended_task_address).promise().ready());
+          }
+          task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
+          detail::parsec_ttg_caller = nullptr;
+          task->suspended_task_address = suspended_task_address;
         }
-        task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
-        detail::parsec_ttg_caller = nullptr;
-        task->suspended_task_address = suspended_task_address;
-      }
-      else
-        ttg::abort();  // unrecognized task id
+        else
+          ttg::abort();  // unrecognized task id
 #else // TTG_HAVE_COROUTINE
       ttg::abort();  // should not happen
 #endif  // TTG_HAVE_COROUTINE
@@ -1773,9 +1770,9 @@ namespace ttg_parsec {
         assert(detail::parsec_ttg_caller == NULL);
         detail::parsec_ttg_caller = task;
         if constexpr (!ttg::meta::is_void_v<keyT>) {
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(task->key, obj->output_terminals));
         } else if constexpr (ttg::meta::is_void_v<keyT>) {
-          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op<Space>(obj->output_terminals));
+          TTG_PROCESS_TT_OP_RETURN(Space, suspended_task_address, task->coroutine_id, baseobj->template op<Space>(obj->output_terminals));
         } else  // unreachable
           ttg:: abort();
         detail::parsec_ttg_caller = NULL;
@@ -2038,7 +2035,7 @@ namespace ttg_parsec {
       detail::parsec_ttg_caller = parsec_ttg_caller_save;
 
       /* release the dummy task */
-      complete_task_and_release(es, &dummy->parsec_task);
+      complete_task_and_release<ttg::ExecutionSpace::Host>(es, &dummy->parsec_task);
       parsec_thread_mempool_free(mempool, &dummy->parsec_task);
     }
 
@@ -3334,16 +3331,19 @@ namespace ttg_parsec {
       }
     }
 
-    void copy_mark_pushout(detail::ttg_data_copy_t *copy) {
+    void copy_mark_pushout(detail::parsec_ttg_task_base_t *caller, detail::ttg_data_copy_t *copy) {
 
-      assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task);
-      parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task;
+      assert(caller->dev_ptr && caller->dev_ptr->gpu_task);
+      parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
+      if (nullptr == gpu_task->flow[0]) {
+        return; // this task has no flows because we skipped flow creation for host tasks
+      }
       auto check_parsec_data = [&](parsec_data_t* data) {
         if (data->owner_device != 0) {
           /* find the flow */
           int flowidx = 0;
           while (flowidx < MAX_PARAM_COUNT &&
-                gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) {
+                 gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) {
             if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) {
               /* found the right data, set the corresponding flow as pushout */
               break;
@@ -3369,22 +3369,25 @@ namespace ttg_parsec {
 
     /* check whether a data needs to be pushed out */
     template <std::size_t i, typename Value, typename RemoteCheckFn>
-    std::enable_if_t<!std::is_void_v<std::decay_t<Value>>,
-                     void>
+    std::enable_if_t<!std::is_void_v<std::decay_t<Value>>, void>
     do_prepare_send(const Value &value, RemoteCheckFn&& remote_check) {
       using valueT = std::tuple_element_t<i, input_values_full_tuple_type>;
       static constexpr const bool value_is_const = std::is_const_v<valueT>;
 
+      detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
+
+      // host tasks have no dev_ptr
+      if (nullptr == caller->dev_ptr) return;
+
       /* get the copy */
       detail::ttg_data_copy_t *copy;
-      copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+      copy = detail::find_copy_in_task(caller, &value);
 
       /* if there is no copy we don't need to prepare anything */
       if (nullptr == copy) {
         return;
       }
 
-      detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
       bool need_pushout = false;
 
       if (caller->data_flags & detail::ttg_parsec_data_flags::MARKED_PUSHOUT) {
@@ -3396,7 +3399,7 @@ namespace ttg_parsec {
       auto &reducer = std::get<i>(input_reducers);
       if (reducer) {
         /* reductions are currently done only on the host so push out */
-        copy_mark_pushout(copy);
+        copy_mark_pushout(caller, copy);
         caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT;
         return;
       }
@@ -3405,6 +3408,9 @@ namespace ttg_parsec {
         if (caller->data_flags & detail::ttg_parsec_data_flags::IS_MODIFIED) {
           /* The data has been modified previously. PaRSEC requires us to pushout
            * data if we transition from a writer to one or more readers. */
+          /**
+           * TODO: check if there is only one device and avoid the pushout!
+           */
           need_pushout = true;
         }
 
@@ -3454,7 +3460,7 @@ namespace ttg_parsec {
       }
 
       if (need_pushout) {
-        copy_mark_pushout(copy);
+        copy_mark_pushout(caller, copy);
         caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT;
       }
     }
@@ -3726,6 +3732,7 @@ namespace ttg_parsec {
       junk[0]++;
     }
 
+    template<ttg::ExecutionSpace Space>
     static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *parsec_task) {
 
       //std::cout << "complete_task_and_release: task " << parsec_task << std::endl;
@@ -3743,7 +3750,7 @@ namespace ttg_parsec {
           //increment_data_versions(task, std::make_index_sequence<std::tuple_size_v<typename TT::input_values_tuple_type>>{});
 
           // get the device task from the coroutine handle
-          auto dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address);
+          auto dev_task = ttg::device::detail::device_task_handle_type<Space>::from_address(task->suspended_task_address);
 
           // get the promise which contains the views
           auto dev_data = dev_task.promise();
@@ -3841,7 +3848,6 @@ namespace ttg_parsec {
       world_impl.taskpool()->nb_task_classes = std::max(world_impl.taskpool()->nb_task_classes, static_cast<decltype(world_impl.taskpool()->nb_task_classes)>(self.task_class_id+1));
       //    function_id_to_instance[self.task_class_id] = this;
       //self.incarnations = incarnations_array.data();
-//#if 0
       if constexpr (derived_has_cuda_op()) {
         self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
         ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA;
@@ -3850,6 +3856,7 @@ namespace ttg_parsec {
         ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
         ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
         ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+        self.complete_execution = complete_task_and_release<ttg::ExecutionSpace::CUDA>;
       } else if constexpr (derived_has_hip_op()) {
         self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
         ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP;
@@ -3859,6 +3866,7 @@ namespace ttg_parsec {
         ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
         ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
         ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+        self.complete_execution = complete_task_and_release<ttg::ExecutionSpace::HIP>;
 #if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
       } else if constexpr (derived_has_level_zero_op()) {
         self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
@@ -3869,6 +3877,7 @@ namespace ttg_parsec {
         ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
         ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
         ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+        self.complete_execution = complete_task_and_release<ttg::ExecutionSpace::L0>;
 #endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
       } else {
         self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t));
@@ -3878,11 +3887,10 @@ namespace ttg_parsec {
         ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
         ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
         ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+        self.complete_execution = complete_task_and_release<ttg::ExecutionSpace::Host>;
       }
-//#endif // 0
 
       self.release_task = &parsec_release_task_to_mempool_update_nbtasks;
-      self.complete_execution = complete_task_and_release;
 
       for (i = 0; i < MAX_PARAM_COUNT; i++) {
         parsec_flow_t *flow = new parsec_flow_t;
diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h
index 1ca33fe9e..dea32a994 100644
--- a/ttg/ttg/tt.h
+++ b/ttg/ttg/tt.h
@@ -178,31 +178,31 @@ namespace ttg {
 
 #ifndef TTG_PROCESS_TT_OP_RETURN
 #ifdef TTG_HAVE_COROUTINE
-#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke)                                                              \
-  {                                                                                                               \
-    using return_type = decltype(invoke);                                                                         \
-    if constexpr (std::is_same_v<return_type, void>) {                                                            \
-      invoke;                                                                                                     \
-      id = ttg::TaskCoroutineID::Invalid;                                                                         \
-    } else {                                                                                                      \
-      auto coro_return = invoke;                                                                                  \
-      static_assert(std::is_same_v<return_type, void> ||                                                          \
-                    std::is_base_of_v<ttg::coroutine_handle<ttg::resumable_task_state>, decltype(coro_return)>||  \
-                    std::is_base_of_v<ttg::coroutine_handle<ttg::device::detail::device_task_promise_type>,       \
-                                      decltype(coro_return)>);                                                    \
-      if constexpr (std::is_base_of_v<ttg::coroutine_handle<ttg::resumable_task_state>, decltype(coro_return)>)   \
-        id = ttg::TaskCoroutineID::ResumableTask;                                                                 \
-      else if constexpr (std::is_base_of_v<                                                                       \
-                                        ttg::coroutine_handle<ttg::device::detail::device_task_promise_type>,     \
-                                        decltype(coro_return)>)                                                   \
-        id = ttg::TaskCoroutineID::DeviceTask;                                                                    \
-      else                                                                                                        \
-        std::abort();                                                                                             \
-      result = coro_return.address();                                                                             \
-    }                                                                                                             \
+#define TTG_PROCESS_TT_OP_RETURN(Space, result, id, invoke)                                                           \
+  {                                                                                                                   \
+    using return_type = decltype(invoke);                                                                             \
+    if constexpr (std::is_same_v<return_type, void>) {                                                                \
+      invoke;                                                                                                         \
+      id = ttg::TaskCoroutineID::Invalid;                                                                             \
+    } else {                                                                                                          \
+      auto coro_return = invoke;                                                                                      \
+      static_assert(std::is_same_v<return_type, void> ||                                                              \
+                    std::is_base_of_v<ttg::coroutine_handle<ttg::resumable_task_state>, decltype(coro_return)>||      \
+                    std::is_base_of_v<ttg::coroutine_handle<ttg::device::detail::device_task_promise_type<Space>>,    \
+                                      decltype(coro_return)>);                                                        \
+      if constexpr (std::is_base_of_v<ttg::coroutine_handle<ttg::resumable_task_state>, decltype(coro_return)>)       \
+        id = ttg::TaskCoroutineID::ResumableTask;                                                                     \
+      else if constexpr (std::is_base_of_v<                                                                           \
+                                        ttg::coroutine_handle<ttg::device::detail::device_task_promise_type<Space>>,  \
+                                        decltype(coro_return)>)                                                       \
+        id = ttg::TaskCoroutineID::DeviceTask;                                                                        \
+      else                                                                                                            \
+        std::abort();                                                                                                 \
+      result = coro_return.address();                                                                                 \
+    }                                                                                                                 \
   }
 #else
-#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke
+#define TTG_PROCESS_TT_OP_RETURN(Space, result, id, invoke) invoke
 #endif
 #else
 #error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards"