From 42ef7d8399ddc2a5e0a569412816e0b3cc8b8e04 Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Fri, 15 Mar 2024 10:10:12 -0400
Subject: [PATCH 01/28] Initial Code

---
 tests/unit/fibonacci_cuda_kernel.cu |  27 +++++
 tests/unit/fibonacci_cuda_kernel.h  |   4 +
 tests/unit/fibonacci_device.cc      | 158 ++++++++++++++++++++++++++++
 3 files changed, 189 insertions(+)
 create mode 100644 tests/unit/fibonacci_cuda_kernel.cu
 create mode 100644 tests/unit/fibonacci_cuda_kernel.h
 create mode 100644 tests/unit/fibonacci_device.cc
diff --git a/tests/unit/fibonacci_cuda_kernel.cu b/tests/unit/fibonacci_cuda_kernel.cu
new file mode 100644
index 000000000..78b24be8f
--- /dev/null
+++ b/tests/unit/fibonacci_cuda_kernel.cu
@@ -0,0 +1,27 @@
+#include "cuda_kernel.h"
+
+#ifdef TTG_HAVE_CUDA
+
+__global__ void cu_calculate_fibonacci(int64_t* results, std::size_t n) {
+  int tx = threadIdx.x; // Thread index
+
+  if (tx == 0) {
+    int64_t a = 0, b = 1, c;
+    if (n == 0) {
+      results[tx] = a;
+      return;
+    }
+    for (int i = 2; i <= n; i++) {
+      c = a + b;
+      a = b;
+      b = c;
+    }
+    results[tx] = b;
+  }
+}
+
+void calculate_fibonacci(int64_t* results, std::size_t n) {
+  cu_calculate_fibonacci<<<1, 1>>>(results, n); // Adjust <<<1, 1>>> as needed for parallel computation
+}
+
+#endif // TTG_HAVE_CUDA
diff --git a/tests/unit/fibonacci_cuda_kernel.h b/tests/unit/fibonacci_cuda_kernel.h
new file mode 100644
index 000000000..aa71382a7
--- /dev/null
+++ b/tests/unit/fibonacci_cuda_kernel.h
@@ -0,0 +1,4 @@
+#include "ttg/config.h"
+#include <cinttypes>
+
+void calculate_fibonacci(int64_t* result, std::size_t n);
diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
new file mode 100644
index 000000000..e2d4e3913
--- /dev/null
+++ b/tests/unit/fibonacci_device.cc
@@ -0,0 +1,158 @@
+////
+
+#include <ttg.h>
+// Define TTG_USE_CUDA only if CUDA support is desired and available
+#ifdef TTG_USE_CUDA
+#include "cuda_runtime.h"
+#include "cuda_kernel.h"
+#endif
+
+#include "ttg/serialization.h"
+
+// Default to CUDA if available, can be overridden by defining TTG_USE_XXX for other backends
+#define ES ttg::default_execution_space()
+
+struct A : public ttg::TTValue<A> {
+  int64_t value;
+  ttg::Buffer<int64_t> buffer;
+
+  A() : value(0), buffer(&value, 1) {}
+  A(int64_t val) : value(val), buffer(&value, 1) {}
+
+  A(A&& other) = default;
+  A& operator=(A&& other) = default;
+
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg_abort();
+  }
+};
+
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  const int64_t N = 20;
+
+  ttg::Edge<int64_t, A> f2f;
+  ttg::Edge<void, A> f2p;
+
+  auto fib = ttg::make_tt<ES>(
+      [=](int64_t n, A& F_nms) -> ttg::device::Task {
+        if (n <= N) {
+          co_await ttg::device::select(F_nms.buffer);
+
+          int64_t result = calculate_fibonacci(n);
+
+          A F_n(result);
+          if (n < N) {
+            co_await ttg::device::send<0>(n + 1, F_n);
+          } else {
+            co_await ttg::device::sendv<1>(F_n);
+          }
+        }
+      },
+      ttg::edges(f2f),
+      ttg::edges(f2f, f2p),
+      "fib");
+
+  auto print = ttg::make_tt([](A F_N) {
+    std::cout << "The " << N << "th Fibonacci number is " << F_N.value << std::endl;
+  },
+                            ttg::edges(f2p),
+                            ttg::edges(),
+                            "print");
+
+  ttg::make_graph_executable(fib.get());
+  if (ttg::default_execution_context().rank() == 0) fib->invoke(2, A(1));
+
+  ttg::execute(ttg_default_execution_context());
+  ttg::fence(ttg_default_execution_context());
+
+  ttg::finalize();
+  return 0;
+}
+
+//// Created by Hyndavi Venkatreddygari on 3/13/24.
+////
+//#include <ttg.h>
+//#define TTG_USE_CUDA
+//#include "cuda_runtime.h"
+//#include "cuda_kernel.h"
+//
+//#include "ttg/serialization.h"
+//
+//#define ES ttg::ExecutionSpace::CUDA
+//
+//struct A : public ttg::TTValue<A> {
+//  int64_t value;
+//  ttg::Buffer<int64_t> buffer;
+//
+//  A() : value(0), buffer(&value, 1) {}
+//  A(int64_t val) : value(val), buffer(&value, 1) {}
+//
+//  A(A&& other) = default;
+//  A& operator=(A&& other) = default;
+//
+//  template <typename Archive>
+//  void serialize(Archive& ar) {
+//    ttg_abort();
+//  }
+//  template <typename Archive>
+//  void serialize(Archive& ar, const unsigned int) {
+//    ttg_abort();
+//  }
+//};
+//
+//int main(int argc, char* argv[]) {
+//  ttg::initialize(argc, argv, -1);
+//  const int64_t N = 20;
+//
+//  ttg::Edge<int64_t, A> f2f;
+//  ttg::Edge<void, A> f2p;
+//
+//  auto fib = ttg::make_tt<ES>(
+//      [=](int64_t n, A& F_nms) -> ttg::device::Task {
+//        if (n <= N) {
+//          co_await ttg::device::select(F_nms.buffer);
+//
+//          int64_t* d_result;
+//          cudaMalloc(&d_result, sizeof(int64_t));
+//
+//          calculate_fibonacci(d_result, n);
+//
+//          co_await ttg::wait_kernel();
+//
+//          int64_t h_result;
+//          cudaMemcpy(&h_result, d_result, sizeof(int64_t), cudaMemcpyDeviceToHost);
+//
+//          A F_n(h_result);
+//          if (n < N) {
+//            co_await ttg::device::send<0>(n + 1, F_n);
+//          } else {
+//            co_await ttg::device::sendv<1>(F_n);
+//          }
+//        }
+//      },
+//      ttg::edges(f2f),
+//      ttg::edges(f2f, f2p),
+//      "fib");
+//
+//  auto print = ttg::make_tt([](A F_N) {
+//    std::cout << "The " << N << "th Fibonacci number is " << F_N.value << std::endl;
+//  },
+//                            ttg::edges(f2p),
+//                            ttg::edges(),
+//                            "print");
+//
+//  ttg::make_graph_executable(fib.get());
+//  if (ttg::default_execution_context().rank() == 0) fib->invoke(2, A(1));
+//
+//  ttg::execute(ttg_default_execution_context());
+//  ttg::fence(ttg_default_execution_context());
+//
+//  ttg::finalize();
+//  return 0;
+//}

From 4d525efbd7acbd4082b15468c3351af3b2023362 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 20 Mar 2024 09:44:52 -0400
Subject: [PATCH 02/28] updated the Fibonacci device example to be roughly
 correct :)

---
 tests/unit/CMakeLists.txt           | 15 +++++---
 tests/unit/fibonacci_cuda_kernel.cu | 26 ++++----------
 tests/unit/fibonacci_cuda_kernel.h  |  2 +-
 tests/unit/fibonacci_device.cc      | 54 +++++++++++++++--------------
 4 files changed, 46 insertions(+), 51 deletions(-)

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 275f3fdd8..3f1a62be7 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -16,6 +16,12 @@ list(APPEND ut_src fibonacci-coro.cc)
 list(APPEND ut_src device_coro.cc)
 if (TTG_HAVE_CUDA)
     list(APPEND ut_src cuda_kernel.cu)
+    # fibonacci device example
+    list(APPEND ut_src
+            fibonacci_device.cc
+            fibonacci_cuda_kernel.h
+            fibonacci_cuda_kernel.cc
+    )
 endif(TTG_HAVE_CUDA)
 list(APPEND ut_libs std::coroutine)
 
@@ -25,11 +31,10 @@ add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" CO
 add_ttg_executable(serialization serialization.cc unit_main.cpp
                    LINK_LIBRARIES Catch2::Catch2 ttg-serialization  $<TARGET_NAME_IF_EXISTS:BTAS::BTAS>
                    COMPILE_DEFINITIONS $<$<TARGET_EXISTS:BTAS::BTAS>:TTG_HAS_BTAS=1>)
-#target_link_libraries(serialization "Catch2::Catch2;ttg-serialization")
-#if (TARGET BTAS::BTAS)
-#    target_link_libraries(serialization BTAS::BTAS)
-#    target_compile_definitions(serialization PRIVATE TTG_HAS_BTAS=1)
-#endif (TARGET BTAS::BTAS)
+
+# Boost serialization test: checks low-level codegen
+add_ttg_executable(serialization_boost serialization_boost.cc
+        LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec")
 
 # TODO: convert into unit test
 #if (TARGET MADworld)
diff --git a/tests/unit/fibonacci_cuda_kernel.cu b/tests/unit/fibonacci_cuda_kernel.cu
index 78b24be8f..6fa316468 100644
--- a/tests/unit/fibonacci_cuda_kernel.cu
+++ b/tests/unit/fibonacci_cuda_kernel.cu
@@ -1,27 +1,15 @@
-#include "cuda_kernel.h"
+#include "fibonacci_cuda_kernel.h"
 
 #ifdef TTG_HAVE_CUDA
 
-__global__ void cu_calculate_fibonacci(int64_t* results, std::size_t n) {
-  int tx = threadIdx.x; // Thread index
-
-  if (tx == 0) {
-    int64_t a = 0, b = 1, c;
-    if (n == 0) {
-      results[tx] = a;
-      return;
-    }
-    for (int i = 2; i <= n; i++) {
-      c = a + b;
-      a = b;
-      b = c;
-    }
-    results[tx] = b;
-  }
+__global__ void cu_next_value(int64_t* fn_and_fnm1) {
+  int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
+  fn_and_fnm1[1] = fn_and_fnm1[0];
+  fn_and_fnm1[0] = fnp1;
 }
 
-void calculate_fibonacci(int64_t* results, std::size_t n) {
-  cu_calculate_fibonacci<<<1, 1>>>(results, n); // Adjust <<<1, 1>>> as needed for parallel computation
+void next_value(int64_t* fn_and_fnm1) {
+  cu_next_value<<<1, 1>>>(fn_and_fnm1);
 }
 
 #endif // TTG_HAVE_CUDA
diff --git a/tests/unit/fibonacci_cuda_kernel.h b/tests/unit/fibonacci_cuda_kernel.h
index aa71382a7..a096ec3f1 100644
--- a/tests/unit/fibonacci_cuda_kernel.h
+++ b/tests/unit/fibonacci_cuda_kernel.h
@@ -1,4 +1,4 @@
 #include "ttg/config.h"
 #include <cinttypes>
 
-void calculate_fibonacci(int64_t* result, std::size_t n);
+void next_value(int64_t* fn_and_fnm1);
\ No newline at end of file
diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
index e2d4e3913..4b915f537 100644
--- a/tests/unit/fibonacci_device.cc
+++ b/tests/unit/fibonacci_device.cc
@@ -4,7 +4,7 @@
 // Define TTG_USE_CUDA only if CUDA support is desired and available
 #ifdef TTG_USE_CUDA
 #include "cuda_runtime.h"
-#include "cuda_kernel.h"
+#include "fibonacci_cuda_kernel.h"
 #endif
 
 #include "ttg/serialization.h"
@@ -12,15 +12,15 @@
 // Default to CUDA if available, can be overridden by defining TTG_USE_XXX for other backends
 #define ES ttg::default_execution_space()
 
-struct A : public ttg::TTValue<A> {
-  int64_t value;
-  ttg::Buffer<int64_t> buffer;
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn : public ttg::TTValue<Fn> {
+  int64_t F[2] = {1, 0};  // F[0] = F_n, F[1] = F_{n-1}
+  ttg::Buffer<int64_t> b;
 
-  A() : value(0), buffer(&value, 1) {}
-  A(int64_t val) : value(val), buffer(&value, 1) {}
+  Fn() : b(&F[0], 2) {}
 
-  A(A&& other) = default;
-  A& operator=(A&& other) = default;
+  Fn(Fn&& other) = default;
+  Fn& operator=(Fn&& other) = default;
 
   template <typename Archive>
   void serialize(Archive& ar) {
@@ -34,39 +34,41 @@ struct A : public ttg::TTValue<A> {
 
 int main(int argc, char* argv[]) {
   ttg::initialize(argc, argv, -1);
-  const int64_t N = 20;
+  const int64_t F_n_max = 1000;
 
-  ttg::Edge<int64_t, A> f2f;
-  ttg::Edge<void, A> f2p;
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
 
   auto fib = ttg::make_tt<ES>(
-      [=](int64_t n, A& F_nms) -> ttg::device::Task {
-        if (n <= N) {
-          co_await ttg::device::select(F_nms.buffer);
-
-          int64_t result = calculate_fibonacci(n);
-
-          A F_n(result);
-          if (n < N) {
-            co_await ttg::device::send<0>(n + 1, F_n);
-          } else {
-            co_await ttg::device::sendv<1>(F_n);
-          }
+      [=](int64_t n, Fn& f_n) -> ttg::device::Task {
+        assert(n > 0);
+
+        co_await ttg::device::select(f_n.b);
+
+        next_value(f_n.b.current_device_ptr());
+
+        // wait for the task to complete and the values to be brought back to the host
+        co_await ttg::device::wait(f_n.b);
+
+        if (f_n.F[0] < F_n_max) {
+          co_await ttg::device::forward(ttg::device::send<0>(n + 1, f_n));
+        } else {
+          co_await ttg::device::forward(ttg::device::sendv<1>(f_n));
         }
       },
       ttg::edges(f2f),
       ttg::edges(f2f, f2p),
       "fib");
 
-  auto print = ttg::make_tt([](A F_N) {
-    std::cout << "The " << N << "th Fibonacci number is " << F_N.value << std::endl;
+  auto print = ttg::make_tt([](Fn f_n) {
+    std::cout << "The largest Fibonacci number smaller than" << F_n_max << " is " << f_n.F[1] << std::endl;
   },
                             ttg::edges(f2p),
                             ttg::edges(),
                             "print");
 
   ttg::make_graph_executable(fib.get());
-  if (ttg::default_execution_context().rank() == 0) fib->invoke(2, A(1));
+  if (ttg::default_execution_context().rank() == 0) fib->invoke(1, Fn{});
 
   ttg::execute(ttg_default_execution_context());
   ttg::fence(ttg_default_execution_context());

From 2956f1f70ac6275c54831da0cf2f8a8a0c11f3d1 Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Tue, 26 Mar 2024 19:27:53 -0400
Subject: [PATCH 03/28] file extension change

---
 tests/unit/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 3f1a62be7..7b1417d8c 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -20,7 +20,7 @@ if (TTG_HAVE_CUDA)
     list(APPEND ut_src
             fibonacci_device.cc
             fibonacci_cuda_kernel.h
-            fibonacci_cuda_kernel.cc
+            fibonacci_cuda_kernel.cu
     )
 endif(TTG_HAVE_CUDA)
 list(APPEND ut_libs std::coroutine)

From 1629d6a5ef36a6bce4229252422cd5de3f5c87f9 Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Fri, 5 Apr 2024 13:19:37 -0400
Subject: [PATCH 04/28] code cleanup

---
 tests/unit/fibonacci_device.cc | 143 +++++++--------------------------
 1 file changed, 29 insertions(+), 114 deletions(-)

diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
index 4b915f537..4a052eb18 100644
--- a/tests/unit/fibonacci_device.cc
+++ b/tests/unit/fibonacci_device.cc
@@ -1,17 +1,16 @@
-////
-
 #include <ttg.h>
-// Define TTG_USE_CUDA only if CUDA support is desired and available
-#ifdef TTG_USE_CUDA
+
+#if defined(TTG_HAVE_CUDA)
+#define ES ttg::ExecutionSpace::CUDA
 #include "cuda_runtime.h"
 #include "fibonacci_cuda_kernel.h"
+#else
+#error " CUDA  is required to build this test!"
 #endif
 
 #include "ttg/serialization.h"
 
-// Default to CUDA if available, can be overridden by defining TTG_USE_XXX for other backends
-#define ES ttg::default_execution_space()
-
+const int64_t F_n_max = 1000;
 /// N.B. contains values of F_n and F_{n-1}
 struct Fn : public ttg::TTValue<Fn> {
   int64_t F[2] = {1, 0};  // F[0] = F_n, F[1] = F_{n-1}
@@ -19,28 +18,25 @@ struct Fn : public ttg::TTValue<Fn> {
 
   Fn() : b(&F[0], 2) {}
 
+  Fn(const Fn&) = delete;
   Fn(Fn&& other) = default;
+  Fn& operator=(const Fn& other) = delete;
   Fn& operator=(Fn&& other) = default;
 
   template <typename Archive>
   void serialize(Archive& ar) {
-    ttg_abort();
+    ttg::ttg_abort();
   }
   template <typename Archive>
   void serialize(Archive& ar, const unsigned int) {
-    ttg_abort();
+    ttg::ttg_abort();
   }
 };
-
-int main(int argc, char* argv[]) {
-  ttg::initialize(argc, argv, -1);
-  const int64_t F_n_max = 1000;
-
-  ttg::Edge<int64_t, Fn> f2f;
-  ttg::Edge<void, Fn> f2p;
-
-  auto fib = ttg::make_tt<ES>(
-      [=](int64_t n, Fn& f_n) -> ttg::device::Task {
+extern ttg::Edge<int64_t, Fn> f2f;
+extern ttg::Edge<void, Fn> f2p;
+auto create_fib_task() {
+  return ttg::make_tt<ES>(
+      [=](int64_t n, Fn&& f_n) -> ttg::device::Task {
         assert(n > 0);
 
         co_await ttg::device::select(f_n.b);
@@ -51,110 +47,29 @@ int main(int argc, char* argv[]) {
         co_await ttg::device::wait(f_n.b);
 
         if (f_n.F[0] < F_n_max) {
-          co_await ttg::device::forward(ttg::device::send<0>(n + 1, f_n));
+          co_await ttg::device::forward(ttg::device::send<0>(n + 1, std::move(f_n)));
         } else {
-          co_await ttg::device::forward(ttg::device::sendv<1>(f_n));
+          co_await ttg::device::forward(ttg::device::sendv<1>(std::move(f_n)));
         }
       },
-      ttg::edges(f2f),
-      ttg::edges(f2f, f2p),
-      "fib");
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+  auto print = ttg::make_tt(
+      [](Fn f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+}
 
-  auto print = ttg::make_tt([](Fn f_n) {
-    std::cout << "The largest Fibonacci number smaller than" << F_n_max << " is " << f_n.F[1] << std::endl;
-  },
-                            ttg::edges(f2p),
-                            ttg::edges(),
-                            "print");
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  auto fib = create_fib_task();
 
   ttg::make_graph_executable(fib.get());
   if (ttg::default_execution_context().rank() == 0) fib->invoke(1, Fn{});
 
-  ttg::execute(ttg_default_execution_context());
-  ttg::fence(ttg_default_execution_context());
+  ttg::execute(ttg::ttg_default_execution_context());
+  ttg::fence(ttg::ttg_default_execution_context());
 
   ttg::finalize();
   return 0;
 }
-
-//// Created by Hyndavi Venkatreddygari on 3/13/24.
-////
-//#include <ttg.h>
-//#define TTG_USE_CUDA
-//#include "cuda_runtime.h"
-//#include "cuda_kernel.h"
-//
-//#include "ttg/serialization.h"
-//
-//#define ES ttg::ExecutionSpace::CUDA
-//
-//struct A : public ttg::TTValue<A> {
-//  int64_t value;
-//  ttg::Buffer<int64_t> buffer;
-//
-//  A() : value(0), buffer(&value, 1) {}
-//  A(int64_t val) : value(val), buffer(&value, 1) {}
-//
-//  A(A&& other) = default;
-//  A& operator=(A&& other) = default;
-//
-//  template <typename Archive>
-//  void serialize(Archive& ar) {
-//    ttg_abort();
-//  }
-//  template <typename Archive>
-//  void serialize(Archive& ar, const unsigned int) {
-//    ttg_abort();
-//  }
-//};
-//
-//int main(int argc, char* argv[]) {
-//  ttg::initialize(argc, argv, -1);
-//  const int64_t N = 20;
-//
-//  ttg::Edge<int64_t, A> f2f;
-//  ttg::Edge<void, A> f2p;
-//
-//  auto fib = ttg::make_tt<ES>(
-//      [=](int64_t n, A& F_nms) -> ttg::device::Task {
-//        if (n <= N) {
-//          co_await ttg::device::select(F_nms.buffer);
-//
-//          int64_t* d_result;
-//          cudaMalloc(&d_result, sizeof(int64_t));
-//
-//          calculate_fibonacci(d_result, n);
-//
-//          co_await ttg::wait_kernel();
-//
-//          int64_t h_result;
-//          cudaMemcpy(&h_result, d_result, sizeof(int64_t), cudaMemcpyDeviceToHost);
-//
-//          A F_n(h_result);
-//          if (n < N) {
-//            co_await ttg::device::send<0>(n + 1, F_n);
-//          } else {
-//            co_await ttg::device::sendv<1>(F_n);
-//          }
-//        }
-//      },
-//      ttg::edges(f2f),
-//      ttg::edges(f2f, f2p),
-//      "fib");
-//
-//  auto print = ttg::make_tt([](A F_N) {
-//    std::cout << "The " << N << "th Fibonacci number is " << F_N.value << std::endl;
-//  },
-//                            ttg::edges(f2p),
-//                            ttg::edges(),
-//                            "print");
-//
-//  ttg::make_graph_executable(fib.get());
-//  if (ttg::default_execution_context().rank() == 0) fib->invoke(2, A(1));
-//
-//  ttg::execute(ttg_default_execution_context());
-//  ttg::fence(ttg_default_execution_context());
-//
-//  ttg::finalize();
-//  return 0;
-//}

From ba177457e02a42b48e36e9cf89228e294b8a4463 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 5 Apr 2024 17:37:25 -0400
Subject: [PATCH 05/28] remove unneeded rangeT template parameter of range-free
 versions of prepare_send functions

---
 ttg/ttg/terminal.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h
index dc9f4b08b..0a6e5aabf 100644
--- a/ttg/ttg/terminal.h
+++ b/ttg/ttg/terminal.h
@@ -317,16 +317,11 @@ namespace ttg {
       }
     }
 
-    template <typename rangeT, typename Value>
+    template <typename Value>
     void prepare_send(Value &&value) {
       const Value &v = value;
       if (prepare_send_callback) {
-        if constexpr (ttg::meta::is_iterable_v<rangeT>) {
-          prepare_send_callback(v);
-        } else {
-          /* got something we cannot iterate over (single element?) so put one element in the span */
           prepare_send_callback(v);
-        }
       }
     }
   };
@@ -579,7 +574,7 @@ namespace ttg {
       }
     }
 
-    template <typename rangeT, typename Key = keyT, typename Value = valueT>
+    template <typename Key = keyT, typename Value = valueT>
     std::enable_if_t<meta::is_none_void_v<Key> && !meta::is_void_v<valueT>, void>
     prepare_send(const Value &value) {
       for (auto &&successor : this->successors()) {

From 8d3246b604d8989e81f2773d5efbfcca57d846a1 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 5 Apr 2024 17:39:51 -0400
Subject: [PATCH 06/28] added the serialization_boost unit test

---
 tests/unit/serialization_boost.cc | 64 +++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 tests/unit/serialization_boost.cc

diff --git a/tests/unit/serialization_boost.cc b/tests/unit/serialization_boost.cc
new file mode 100644
index 000000000..954a8cc46
--- /dev/null
+++ b/tests/unit/serialization_boost.cc
@@ -0,0 +1,64 @@
+//
+// Created by Eduard Valeyev on 2/27/24.
+//
+
+#include "ttg/serialization.h"
+
+#include "ttg/util/meta.h"
+
+#include "ttg/serialization/data_descriptor.h"
+
+struct pod {
+  double a;
+  int b;
+  float c[3];
+  friend bool operator==(const pod& lhs, const pod& rhs) {
+    return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c[0] == rhs.c[0] && lhs.c[1] == rhs.c[1] && lhs.c[2] == rhs.c[2];
+  }
+};
+
+BOOST_CLASS_IMPLEMENTATION(pod, primitive_type)
+BOOST_IS_BITWISE_SERIALIZABLE(pod)
+
+#include "ttg/serialization/std/vector.h"
+#include "ttg/serialization/std/array.h"
+
+static_assert(ttg::detail::is_memcpyable_v<pod>);
+static_assert(ttg::detail::is_boost_buffer_serializable_v<std::vector<pod>>);
+
+template <typename T>
+void save_to_buffer(const T& t, char* buffer, std::size_t buffer_size) {
+  ttg::detail::byte_ostreambuf oabuf(buffer, buffer_size);
+  ttg::detail::boost_byte_oarchive oa(oabuf);
+  oa << t;
+}
+
+int main() {
+
+  std::array<char, 32768> buf;
+
+  constexpr auto N = 10;
+  pod x{1., 2, {3., 4., 5.}};
+  std::vector<pod> vx(N,x);
+  std::array<pod, 5> ax{{x, x, x, x, x}};
+
+//  const ttg_data_descriptor* pod_dd = ttg::get_data_descriptor<pod>();
+//  auto x_size = pod_dd->payload_size(&x);
+
+  auto vx_size = ttg::default_data_descriptor<decltype(vx)>::pack_payload(&vx, size(buf), 0, data(buf));
+  auto ax_size = ttg::default_data_descriptor<decltype(ax)>::pack_payload(&ax, size(buf)-vx_size, vx_size, data(buf));
+
+  decltype(vx) vx_copy;
+  decltype(ax) ax_copy;
+  auto vx_copy_size = ttg::default_data_descriptor<decltype(vx)>::unpack_payload(&vx_copy, size(buf), 0, data(buf));
+  assert(vx_copy == vx);
+  ttg::default_data_descriptor<decltype(ax)>::unpack_payload(&ax_copy, size(buf)-vx_copy_size, vx_copy_size, data(buf));
+  assert(ax_copy == ax);
+
+//  constexpr std::size_t buffer_size = 4096;
+//  char buffer[buffer_size];
+//  save_to_buffer(vx, buffer, buffer_size);
+//  save_to_buffer(ax, buffer, buffer_size);
+
+  return 0;
+}
\ No newline at end of file

From 978ce532414481eff3a5d3988b6db9a14a337d8e Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sat, 6 Apr 2024 15:16:03 -0400
Subject: [PATCH 07/28] partial fixes for flow of move-only types in parsec
 device backend

---
 ttg/ttg/terminal.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h
index 0a6e5aabf..c6d293088 100644
--- a/ttg/ttg/terminal.h
+++ b/ttg/ttg/terminal.h
@@ -304,7 +304,7 @@ namespace ttg {
 
     template <typename rangeT, typename Value>
     void prepare_send(const rangeT &keylist, Value &&value) {
-      const Value &v = value;
+      const std::remove_reference_t<Value> &v = value;
       if (prepare_send_callback) {
         if constexpr (ttg::meta::is_iterable_v<rangeT>) {
           prepare_send_callback(ttg::span<const keyT>(&(*std::begin(keylist)),
@@ -319,7 +319,7 @@ namespace ttg {
 
     template <typename Value>
     void prepare_send(Value &&value) {
-      const Value &v = value;
+      const std::remove_reference_t<Value> &v = value;
       if (prepare_send_callback) {
           prepare_send_callback(v);
       }
@@ -575,7 +575,7 @@ namespace ttg {
     }
 
     template <typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_none_void_v<Key> && !meta::is_void_v<valueT>, void>
+    std::enable_if_t<meta::is_void_v<Key> && !meta::is_void_v<valueT>, void>
     prepare_send(const Value &value) {
       for (auto &&successor : this->successors()) {
         assert(successor->get_type() != TerminalBase::Type::Write);

From 149f33bacaf78bec0b540c8de1446f6b957cef10 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 7 Apr 2024 21:41:02 -0400
Subject: [PATCH 08/28] more fixes for flow of move-only types in parsec device
 backend

---
 ttg/ttg/parsec/ttg.h | 95 +++++++++++++++++++++++++++++++-------------
 1 file changed, 68 insertions(+), 27 deletions(-)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index f4653f962..2f3adb3ae 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -118,6 +118,8 @@
 
 #include "ttg/device/device.h"
 
+#include <boost/type_index.hpp>
+
 #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES
 
 /* PaRSEC function declarations */
@@ -960,28 +962,34 @@ namespace ttg_parsec {
       }
 
       if (NULL == copy_res) {
-        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->get_ptr()));
-        if (replace && nullptr != copy_in->get_next_task()) {
-          /* replace the task that was deferred */
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task();
-          new_copy->mark_mutable();
-          /* replace the copy in the deferred task */
-          for (int i = 0; i < deferred_op->data_count; ++i) {
-            if (deferred_op->copies[i] == copy_in) {
-              deferred_op->copies[i] = new_copy;
-              break;
-            }
-          }
-          copy_in->set_next_task(nullptr);
-          deferred_op->release_task();
-          copy_in->reset_readers();            // set the copy back to being read-only
-          copy_in->increment_readers<false>(); // register as reader
-          copy_res = copy_in;                  // return the copy we were passed
-        } else {
-          if (!readonly) {
+        // can only make a copy if Value is copy-constructible ... so this codepath should never be hit
+        if constexpr (std::is_copy_constructible_v<std::decay_t<Value>>) {
+          ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->get_ptr()));
+          if (replace && nullptr != copy_in->get_next_task()) {
+            /* replace the task that was deferred */
+            parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task();
             new_copy->mark_mutable();
+            /* replace the copy in the deferred task */
+            for (int i = 0; i < deferred_op->data_count; ++i) {
+              if (deferred_op->copies[i] == copy_in) {
+                deferred_op->copies[i] = new_copy;
+                break;
+              }
+            }
+            copy_in->set_next_task(nullptr);
+            deferred_op->release_task();
+            copy_in->reset_readers();            // set the copy back to being read-only
+            copy_in->increment_readers<false>(); // register as reader
+            copy_res = copy_in;                  // return the copy we were passed
+          } else {
+            if (!readonly) {
+              new_copy->mark_mutable();
+            }
+            copy_res = new_copy;  // return the new copy
           }
-          copy_res = new_copy;  // return the new copy
+        }
+        else {
+          throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + boost::typeindex::type_id<std::decay_t<Value>>().pretty_name() + " but the type is not copyable");
         }
       }
       return copy_res;
@@ -1933,7 +1941,19 @@ ttg::abort();  // should not happen
       /* iterate over the keys and have them use the copy we made */
       parsec_task_t *task_ring = nullptr;
       for (auto &&key : keylist) {
-        set_arg_local_impl<i>(key, *reinterpret_cast<valueT *>(copy->get_ptr()), copy, &task_ring);
+        // copy-constructible? can broadcast to any number of keys
+        if constexpr (std::is_copy_constructible_v<valueT>) {
+          set_arg_local_impl<i>(key, *reinterpret_cast<valueT *>(copy->get_ptr()), copy, &task_ring);
+        }
+        else {
+          // not copy-constructible? can move, but only to single key
+          static_assert(!std::is_reference_v<valueT>);
+          if (std::size(keylist) == 1)
+            set_arg_local_impl<i>(key, std::move(*reinterpret_cast<valueT *>(copy->get_ptr())), copy, &task_ring);
+          else {
+            throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + boost::typeindex::type_id<std::decay_t<valueT>>().pretty_name() + " but the type is not copyable");
+          }
+        }
       }
 
       if (nullptr != task_ring) {
@@ -3429,10 +3449,20 @@ ttg::abort();  // should not happen
           set_arg<i, keyT, valueT>(key, std::forward<valueT>(value));
         };
         auto send_callback = [this](const keyT &key, const valueT &value) {
-          set_arg<i, keyT, const valueT &>(key, value);
+          if constexpr (std::is_copy_constructible_v<valueT>) {
+            set_arg<i, keyT, const valueT &>(key, value);
+          }
+          else {
+            throw std::logic_error(std::string("TTG::PaRSEC: send_callback is invoked on datum of type ") + boost::typeindex::type_id<valueT>().pretty_name() + " which is not copy constructible, std::move datum into send statement");
+          }
         };
         auto broadcast_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
+          if constexpr (std::is_copy_constructible_v<valueT>) {
             broadcast_arg<i, keyT, valueT>(keylist, value);
+          }
+          else {
+            throw std::logic_error(std::string("TTG::PaRSEC: broadcast_callback is invoked on datum of type ") + boost::typeindex::type_id<valueT>().pretty_name() + " which is not copy constructible, broadcast is not possible with move-only type");
+          }
         };
         auto prepare_send_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
             prepare_send<i, keyT, valueT>(keylist, value);
@@ -3460,7 +3490,14 @@ ttg::abort();  // should not happen
       //////////////////////////////////////////////////////////////////
       else if constexpr (ttg::meta::is_void_v<keyT> && !std::is_void_v<valueT>) {
         auto move_callback = [this](valueT &&value) { set_arg<i, keyT, valueT>(std::forward<valueT>(value)); };
-        auto send_callback = [this](const valueT &value) { set_arg<i, keyT, const valueT &>(value); };
+        auto send_callback = [this](const valueT &value) {
+          if constexpr (std::is_copy_constructible_v<valueT>) {
+            set_arg<i, keyT, const valueT &>(value);
+          }
+          else {
+            throw std::logic_error(std::string("TTG::PaRSEC: send_callback is invoked on datum of type ") + boost::typeindex::type_id<valueT>().pretty_name() + " which is not copy constructible, std::move datum into send/broadcast statement");
+          }
+        };
         auto setsize_callback = [this](std::size_t size) { set_argstream_size<i>(size); };
         auto finalize_callback = [this]() { finalize_argstream<i>(); };
         auto prepare_send_callback = [this](const valueT &value) {
@@ -4238,8 +4275,9 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
   }
 
   template <typename Value>
-  inline std::add_lvalue_reference_t<Value> operator()(Value &&value) {
-    static_assert(std::is_rvalue_reference_v<decltype(value)> ||
+  inline std::conditional_t<std::is_reference_v<Value>,Value,Value&&> operator()(Value &&value) {
+    constexpr auto value_is_rvref = std::is_rvalue_reference_v<decltype(value)>;
+    static_assert(value_is_rvref ||
                   std::is_copy_constructible_v<std::decay_t<Value>>,
                   "Data sent without being moved must be copy-constructible!");
 
@@ -4262,7 +4300,7 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
       value_ptr = reinterpret_cast<value_type *>(copy->get_ptr());
       copy_to_remove = copy;
     } else {
-      if constexpr (std::is_rvalue_reference_v<decltype(value)>) {
+      if constexpr (value_is_rvref) {
         /* this copy won't be modified anymore so mark it as read-only */
         copy->reset_readers();
       }
@@ -4272,7 +4310,10 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
     /* We're coming from a writer so mark the data as modified.
      * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC). */
     caller->data_flags = ttg_parsec::detail::ttg_parsec_data_flags::IS_MODIFIED;
-    return *value_ptr;
+    if constexpr (value_is_rvref)
+      return std::move(*value_ptr);
+    else
+      return *value_ptr;
   }
 
   template<typename Value>

From 66ab8ee4e367bce7b6e0b947ba7c94dcdf73f701 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 8 Apr 2024 16:55:37 -0400
Subject: [PATCH 09/28] device fib example is a standalone executable, takes N
 from command line (default = 1000). Does not seem to invoke the kernel

---
 tests/unit/CMakeLists.txt      | 14 ++++++++------
 tests/unit/fibonacci_device.cc | 27 ++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 7b1417d8c..19ff9c8dd 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -16,12 +16,6 @@ list(APPEND ut_src fibonacci-coro.cc)
 list(APPEND ut_src device_coro.cc)
 if (TTG_HAVE_CUDA)
     list(APPEND ut_src cuda_kernel.cu)
-    # fibonacci device example
-    list(APPEND ut_src
-            fibonacci_device.cc
-            fibonacci_cuda_kernel.h
-            fibonacci_cuda_kernel.cu
-    )
 endif(TTG_HAVE_CUDA)
 list(APPEND ut_libs std::coroutine)
 
@@ -36,6 +30,14 @@ add_ttg_executable(serialization serialization.cc unit_main.cpp
 add_ttg_executable(serialization_boost serialization_boost.cc
         LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec")
 
+# Fib device test
+if (TTG_HAVE_CUDA)
+       add_ttg_executable(fibonacci_device fibonacci_device.cc
+            fibonacci_cuda_kernel.h
+            fibonacci_cuda_kernel.cu
+        LINK_LIBRARIES std::coroutine RUNTIMES "parsec")
+endif()
+
 # TODO: convert into unit test
 #if (TARGET MADworld)
 #add_executable(splitmd_serialization splitmd_serialization.cc unit_main.cpp)
diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
index 4a052eb18..0c9444b8d 100644
--- a/tests/unit/fibonacci_device.cc
+++ b/tests/unit/fibonacci_device.cc
@@ -32,12 +32,15 @@ struct Fn : public ttg::TTValue<Fn> {
     ttg::ttg_abort();
   }
 };
-extern ttg::Edge<int64_t, Fn> f2f;
-extern ttg::Edge<void, Fn> f2p;
-auto create_fib_task() {
-  return ttg::make_tt<ES>(
+
+auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt<ES>(
       [=](int64_t n, Fn&& f_n) -> ttg::device::Task {
         assert(n > 0);
+        ttg::trace("in fib: n=", n, " F_n=", f_n.F[0]);
 
         co_await ttg::device::select(f_n.b);
 
@@ -54,18 +57,28 @@ auto create_fib_task() {
       },
       ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
   auto print = ttg::make_tt(
-      [](Fn f_n) {
+      [=](Fn f_n) {
         std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
       },
       ttg::edges(f2p), ttg::edges(), "print");
+
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<::ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
 }
 
 int main(int argc, char* argv[]) {
   ttg::initialize(argc, argv, -1);
-  auto fib = create_fib_task();
+  ttg::trace_on();
+  int64_t N = 1000;
+  if (argc > 1) N = std::atol(argv[1]);
+  auto fib = make_ttg_fib_lt(N);  // computes largest F_n < N
 
   ttg::make_graph_executable(fib.get());
-  if (ttg::default_execution_context().rank() == 0) fib->invoke(1, Fn{});
+  if (ttg::default_execution_context().rank() == 0)
+    fib->template in<0>()->send(1, Fn{});;
 
   ttg::execute(ttg::ttg_default_execution_context());
   ttg::fence(ttg::ttg_default_execution_context());

From ca4c976e002d828044a2a528979559985a3e9d25 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 8 Apr 2024 18:22:04 -0400
Subject: [PATCH 10/28] address of Fn::F must be invariant under move ... use
 std::unique_ptr which makes Fn truly move-only. Still fails in last send when
 trying to copy Fn

---
 tests/unit/fibonacci_device.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
index 0c9444b8d..a1603cb58 100644
--- a/tests/unit/fibonacci_device.cc
+++ b/tests/unit/fibonacci_device.cc
@@ -13,10 +13,10 @@
 const int64_t F_n_max = 1000;
 /// N.B. contains values of F_n and F_{n-1}
 struct Fn : public ttg::TTValue<Fn> {
-  int64_t F[2] = {1, 0};  // F[0] = F_n, F[1] = F_{n-1}
+  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
   ttg::Buffer<int64_t> b;
 
-  Fn() : b(&F[0], 2) {}
+  Fn() : F(std::make_unique<int64_t[]>(2)), b(F.get(), 2) { F[0] = 1; F[1] = 0; }
 
   Fn(const Fn&) = delete;
   Fn(Fn&& other) = default;
@@ -57,7 +57,7 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
       },
       ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
   auto print = ttg::make_tt(
-      [=](Fn f_n) {
+      [=](Fn&& f_n) {
         std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
       },
       ttg::edges(f2p), ttg::edges(), "print");

From b9ae7233d07c61ff2b0beee461d957d20f291255 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 9 Apr 2024 09:07:47 -0400
Subject: [PATCH 11/28] cleanup/condense Terminal::send* implementations

---
 ttg/ttg/terminal.h | 83 +++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 37 deletions(-)

diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h
index c6d293088..85c6ca5f2 100644
--- a/ttg/ttg/terminal.h
+++ b/ttg/ttg/terminal.h
@@ -202,16 +202,16 @@ namespace ttg {
     }
 
     template <typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_none_void_v<Key, Value>, void> send(const Key &key, const Value &value) {
-      if (!send_callback) throw std::runtime_error("send callback not initialized");
-      send_callback(key, value);
-    }
-
-    template <typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_none_void_v<Key, Value> && std::is_same_v<Value, std::remove_reference_t<Value>>, void>
+    std::enable_if_t<meta::is_none_void_v<Key, Value>, void>
     send(const Key &key, Value &&value) {
-      if (!move_callback) throw std::runtime_error("move callback not initialized");
-      move_callback(key, std::forward<valueT>(value));
+      constexpr auto value_is_rvref = !std::is_reference_v<Value>;
+      if constexpr (value_is_rvref) {
+        if (!move_callback) throw std::runtime_error("move callback not initialized");
+        move_callback(key, std::move(value));
+      } else {
+        if (!send_callback) throw std::runtime_error("send callback not initialized");
+        send_callback(key, value);
+      }
     }
 
     template <typename Key = keyT>
@@ -221,16 +221,17 @@ namespace ttg {
     }
 
     template <typename Value = valueT>
-    std::enable_if_t<!meta::is_void_v<Value>, void> sendv(const Value &value) {
-      if (!send_callback) throw std::runtime_error("send callback not initialized");
-      send_callback(value);
-    }
-
-    template <typename Value = valueT>
-    std::enable_if_t<!meta::is_void_v<Value> && std::is_same_v<Value, std::remove_reference_t<Value>>, void> sendv(
+    std::enable_if_t<!meta::is_void_v<Value>, void> sendv(
         Value &&value) {
-      if (!move_callback) throw std::runtime_error("move callback not initialized");
-      move_callback(std::forward<valueT>(value));
+      constexpr auto value_is_rvref = !std::is_reference_v<Value>;
+      if constexpr (value_is_rvref) {
+        if (!move_callback) throw std::runtime_error("move callback not initialized");
+        move_callback(std::move(value));
+      }
+      else {
+        if (!send_callback) throw std::runtime_error("send callback not initialized");
+        send_callback(value);
+      }
     }
 
     void send() {
@@ -458,18 +459,6 @@ namespace ttg {
         in->connect_pull(this);
     }
 
-    template<typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_none_void_v<Key,Value>,void> send(const Key &key, const Value &value) {
-      for (auto && successor : this->successors()) {
-        assert(successor->get_type() != TerminalBase::Type::Write);
-        if (successor->get_type() == TerminalBase::Type::Read) {
-          static_cast<In<keyT, std::add_const_t<valueT>> *>(successor)->send(key, value);
-        } else if (successor->get_type() == TerminalBase::Type::Consume) {
-          static_cast<In<keyT, valueT> *>(successor)->send(key, value);
-        }
-      }
-    }
-
     template <typename Key = keyT, typename Value = valueT>
     std::enable_if_t<!meta::is_void_v<Key> && meta::is_void_v<Value>, void> sendk(const Key &key) {
       for (auto &&successor : this->successors()) {
@@ -483,15 +472,30 @@ namespace ttg {
     }
 
     template <typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_void_v<Key> && !meta::is_void_v<Value>, void> sendv(const Value &value) {
-      for (auto &&successor : this->successors()) {
-        assert(successor->get_type() != TerminalBase::Type::Write);
+    std::enable_if_t<meta::is_void_v<Key> && !meta::is_void_v<Value>, void> sendv(Value&& value) {
+      const std::size_t N = this->nsuccessors();
+      TerminalBase *move_successor = nullptr;
+      // send copies to every terminal except the one we will move the results to
+      for (std::size_t i = 0; i != N; ++i) {
+        TerminalBase *successor = this->successors().at(i);
         if (successor->get_type() == TerminalBase::Type::Read) {
-          static_cast<In<keyT, std::add_const_t<valueT>> *>(successor)->sendv(value);
+          // if only have 1 successor forward value even if successor is read-only, so we can deal with move-only types
+          auto* read_successor = static_cast<In<keyT, std::add_const_t<valueT>> *>(successor);
+          if (N != 1)
+            read_successor->sendv(value);
+          else
+            read_successor->sendv(std::forward<Value>(value));
         } else if (successor->get_type() == TerminalBase::Type::Consume) {
-          static_cast<In<keyT, valueT> *>(successor)->sendv(value);
+          if (nullptr == move_successor) {
+            move_successor = successor;
+          } else {
+            static_cast<In<keyT, valueT> *>(successor)->sendv(value);
+          }
         }
       }
+      if (nullptr != move_successor) {
+        static_cast<In<keyT, valueT> *>(move_successor)->sendv(std::forward<Value>(value));
+      }
     }
 
     template <typename Key = keyT, typename Value = valueT>
@@ -512,7 +516,7 @@ namespace ttg {
     }
 
     template <typename Key = keyT, typename Value = valueT>
-    std::enable_if_t<meta::is_none_void_v<Key, Value> && std::is_same_v<Value, std::remove_reference_t<Value>>, void>
+    std::enable_if_t<meta::is_none_void_v<Key, Value>, void>
     send(const Key &key, Value &&value) {
       const std::size_t N = this->nsuccessors();
       TerminalBase *move_successor = nullptr;
@@ -520,7 +524,12 @@ namespace ttg {
       for (std::size_t i = 0; i != N; ++i) {
         TerminalBase *successor = this->successors().at(i);
         if (successor->get_type() == TerminalBase::Type::Read) {
-          static_cast<In<keyT, std::add_const_t<valueT>> *>(successor)->send(key, value);
+          // if only have 1 successor forward value even if successor is read-only, so we can deal with move-only types
+          auto* read_successor = static_cast<In<keyT, std::add_const_t<valueT>> *>(successor);
+          if (N != 1)
+            read_successor->send(key, value);
+          else
+            read_successor->send(key, std::forward<Value>(value));
         } else if (successor->get_type() == TerminalBase::Type::Consume) {
           if (nullptr == move_successor) {
             move_successor = successor;

From d71b88a0d0e6c5d357c1d018b1483ad18fea912f Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Wed, 10 Apr 2024 09:27:44 -0400
Subject: [PATCH 12/28] Readme initial update

---
 README.md | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/README.md b/README.md
index f359dde52..94bbe6d91 100644
--- a/README.md
+++ b/README.md
@@ -225,6 +225,107 @@ separating the downward flow of control (task creation,
 <img src="https://render.githubusercontent.com/render/math?math=F_{n} \to F_{n-1},F_{n-2}">)
 from the upward flow of data (task evaluation,
 <img src="https://render.githubusercontent.com/render/math?math=F_{n-1},F_{n-2} \to F_{n}">).
+## Example : Computing the Largest Fibonacci Number Smaller Than a Given Threshold on the device(GPU)
+
+This example demonstrates how to efficiently compute the largest Fibonacci number smaller than a specified threshold, utilizing GPU acceleration. The Fibonacci sequence is defined by the recurrence relation F(N) = F(N-1) + F(N-2), with F(0) = 0, and F(1) = 1
+
+
+###  CUDA and GPU Acceleration
+In this example, CUDA is used to accelerate the computation of Fibonacci numbers by leveraging the parallel processing capabilities of GPUs. This is achieved through the use of a specific CUDA kernel defined in fibonacci_cuda_kernel.h whose cuda based implementation is described below, which contains the logic for computing the next Fibonacci number in the sequence.
+
+```cpp
+ #include "fibonacci_cuda_kernel.h"
+#ifdef TTG_HAVE_CUDA
+    __global__ void cu_next_value(int64_t* fn_and_fnm1) {
+      int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
+      fn_and_fnm1[1] = fn_and_fnm1[0];
+      fn_and_fnm1[0] = fnp1;
+    }
+    void next_value(int64_t* fn_and_fnm1) {
+      cu_next_value<<<1, 1>>>(fn_and_fnm1);
+    }
+#endif // TTG_HAVE_CUDA
+```
+
+### Struct Fn and Serialization
+
+```cpp
+struct Fn : public ttg::TTValue<Fn> {
+  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
+  ttg::Buffer<int64_t> b;
+  Fn() : F(std::make_unique<int64_t[]>(2)), b(F.get(), 2) { F[0] = 1; F[1] = 0; }
+  Fn(const Fn&) = delete;
+  Fn(Fn&& other) = default;
+  Fn& operator=(const Fn& other) = delete;
+  Fn& operator=(Fn&& other) = default;
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg::ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg::ttg_abort();
+  }
+};
+```
+The Fn struct is integral to our TTG-powered, GPU-accelerated Fibonacci sequence computation, serving dual roles in task management and efficient GPU data handling. It inherits from ttg::TTValue<Fn> for seamless integration with TTG tasks and employs ttg::Buffer<int64_t> for optimal data transfer and access in GPU environments. With smart management of resources — from its constructor initializing Fibonacci number storage and preparing for GPU data transfers, to explicitly deleted copy operations ensuring unique resource ownership and default move semantics for task-based data flow — Fn is finely tuned for high-performance parallel computing. Its serialization methods, currently aborting operations, hint at future extensions for distributed or GPU-accelerated contexts, underscoring the struct's pivotal role in efficient, scalable Fibonacci calculations.
+
+### Task Graph Construction and Execution
+
+The task graph consists of two types of tasks: fib tasks that compute Fibonacci numbers and a print task that outputs the final result. Tasks are connected by edges, which represent the data flow between them. The task graph is explicitly constructed and made executable, then the computation is initiated by sending the first task into the graph.
+
+#### GPU Memory Management and Kernel Execution
+The Fn struct also contains a ttg::Buffer<int64_t> b, which is used for GPU memory management. This buffer manages the memory where the Fibonacci numbers are stored and provides mechanisms to transfer data between the host and the GPU. The next_value function is called to execute the CUDA kernel, which computes the next Fibonacci number and updates the values in the GPU memory. This operation is performed asynchronously, allowing the CPU to continue executing other tasks while the GPU is working
+
+```cpp
+auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt<ES>(
+      [=](int64_t n, Fn&& f_n) -> ttg::device::Task {
+        assert(n > 0);
+        ttg::trace("in fib: n=", n, " F_n=", f_n.F[0]);
+
+        co_await ttg::device::select(f_n.b);
+
+        next_value(f_n.b.current_device_ptr());
+
+        // wait for the task to complete and the values to be brought back to the host
+        co_await ttg::device::wait(f_n.b);
+
+        if (f_n.F[0] < F_n_max) {
+          co_await ttg::device::forward(ttg::device::send<0>(n + 1, std::move(f_n)));
+        } else {
+          co_await ttg::device::forward(ttg::device::sendv<1>(std::move(f_n)));
+        }
+      },
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+  auto print = ttg::make_tt(
+      [=](Fn&& f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<::ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
+}
+```
+#### Asynchronous Task Execution with Co-Routines
+In the make_ttg_fib_lt function, co-routines are used to await the completion of GPU tasks and the transfer of computed values back to the host. This approach enables efficient overlap of computation and communication, reducing the overall execution time.
+
+
+## Comparing _nth Fibonacci_ CPU vs GPU-version
+
+| Concept   | CPU version             | GPU version               |
+|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Execution Environment | Utilizing the TTG library to manage task parallelism within a possibly distributed computing environment but without explicit hardware acceleration             |  It uses the TTG library in conjunction with CUDA to offload computationally intensive tasks to the GPU, thereby achieving significant performance gains through parallel execution|
+| Computational Model  | It relies on the CPU for all computations and data management            |  Computations are offloaded to the GPU, allowing for the parallel computation of Fibonacci numbers, which is particularly beneficial for large sequences due to the GPU's ability to handle many threads simultaneously  |
+| Data Management  | Manages data flow between tasks using TTG edges, with each task operating on standard CPU memory   |Incorporates complex data management strategies to handle memory transfers between host (CPU) and device (GPU) memory spaces |
+| Software Requirements and Portability  | Relies on the TTG library and a standard C++ compiler | Requires a CUDA-enabled environment and a compatible NVIDIA GPU, in addition to the TTG library  |
 
 ## Debugging TTG Programs
 

From 72eea82d696d7e9c6e9197d56fbf6d0f2205b05a Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Wed, 17 Apr 2024 09:55:34 -0400
Subject: [PATCH 13/28] Readme and ReStructured

---
 README.md                                     | 69 +++++++++++++++
 doc/dox/dev/devsamp/main/CMakeLists.txt       |  8 ++
 .../dev/devsamp/main/fibonacci/fibonacci.cc   | 70 +++++++++++++++
 .../main/fibonacci/fibonacci_cuda_kernel.cu   | 15 ++++
 .../main/fibonacci/fibonacci_cuda_kernel.h    |  4 +
 .../main/fibonacci/fibonacci_device.cc        | 88 +++++++++++++++++++
 6 files changed, 254 insertions(+)
 create mode 100644 doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
 create mode 100644 doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu
 create mode 100644 doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h
 create mode 100644 doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc

diff --git a/README.md b/README.md
index 94bbe6d91..a639a5fbb 100644
--- a/README.md
+++ b/README.md
@@ -317,7 +317,74 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
 #### Asynchronous Task Execution with Co-Routines
 In the make_ttg_fib_lt function, co-routines are used to await the completion of GPU tasks and the transfer of computed values back to the host. This approach enables efficient overlap of computation and communication, reducing the overall execution time.
 
+## Example : Computing the Largest Fibonacci Number Smaller Than a Given Threshold on the CPU
+```cpp
+#include <ttg.h>
+#include "ttg/serialization.h"
+
+const int64_t F_n_max = 1000;
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn : public ttg::TTValue<Fn> {
+  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
+  Fn() : F(std::make_unique<int64_t[]>(2)) { F[0] = 1; F[1] = 0; }
+  Fn(const Fn&) = delete;
+  Fn(Fn&& other) = default;
+  Fn& operator=(const Fn& other) = delete;
+  Fn& operator=(Fn&& other) = default;
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg::ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg::ttg_abort();
+  }
+};
+auto make_ttg_fib_lt(const int64_t) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt(
+      [=](int64_t n, Fn&& f_n) {
+        int64_t next_f_n = f_n.F[0] + f_n.F[1];
+        f_n.F[1] = f_n.F[0];
+        f_n.F[0] = next_f_n;
+        if (next_f_n < F_n_max) {
+          ttg::send<0>(n + 1, std::move(f_n));
+        } else {
+          ttg::send<1>(n, std::move(f_n));
+        }
+      },
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+
+  auto print = ttg::make_tt(
+      [=](Fn&& f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
+}
 
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  ttg::trace_on();
+  int64_t N = 1000;
+  if (argc > 1) N = std::atol(argv[1]);
+  auto fib = make_ttg_fib_lt(N);
+  ttg::make_graph_executable(fib.get());
+  if (ttg::default_execution_context().rank() == 0)
+    fib->template in<0>()->send(1, Fn{});;
+  ttg::execute();
+  ttg::fence();
+  ttg::finalize();
+  return 0;
+}
+
+```
 ## Comparing _nth Fibonacci_ CPU vs GPU-version
 
 | Concept   | CPU version             | GPU version               |
@@ -326,6 +393,8 @@ In the make_ttg_fib_lt function, co-routines are used to await the completion of
 | Computational Model  | It relies on the CPU for all computations and data management            |  Computations are offloaded to the GPU, allowing for the parallel computation of Fibonacci numbers, which is particularly beneficial for large sequences due to the GPU's ability to handle many threads simultaneously  |
 | Data Management  | Manages data flow between tasks using TTG edges, with each task operating on standard CPU memory   |Incorporates complex data management strategies to handle memory transfers between host (CPU) and device (GPU) memory spaces |
 | Software Requirements and Portability  | Relies on the TTG library and a standard C++ compiler | Requires a CUDA-enabled environment and a compatible NVIDIA GPU, in addition to the TTG library  |
+| Data Structures |  Utilizes a simple structure Fn containing a std::unique_ptr<int64_t[]> to store the Fibonacci sequence |Similar to the CPU version but includes a ttg::Buffer<int64_t> for GPU memory management |
+|Concurrency | There's no explicit synchronization or concurrency control needed beyond what's handled by the TTG framework |Uses co_await for synchronization, indicating more complex control flow to manage GPU operations efficiently |
 
 ## Debugging TTG Programs
 
diff --git a/doc/dox/dev/devsamp/main/CMakeLists.txt b/doc/dox/dev/devsamp/main/CMakeLists.txt
index 84b2eb865..4979da2ca 100644
--- a/doc/dox/dev/devsamp/main/CMakeLists.txt
+++ b/doc/dox/dev/devsamp/main/CMakeLists.txt
@@ -4,3 +4,11 @@ project(test)
 find_package(ttg REQUIRED)
 
 add_ttg_executable(test test.cpp NOT_EXCLUDE_FROM_ALL)
+
+# Fib device test
+if (TTG_HAVE_CUDA)
+    add_ttg_executable(fibonacci_device fibonacci_device.cc
+            fibonacci_cuda_kernel.h
+            fibonacci_cuda_kernel.cu
+            LINK_LIBRARIES std::coroutine RUNTIMES "parsec")
+endif()
\ No newline at end of file
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
new file mode 100644
index 000000000..551529b0b
--- /dev/null
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
@@ -0,0 +1,70 @@
+#include <ttg.h>
+#include "ttg/serialization.h"
+
+const int64_t F_n_max = 1000;
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn : public ttg::TTValue<Fn> {
+  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
+
+  Fn() : F(std::make_unique<int64_t[]>(2)) { F[0] = 1; F[1] = 0; }
+
+  Fn(const Fn&) = delete;
+  Fn(Fn&& other) = default;
+  Fn& operator=(const Fn& other) = delete;
+  Fn& operator=(Fn&& other) = default;
+
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg::ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg::ttg_abort();
+  }
+};
+auto make_ttg_fib_lt(const int64_t) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt(
+      [=](int64_t n, Fn&& f_n) {
+        int64_t next_f_n = f_n.F[0] + f_n.F[1];
+        f_n.F[1] = f_n.F[0];
+        f_n.F[0] = next_f_n;
+        if (next_f_n < F_n_max) {
+          ttg::send<0>(n + 1, std::move(f_n));
+        } else {
+          ttg::send<1>(n, std::move(f_n));
+        }
+      },
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+
+  auto print = ttg::make_tt(
+      [=](Fn&& f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
+}
+
+
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  ttg::trace_on();
+  int64_t N = 1000;
+  if (argc > 1) N = std::atol(argv[1]);
+
+  auto fib = make_ttg_fib_lt(N);
+  ttg::make_graph_executable(fib.get());
+  if (ttg::default_execution_context().rank() == 0)
+    fib->template in<0>()->send(1, Fn{});;
+
+  ttg::execute();
+  ttg::fence();
+  ttg::finalize();
+  return 0;
+}
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu
new file mode 100644
index 000000000..6fa316468
--- /dev/null
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu
@@ -0,0 +1,15 @@
+#include "fibonacci_cuda_kernel.h"
+
+#ifdef TTG_HAVE_CUDA
+
+__global__ void cu_next_value(int64_t* fn_and_fnm1) {
+  int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
+  fn_and_fnm1[1] = fn_and_fnm1[0];
+  fn_and_fnm1[0] = fnp1;
+}
+
+void next_value(int64_t* fn_and_fnm1) {
+  cu_next_value<<<1, 1>>>(fn_and_fnm1);
+}
+
+#endif // TTG_HAVE_CUDA
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h
new file mode 100644
index 000000000..a096ec3f1
--- /dev/null
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h
@@ -0,0 +1,4 @@
+#include "ttg/config.h"
+#include <cinttypes>
+
+void next_value(int64_t* fn_and_fnm1);
\ No newline at end of file
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc
new file mode 100644
index 000000000..a1603cb58
--- /dev/null
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc
@@ -0,0 +1,88 @@
+#include <ttg.h>
+
+#if defined(TTG_HAVE_CUDA)
+#define ES ttg::ExecutionSpace::CUDA
+#include "cuda_runtime.h"
+#include "fibonacci_cuda_kernel.h"
+#else
+#error " CUDA  is required to build this test!"
+#endif
+
+#include "ttg/serialization.h"
+
+const int64_t F_n_max = 1000;
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn : public ttg::TTValue<Fn> {
+  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
+  ttg::Buffer<int64_t> b;
+
+  Fn() : F(std::make_unique<int64_t[]>(2)), b(F.get(), 2) { F[0] = 1; F[1] = 0; }
+
+  Fn(const Fn&) = delete;
+  Fn(Fn&& other) = default;
+  Fn& operator=(const Fn& other) = delete;
+  Fn& operator=(Fn&& other) = default;
+
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg::ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg::ttg_abort();
+  }
+};
+
+auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt<ES>(
+      [=](int64_t n, Fn&& f_n) -> ttg::device::Task {
+        assert(n > 0);
+        ttg::trace("in fib: n=", n, " F_n=", f_n.F[0]);
+
+        co_await ttg::device::select(f_n.b);
+
+        next_value(f_n.b.current_device_ptr());
+
+        // wait for the task to complete and the values to be brought back to the host
+        co_await ttg::device::wait(f_n.b);
+
+        if (f_n.F[0] < F_n_max) {
+          co_await ttg::device::forward(ttg::device::send<0>(n + 1, std::move(f_n)));
+        } else {
+          co_await ttg::device::forward(ttg::device::sendv<1>(std::move(f_n)));
+        }
+      },
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+  auto print = ttg::make_tt(
+      [=](Fn&& f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<::ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
+}
+
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  ttg::trace_on();
+  int64_t N = 1000;
+  if (argc > 1) N = std::atol(argv[1]);
+  auto fib = make_ttg_fib_lt(N);  // computes largest F_n < N
+
+  ttg::make_graph_executable(fib.get());
+  if (ttg::default_execution_context().rank() == 0)
+    fib->template in<0>()->send(1, Fn{});;
+
+  ttg::execute(ttg::ttg_default_execution_context());
+  ttg::fence(ttg::ttg_default_execution_context());
+
+  ttg::finalize();
+  return 0;
+}

From 4e4cd4f0e8f432bf899051a4959f7da1dfdcca13 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 17 Apr 2024 11:36:30 -0400
Subject: [PATCH 14/28] Restructure F_n<N section in README.md, way to go still

---
 README.md | 189 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 99 insertions(+), 90 deletions(-)

diff --git a/README.md b/README.md
index a639a5fbb..6447d1371 100644
--- a/README.md
+++ b/README.md
@@ -225,29 +225,87 @@ separating the downward flow of control (task creation,
 <img src="https://render.githubusercontent.com/render/math?math=F_{n} \to F_{n-1},F_{n-2}">)
 from the upward flow of data (task evaluation,
 <img src="https://render.githubusercontent.com/render/math?math=F_{n-1},F_{n-2} \to F_{n}">).
-## Example : Computing the Largest Fibonacci Number Smaller Than a Given Threshold on the device(GPU)
 
-This example demonstrates how to efficiently compute the largest Fibonacci number smaller than a specified threshold, utilizing GPU acceleration. The Fibonacci sequence is defined by the recurrence relation F(N) = F(N-1) + F(N-2), with F(0) = 0, and F(1) = 1
+## Data-Dependent Example : Largest Fibonacci Number < N
 
+To illustrate the real power of TTG let's tweak the problem slightly: instead of computing first N Fibonacci numbers let's find the largest Fibonacci number smaller than some N. The key difference in the latter case is that, unlike the former, the number of tasks is NOT known a priori; furthermore, to make a decision whether we need to compute next Fibonacci number we must examine the value returned by the previous task. This is an example of data-dependent tasking, where the decision which (if any) task to execute next depends on the values produced by previous tasks. The ability to compose regular as well as data-dependent task graphs is a distinguishing strength of TTG.
 
-###  CUDA and GPU Acceleration
-In this example, CUDA is used to accelerate the computation of Fibonacci numbers by leveraging the parallel processing capabilities of GPUs. This is achieved through the use of a specific CUDA kernel defined in fibonacci_cuda_kernel.h whose cuda based implementation is described below, which contains the logic for computing the next Fibonacci number in the sequence.
+To make things even more interesting, we will demonstrate how to implement such program both for execution on CPUs as well as on accelerators (GPUs).
 
+### CPU Version
+
+Here's 
 ```cpp
- #include "fibonacci_cuda_kernel.h"
-#ifdef TTG_HAVE_CUDA
-    __global__ void cu_next_value(int64_t* fn_and_fnm1) {
-      int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
-      fn_and_fnm1[1] = fn_and_fnm1[0];
-      fn_and_fnm1[0] = fnp1;
-    }
-    void next_value(int64_t* fn_and_fnm1) {
-      cu_next_value<<<1, 1>>>(fn_and_fnm1);
-    }
-#endif // TTG_HAVE_CUDA
+#include <ttg.h>
+#include "ttg/serialization.h"
+
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn {
+  int64_t F[2];  // F[0] = F_n, F[1] = F_{n-1}
+  Fn() { F[0] = 1; F[1] = 0; }
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ar & F;
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ar & F;
+  }
+};
+auto make_ttg_fib_lt(const int64_t) {
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
+
+  auto fib = ttg::make_tt(
+      [=](int64_t n, Fn&& f_n) {
+        int64_t next_f_n = f_n.F[0] + f_n.F[1];
+        f_n.F[1] = f_n.F[0];
+        f_n.F[0] = next_f_n;
+        if (next_f_n < F_n_max) {
+          ttg::send<0>(n + 1, f_n);
+        } else {
+          ttg::send<1>(n, f_n);
+        }
+      },
+      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+
+  auto print = ttg::make_tt(
+      [=](Fn&& f_n) {
+        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
+      },
+      ttg::edges(f2p), ttg::edges(), "print");
+  auto ins = std::make_tuple(fib->template in<0>());
+  std::vector<std::unique_ptr<ttg::TTBase>> ops;
+  ops.emplace_back(std::move(fib));
+  ops.emplace_back(std::move(print));
+  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
+}
+
+int main(int argc, char* argv[]) {
+  ttg::initialize(argc, argv, -1);
+  int64_t N = 1000;
+  if (argc > 1) N = std::atol(argv[1]);
+
+  auto fib = make_ttg_fib_lt(N);
+  ttg::make_graph_executable(fib.get());
+  if (ttg::default_execution_context().rank() == 0)
+    fib->template in<0>()->send(1, Fn{});;
+
+  ttg::execute();
+  ttg::fence();
+
+  ttg::finalize();
+  return 0;
+}
 ```
 
-### Struct Fn and Serialization
+TODO: walk through the example, key things to emphasize:
+- `Fn` aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently
+- `make_ttg_fib_lt` creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other (i.e. there was no explicit graph object). This allows to support composition of multiple TTGs together, as described in Herault et al DOI 10.1109/PAW-ATM56565.2022.00008
+
+###  CUDA Version
+
+First show complete example, split into host and device code (single source cannot work since CUDA does not support C++20 and probably cannot handle TTG hyper-C++ anyway).
 
 ```cpp
 struct Fn : public ttg::TTValue<Fn> {
@@ -267,17 +325,7 @@ struct Fn : public ttg::TTValue<Fn> {
     ttg::ttg_abort();
   }
 };
-```
-The Fn struct is integral to our TTG-powered, GPU-accelerated Fibonacci sequence computation, serving dual roles in task management and efficient GPU data handling. It inherits from ttg::TTValue<Fn> for seamless integration with TTG tasks and employs ttg::Buffer<int64_t> for optimal data transfer and access in GPU environments. With smart management of resources — from its constructor initializing Fibonacci number storage and preparing for GPU data transfers, to explicitly deleted copy operations ensuring unique resource ownership and default move semantics for task-based data flow — Fn is finely tuned for high-performance parallel computing. Its serialization methods, currently aborting operations, hint at future extensions for distributed or GPU-accelerated contexts, underscoring the struct's pivotal role in efficient, scalable Fibonacci calculations.
 
-### Task Graph Construction and Execution
-
-The task graph consists of two types of tasks: fib tasks that compute Fibonacci numbers and a print task that outputs the final result. Tasks are connected by edges, which represent the data flow between them. The task graph is explicitly constructed and made executable, then the computation is initiated by sending the first task into the graph.
-
-#### GPU Memory Management and Kernel Execution
-The Fn struct also contains a ttg::Buffer<int64_t> b, which is used for GPU memory management. This buffer manages the memory where the Fibonacci numbers are stored and provides mechanisms to transfer data between the host and the GPU. The next_value function is called to execute the CUDA kernel, which computes the next Fibonacci number and updates the values in the GPU memory. This operation is performed asynchronously, allowing the CPU to continue executing other tasks while the GPU is working
-
-```cpp
 auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
   ttg::Edge<int64_t, Fn> f2f;
   ttg::Edge<void, Fn> f2p;
@@ -314,77 +362,38 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
   return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
 }
 ```
-#### Asynchronous Task Execution with Co-Routines
-In the make_ttg_fib_lt function, co-routines are used to await the completion of GPU tasks and the transfer of computed values back to the host. This approach enables efficient overlap of computation and communication, reducing the overall execution time.
 
-## Example : Computing the Largest Fibonacci Number Smaller Than a Given Threshold on the CPU
+Walk through the key differences ... potentially we could show both side by side ... not sure how to do that in Markdown though ...
+
+here's the CUDA code
 ```cpp
-#include <ttg.h>
-#include "ttg/serialization.h"
+ #include "fibonacci_cuda_kernel.h"
+#ifdef TTG_HAVE_CUDA
+    __global__ void cu_next_value(int64_t* fn_and_fnm1) {
+      int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
+      fn_and_fnm1[1] = fn_and_fnm1[0];
+      fn_and_fnm1[0] = fnp1;
+    }
+    void next_value(int64_t* fn_and_fnm1) {
+      cu_next_value<<<1, 1>>>(fn_and_fnm1);
+    }
+#endif // TTG_HAVE_CUDA
+```
 
-const int64_t F_n_max = 1000;
-/// N.B. contains values of F_n and F_{n-1}
-struct Fn : public ttg::TTValue<Fn> {
-  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
-  Fn() : F(std::make_unique<int64_t[]>(2)) { F[0] = 1; F[1] = 0; }
-  Fn(const Fn&) = delete;
-  Fn(Fn&& other) = default;
-  Fn& operator=(const Fn& other) = delete;
-  Fn& operator=(Fn&& other) = default;
-  template <typename Archive>
-  void serialize(Archive& ar) {
-    ttg::ttg_abort();
-  }
-  template <typename Archive>
-  void serialize(Archive& ar, const unsigned int) {
-    ttg::ttg_abort();
-  }
-};
-auto make_ttg_fib_lt(const int64_t) {
-  ttg::Edge<int64_t, Fn> f2f;
-  ttg::Edge<void, Fn> f2p;
 
-  auto fib = ttg::make_tt(
-      [=](int64_t n, Fn&& f_n) {
-        int64_t next_f_n = f_n.F[0] + f_n.F[1];
-        f_n.F[1] = f_n.F[0];
-        f_n.F[0] = next_f_n;
-        if (next_f_n < F_n_max) {
-          ttg::send<0>(n + 1, std::move(f_n));
-        } else {
-          ttg::send<1>(n, std::move(f_n));
-        }
-      },
-      ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
+### Struct Fn and Serialization
 
-  auto print = ttg::make_tt(
-      [=](Fn&& f_n) {
-        std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
-      },
-      ttg::edges(f2p), ttg::edges(), "print");
-  auto ins = std::make_tuple(fib->template in<0>());
-  std::vector<std::unique_ptr<ttg::TTBase>> ops;
-  ops.emplace_back(std::move(fib));
-  ops.emplace_back(std::move(print));
-  return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
-}
 
-int main(int argc, char* argv[]) {
-  ttg::initialize(argc, argv, -1);
-  ttg::trace_on();
-  int64_t N = 1000;
-  if (argc > 1) N = std::atol(argv[1]);
-  auto fib = make_ttg_fib_lt(N);
-  ttg::make_graph_executable(fib.get());
-  if (ttg::default_execution_context().rank() == 0)
-    fib->template in<0>()->send(1, Fn{});;
-  ttg::execute();
-  ttg::fence();
-  ttg::finalize();
-  return 0;
-}
+### Task Graph Construction and Execution
+
+The task graph consists of two types of tasks: fib tasks that compute Fibonacci numbers and a print task that outputs the final result. Tasks are connected by edges, which represent the data flow between them. The task graph is explicitly constructed and made executable, then the computation is initiated by sending the first task into the graph.
+
+#### GPU Memory Management and Kernel Execution
+The Fn struct also contains a ttg::Buffer<int64_t> b, which is used for GPU memory management. This buffer manages the memory where the Fibonacci numbers are stored and provides mechanisms to transfer data between the host and the GPU. The next_value function is called to execute the CUDA kernel, which computes the next Fibonacci number and updates the values in the GPU memory. This operation is performed asynchronously, allowing the CPU to continue executing other tasks while the GPU is working
+
+#### Asynchronous Task Execution with Co-Routines
+In the make_ttg_fib_lt function, co-routines are used to await the completion of GPU tasks and the transfer of computed values back to the host. This approach enables efficient overlap of computation and communication, reducing the overall execution time.
 
-```
 ## Comparing _nth Fibonacci_ CPU vs GPU-version
 
 | Concept   | CPU version             | GPU version               |

From 7cd3c9bcf7375b53d9e62ff0193e4b84e74d2f57 Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Wed, 17 Apr 2024 11:59:43 -0400
Subject: [PATCH 15/28] ReStructured

---
 doc/dox/dev/devsamp/main/CMakeLists.txt       |  1 +
 .../dev/devsamp/main/fibonacci/fibonacci.cc   | 29 +++++++------------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/doc/dox/dev/devsamp/main/CMakeLists.txt b/doc/dox/dev/devsamp/main/CMakeLists.txt
index 4979da2ca..ba2593927 100644
--- a/doc/dox/dev/devsamp/main/CMakeLists.txt
+++ b/doc/dox/dev/devsamp/main/CMakeLists.txt
@@ -5,6 +5,7 @@ find_package(ttg REQUIRED)
 
 add_ttg_executable(test test.cpp NOT_EXCLUDE_FROM_ALL)
 
+add_ttg_executable(fibonacci fibonacci.cc LINK_LIBRARIES std::coroutine RUNTIMES "parsec")
 # Fib device test
 if (TTG_HAVE_CUDA)
     add_ttg_executable(fibonacci_device fibonacci_device.cc
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
index 551529b0b..1a03149b2 100644
--- a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
@@ -1,28 +1,20 @@
 #include <ttg.h>
 #include "ttg/serialization.h"
 
-const int64_t F_n_max = 1000;
 /// N.B. contains values of F_n and F_{n-1}
-struct Fn : public ttg::TTValue<Fn> {
-  std::unique_ptr<int64_t[]> F;  // F[0] = F_n, F[1] = F_{n-1}
-
-  Fn() : F(std::make_unique<int64_t[]>(2)) { F[0] = 1; F[1] = 0; }
-
-  Fn(const Fn&) = delete;
-  Fn(Fn&& other) = default;
-  Fn& operator=(const Fn& other) = delete;
-  Fn& operator=(Fn&& other) = default;
-
+struct Fn {
+  int64_t F[2];  // F[0] = F_n, F[1] = F_{n-1}
+  Fn() { F[0] = 1; F[1] = 0; }
   template <typename Archive>
   void serialize(Archive& ar) {
-    ttg::ttg_abort();
+    ar & F;
   }
   template <typename Archive>
   void serialize(Archive& ar, const unsigned int) {
-    ttg::ttg_abort();
+    ar & F;
   }
 };
-auto make_ttg_fib_lt(const int64_t) {
+auto make_ttg_fib_lt(const int64_t F_n_max =1000) {
   ttg::Edge<int64_t, Fn> f2f;
   ttg::Edge<void, Fn> f2p;
 
@@ -32,9 +24,9 @@ auto make_ttg_fib_lt(const int64_t) {
         f_n.F[1] = f_n.F[0];
         f_n.F[0] = next_f_n;
         if (next_f_n < F_n_max) {
-          ttg::send<0>(n + 1, std::move(f_n));
+          ttg::send<0>(n + 1, f_n);
         } else {
-          ttg::send<1>(n, std::move(f_n));
+          ttg::send<1>(n, f_n);
         }
       },
       ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");
@@ -51,10 +43,8 @@ auto make_ttg_fib_lt(const int64_t) {
   return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N");
 }
 
-
 int main(int argc, char* argv[]) {
   ttg::initialize(argc, argv, -1);
-  ttg::trace_on();
   int64_t N = 1000;
   if (argc > 1) N = std::atol(argv[1]);
 
@@ -65,6 +55,7 @@ int main(int argc, char* argv[]) {
 
   ttg::execute();
   ttg::fence();
+
   ttg::finalize();
   return 0;
-}
+}
\ No newline at end of file

From 5a388f4d546348be3298f1a1f77719738fca99d0 Mon Sep 17 00:00:00 2001
From: hyndavi <venkatreddygarihyndavi.3@gmail.com>
Date: Wed, 17 Apr 2024 20:52:26 -0400
Subject: [PATCH 16/28] Rewritten ReadME

---
 README.md                    |  83 +++++++++++++++++++++++------------
 doc/images/fibonacci_ttg.png | Bin 0 -> 15012 bytes
 2 files changed, 54 insertions(+), 29 deletions(-)
 create mode 100644 doc/images/fibonacci_ttg.png

diff --git a/README.md b/README.md
index 6447d1371..928ba422c 100644
--- a/README.md
+++ b/README.md
@@ -232,9 +232,8 @@ To illustrate the real power of TTG let's tweak the problem slightly: instead of
 
 To make things even more interesting, we will demonstrate how to implement such program both for execution on CPUs as well as on accelerators (GPUs).
 
-### CPU Version
+### Here's  CPU Version
 
-Here's 
 ```cpp
 #include <ttg.h>
 #include "ttg/serialization.h"
@@ -299,9 +298,30 @@ int main(int argc, char* argv[]) {
 }
 ```
 
-TODO: walk through the example, key things to emphasize:
-- `Fn` aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently
-- `make_ttg_fib_lt` creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other (i.e. there was no explicit graph object). This allows to support composition of multiple TTGs together, as described in Herault et al DOI 10.1109/PAW-ATM56565.2022.00008
+[//]: # (TODO: walk through the example, key things to emphasize:)
+
+[//]: # (- `Fn` aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently)
+
+[//]: # (- `make_ttg_fib_lt` creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other &#40;i.e. there was no explicit graph object&#41;. This allows to support composition of multiple TTGs together, as described in Herault et al DOI 10.1109/PAW-ATM56565.2022.00008)
+
+### Utility of Fn struct
+Fn aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently.This arrangement allows each task to access and modify both current and previous Fibonacci values without the need for separate data fields or additional communication overhead.
+
+- F[0] and F[1] store the current and previous Fibonacci numbers, respectively.
+- The default constructor initializes the sequence starting values, with F[0] as 1 (first Fibonacci number) and F[1] as 0 (base case).
+
+##### Serialization Functions in Fn
+Serialize functions are useful to communicate the struct among the tasks. TTG leverages these functions to serialize and deserialize the data as it is sent and received through the task graph.
+
+### Utility of make_ttg_fib_lt
+This function creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other (i.e. there was no explicit graph object). This allows to support composition of multiple TTGs together
+
+- The function make_ttg_fib_lt constructs the task graph that calculates Fibonacci numbers. It defines two template tasks (fib and print) and their connections via edges.
+- Task 1 - fib Task: Computes the next Fibonacci number and decides whether to continue the sequence or send the result to the print task.Receives an Fn object containing the current and previous Fibonacci numbers.Sends updated Fn objects either back to itself for the next computation or to the print task if the maximum is reached or exceeded.
+- Task2 - print Task: Outputs the largest Fibonacci number less than F_n_max.Receives an Fn object from the fib task. No output terminals, as its sole purpose is to display the result.Just uses std::out and prints the largest fibonacci until N.
+- The fib task sends either a new Fn object to itself for further computation or to the print task if the condition next_f_n < F_n_max is not met.The print task simply outputs the received Fibonacci number and completes the computation.
+
+![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png)
 
 ###  CUDA Version
 
@@ -363,7 +383,22 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
 }
 ```
 
-Walk through the key differences ... potentially we could show both side by side ... not sure how to do that in Markdown though ...
+[//]: # (Walk through the key differences ... potentially we could show both side by side ... not sure how to do that in Markdown though ...)
+### Differences in the Code Implementation 
+| Aspect                             | CPU Implementation Code                         | GPU Implementation Code (CUDA)                                                                                                                                |
+|------------------------------------|-------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| C++ Features                       | Full use of C++20                               | Limited C++20 due to `__global__` and other CUDA specifics                                                                                                     |
+| Execution Space                    | CPU cores                                       | Defined by `#define ES ttg::ExecutionSpace::CUDA`                                                                                                              |
+| Data Transfer                      | Direct C++ object passing                       | Use of `ttg::Buffer` and CUDA memory operations                                                                                                                |
+| Task Creation                      | `auto fib = ttg::make_tt(...);`                 | `auto fib = ttg::make_tt<ES>(...);`                                                                                                                            |
+| Task Execution Synchronization    | Synchronous execution within TTG runtime        | Asynchronous execution using CUDA and coroutines: `co_await ttg::device::wait(f_n.b);`                                                                        |
+| Device-Specific Code               | Not applicable                                  | CUDA kernels: `__global__ void cu_next_value(int64_t* fn_and_fnm1);`                                                                                           |
+| Serialization                      | ` void serialize(Archive& ar) {ar & F;}`        | `void serialize(Archive& ar) { ttg::ttg_abort(); }`                                                                                                            |
+| Memory Management                  | Managed by C++ runtime and Managed by Fn struct                     | Managed by `Fn` struct with `ttg::Buffer`: `Fn() : F(std::make_unique<int64_t[]>(2)), b(F.get(), 2) {...}`                                                     |
+| Task Continuation and Data Locality| Managed implicitly by TTG                       | Managed explicitly: `co_await ttg::device::select(f_n.b); next_value(f_n.b.current_device_ptr()); co_await ttg::device::wait(f_n.b);`                         |
+| Memory Model                       | Implicit, managed by C++ runtime                | Explicit memory model involving CUDA memory management: `std::unique_ptr<int64_t[]> F; ttg::Buffer<int64_t> b;`                                                |
+| Task Flow and Control              | Implicit control flow based on TTG              | Explicit control flow using CUDA streams and coroutines: `if (f_n.F[0] < F_n_max) {...} else {...}`                                                             |
+| Task Invocation                    | Task invocation within TTG: `fib->invoke(...);` | Task invocation with host-device communication: `fib->template in<0>()->send(1, Fn{});`                                                                       |
 
 here's the CUDA code
 ```cpp
@@ -379,31 +414,21 @@ here's the CUDA code
     }
 #endif // TTG_HAVE_CUDA
 ```
+| CPU Implementation Feature          | GPU Implementation Feature (CUDA)                                                                                                                                                                                                                     |
+|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Standard C++20 features             | Limited C++20 features due to CUDA restrictions                                                                                                                                                                                                       |
+| Task execution on CPU               | Task execution split between CPU (host) and GPU (device)                                                                                                                                                                                              |
+| Direct data passing between tasks   | Use of `ttg::Buffer` for managing data transfer between host and device memory                                                                                                                                                                        |
+| Simple TT task creation             | `make_tt<ES>` specifies that the task is meant for execution on an Execution Space (`ES`), which can be CUDA                                                                                                                                          |
+| Synchronous task execution          | Asynchronous task execution with `co_await`, waiting for GPU computation to complete, and data to be brought back to the host                                                                                                                         |
+| No explicit device code             | Separate device code (`cuda_kernel code`) and host/device memory management; Device code uses CUDA kernels, e.g., `cu_next_value`, to compute the next Fibonacci number on the GPU                                                                    |
+| Standard C++ serialization          | Custom serialization that aborts execution; in GPU code, serialization is not needed as the data does not leave the device memory space until computation is complete                                                                                 |
+| TTG manages data locality           | Explicit data locality management due to device memory requirements; use of `ttg::device::select`, `ttg::device::wait`, and `ttg::device::forward` to handle data between host and device                                                             |
+| Classic CPU memory model            | Complex memory model involving host and device memory; `Fn` struct now includes `std::unique_ptr` and `ttg::Buffer` to accommodate CUDA memory management                                                                                             |
+| TTG-based computation flow          | Computation flow is managed partially by TTG on the host side and by CUDA on the device side; Task continuation and flow control is achieved through the use of CUDA co-routines. |
+| TTG task invocation                 | Task invocation involves sending data to the GPU and managing the lifecycle of the computation on the device; use of `ttg::device::send` and `ttg::device::sendv`                                                                                     |
 
 
-### Struct Fn and Serialization
-
-
-### Task Graph Construction and Execution
-
-The task graph consists of two types of tasks: fib tasks that compute Fibonacci numbers and a print task that outputs the final result. Tasks are connected by edges, which represent the data flow between them. The task graph is explicitly constructed and made executable, then the computation is initiated by sending the first task into the graph.
-
-#### GPU Memory Management and Kernel Execution
-The Fn struct also contains a ttg::Buffer<int64_t> b, which is used for GPU memory management. This buffer manages the memory where the Fibonacci numbers are stored and provides mechanisms to transfer data between the host and the GPU. The next_value function is called to execute the CUDA kernel, which computes the next Fibonacci number and updates the values in the GPU memory. This operation is performed asynchronously, allowing the CPU to continue executing other tasks while the GPU is working
-
-#### Asynchronous Task Execution with Co-Routines
-In the make_ttg_fib_lt function, co-routines are used to await the completion of GPU tasks and the transfer of computed values back to the host. This approach enables efficient overlap of computation and communication, reducing the overall execution time.
-
-## Comparing _nth Fibonacci_ CPU vs GPU-version
-
-| Concept   | CPU version             | GPU version               |
-|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Execution Environment | Utilizing the TTG library to manage task parallelism within a possibly distributed computing environment but without explicit hardware acceleration             |  It uses the TTG library in conjunction with CUDA to offload computationally intensive tasks to the GPU, thereby achieving significant performance gains through parallel execution|
-| Computational Model  | It relies on the CPU for all computations and data management            |  Computations are offloaded to the GPU, allowing for the parallel computation of Fibonacci numbers, which is particularly beneficial for large sequences due to the GPU's ability to handle many threads simultaneously  |
-| Data Management  | Manages data flow between tasks using TTG edges, with each task operating on standard CPU memory   |Incorporates complex data management strategies to handle memory transfers between host (CPU) and device (GPU) memory spaces |
-| Software Requirements and Portability  | Relies on the TTG library and a standard C++ compiler | Requires a CUDA-enabled environment and a compatible NVIDIA GPU, in addition to the TTG library  |
-| Data Structures |  Utilizes a simple structure Fn containing a std::unique_ptr<int64_t[]> to store the Fibonacci sequence |Similar to the CPU version but includes a ttg::Buffer<int64_t> for GPU memory management |
-|Concurrency | There's no explicit synchronization or concurrency control needed beyond what's handled by the TTG framework |Uses co_await for synchronization, indicating more complex control flow to manage GPU operations efficiently |
 
 ## Debugging TTG Programs
 
diff --git a/doc/images/fibonacci_ttg.png b/doc/images/fibonacci_ttg.png
new file mode 100644
index 0000000000000000000000000000000000000000..e97fac19af7b4b7a7067f3eb3afe7f507b1aa696
GIT binary patch
literal 15012
zcmeHu2UL^ammr9A0TltIic$qalOnwr=`{kO7m;r09YF-7Nbgkyqy>;>kbnqC=pZFX
z6-Yoz=!6mo8-Kt5%sI1XXLirboY_6Qn{&Q=-+k}h^6tC$-FNSo#Odp*Q&BKd5D*Yh
zX=<n#5)cr9@K-h&5xxbZbj=ulyKk>ys6#*y#6>_5@tT0(9N!f2n}EPqh=5?zhJZjS
zn}C49E5FS^hJb)jOkdkr_3G*hiA1Gm=E}&c@$ie<I=pCXYSon!>F^a<ej_sdN+iuj
z<fXpIX{yLYx(F7FZS|81wN$M1k{OMV*iIB(jS~ausY)q6&3BRQ4wgJj7EN@J{~0ar
z4pi3GP=|i$42z1ZhJ3QLcbQ*U!eCCY7Z<&K1LG4@+J+W8yL%qqK?NmcYwN$0QnUSn
zUOBmfgCpLk=>Q8M-EBjthVHqLxLiB8prK{#DDtYQZ^<Jt?p@(0`20z9TG_Yhz0#(E
z_R-%%%NMy-oi!a(8u}K-RxUv?*|u&$s@lLe=@o*a(lW~0??1tn^>s4a^XnFxD<*3F
zQp2pk&%Yk_cAvnjrW;;F_{LVI!gjxf7sR#xhUI-LZTR*1>(Termb}uTDm6nBLt7i)
zcM(=EUEdE?!S;J9en140W-IZUCA8C0S0RWcX3@X{<X#$P-UI}9^Zs0fn!Zu#_(oD6
zO&wLz-`8&w+@w7bG`mAUAStS;qG%j2i!LCyn1kb-(wd}0C7j8$I5})7H6xI?;B!ci
z+jqCd;|0c{+l@|69arjiE1#xvn%VTXR3WRp3wH=w_y;?I>Q#?4r_;FcJpN-Jla-Ce
zU6-4y*mb>182jUCE7DONMB`hTQq1=;<6st<>Z!2PVb7B8-LNa=T1@_Io`N;ncdyA5
z+dU)6ai&~bUgE2SZX7cT7LcjCv}wzwbJ!+jNu+ftiY}aWUB=Gt>Hr(xUX){4nk8Om
z&bpSEVhc9xOjDMoUE~deMV%{Q^e)Xyxd11+dlTb(6_}hqEo72RND)%788tYE2+B_i
z9eM@ocC<T)K1cQTOg3#q@4wUlaJ$w;KfNv^%GdMZ+}DuIAeyLp1deh>H|xhmSik*Q
zDKwR%DQp`Q_?;zna2(I~Hlg0z$`Q7@o|+BRJ%0^=Iv)S+b(x7Xdo{9&LD8LbknXe<
zWd_-^&-a9yM@gV>G#?oHUn|G_ypIslRNK`4Iw{A5WPBjH5wWf<m?8~^ng<IQ-~UB^
zi>;oKEnI+WDcgz*hy=G_%!SzMThSpI0X7~Y<Ne}MOkk)*us}9VK#*y>PQI_GxS&&F
z(VNvbpdCEFH6qL$*WteWQh7_cnwK)bA1~FT>oS=;#=u|KCGhl`G3FnYzIGApCAjRo
zdm_rLjc!zg*rt-r6cFY7B@vg>X~FM})+FM;dv4#j>PV-;@jf3J`dfB#z{A{d^c7a5
z-EWlVpEa~ZCVb@*;_+?nO-C->2&i8D5809fsj$jGI`P7tzZIDn(bu;gd3de-`6I8j
z-=DR>P&53TIe%3xr-DrWX<;&>Kx+zsK}CS~{Vde|tS8BX;N(GOxZ@MJBQ^39^MAjN
z|5d}EaWFWPDg)pof7y?1BIvQsr(!eb|16OpUD<^>_lk0}@39qSXKTr@vn6EbW!FpE
zxOhVF7C^h_%HDJOVebd+1J>WC3E3yt*n$u34p8Yu7Z$cWw!x@PgJ8XmuaBx->xdr`
zj*{(TP^a<zl|EnX8&@(;b}g1g<<Z0fJx$Gozy{1BZYXQCZ*$hQOXDl8V#w-7gmtXu
z?vkdkv>4jmW*zmxvzF@f+vcKRzO`;%HwcXW1~GFP7f~VkEN;<s3hS;LEhi?pn84(k
z|BN4D-+WS7m>+-C_!%$et$Md3-`UGBZkw=y@RjC(<xA{S52Z_;4XxcJ2{5UX<5#+x
z%hPHgZndk29$`hvb*(U;?l3LQ_N$e&!;kpWL~Wc7_u4*hbD{~&>|HNnS9!>z#k1jQ
zyr}vUD!#Q2Usrl(PKRo)H0n$8Z!kU-eS{nn!(M~fpgCEf+qE$IAyR&SJ)^MV&E+Og
zS1COSsB2bB3jFggf0@~RcJ^TSe%9HnR`OtOM46_2*jGW58D6N9l$ixRo|)jro%uyP
ztB|jC?<1n<aa#%yiK;0(PrT5u>(Cl<Nw-|>n8f5bXCArzicmseJ?3_9^z`-lsmj7k
zZn?zK;kUeBi2ld|chvc6r$kNKi7_h7(i=T5cqMVDlyDZ_8%*R!4C<Z5`XE2G{^i$F
z$xq}TGF=^LkG~k@+m&x#Wh|lzn`Q8#a}`wLLgQ6GL@IL#wi?od7Z^+GN}`Lz_e@bz
zwA)|#Z{Sx*5AOK#&;C?xTM<RJ6niE#^g9Ht`A~p181*#G081AC*Otb-KkJF(&K`h+
zNJrpw%cgjY+K9e2qvvn^WQ!?Jkl<?kmSFmIwXVm_&i*It$<*s$Fw_L^nEb!?fwcQt
zgMD0F36Hh&(+4Z@+nx&{keC5gMe%=uLmuDt7UV|BQdsMB#LCMYuS{t#nziTTZwvRv
z$MIh2lw0c}3t#l&x9l~Sx@eJWg*TM~RErAf><qv2_slWakFW0cU;AM@5-4~JCupBm
zx&Ga%{g^8pb-AXqvg8)_JrC^im}pubyT8$D_W*OM(0tOA2Nb<`iC1DmM0CKV8i^!!
zK<if-Lr;4JBD=FJeyRIBC8DC|q9>n1^lU!6BC~ZdHfwQL)YI{kH|+}AnFJJu?z0YT
zP6bnyU{>wx9xAJMV9a^AZO-fsze{oUFx`va?e^I-zPqRFot<oiHEP!-xiB0W=^s8w
z4>6aM;O;p(t-ogpwQD{J8zK#cuuq10n6?*C`G=QU1q-C#+|#~O&(`D0K8IVx=sPa;
zKe)9wGjw!1lNTqkY~-?#U67+mQj^PBSY3x=s$5@uyWD!AZVYii_w^#@Rd^6zn~QD<
ze&^~od>4c_v8P}6+hVB5Q%<Y-Y&>fNRwWFbDkj_5i3%B+^hhqa0{Rn-#X>y5DntMn
zW|cODFA5PB62xV?X;25Fe|qEDn<otAlDt;OWC3AeD^oSuI%=6WB%Koy-jNH7pA%+T
zVv8LDkpl);he~xAUI*;I;&^`T>q>LZ+#iv@y@z}@`DT|h%iKUs_SYv@HksQ4Vm9cs
zGS*|f?q5e(SF`T>WP;KfN-P9_#YNl;Zv(neWhYy%%YXx2yYqv(<MwSu--9__*wXp=
zM<SYO)^5o>*=$C98zU;ryK^}4sH1Wp0JVzyDPdm_q0Du$vG%RMd~NE{4;C*&faeMV
zXQ2~!R@tz>ERqT?`$GhTC9MffUeP|9)4*?f$}|B=VgRSeRX$$mcdJ;Q?W1YYT<xAD
zxC{@X$vnqlrmknKwdf%)unN;<zr0H{Etk!a?%IR*<q}e6xAV2ds>I*V>P?u7=y3YG
z*(ZB>x?c;jdWn*B(kM<%yoN-aG1{Zp7}!Or^<lT1fei=h>6F|@FbBe>nTCrT#kl`R
z2Yy!;Q;{@DZoHKt{TJ=lec}l;B->ZQEV8ULlG|F^m9J_8r_KY}v8$yFUK`Co6KWJ=
zJ*JDJ1oZH4OG&4AsjZ4pYA?u)30<o4dXHXtNZ3cMQ+}<GuC5Q6<1P5ToAAslJ^bhz
zetD|~3J4RsT<W;U3z`3{z@)tZu5%)+=%>7%&YR*l?~R+@U9S;g&dbT!XN+9z^tSaM
z3?9BnSYj$-WxHJd^}|(uJ!7XVpprt;3ta(|Z<j4SsUvPiD8&(uRxM7aGo}aKdu?_9
zSI$hVsbuE9p{x*Hc&P!^=c8G@9!fk4cWDg&$3I3TS%V$wLC$3<Rse}*#J1K&={nt<
zrA}9Sk&^*4GkdI**$2{FE|Ha4p4?H@u};RLd@nb<c!)tk=!z9$=Fb?x_kR;250Inx
zirlc--a~J%HfC?LF0$l>=Y%=G-vSklQ)JKPRpYsl1ps(HBWCqzICO|Z3HMj_&9vb!
z<0rM2K2%UZ^n+-RYSjUM2X~&;+y`~mkDpYGkm}sv3|z3K>mTZwU-iBlFZJS<=a0Dw
zpi6R(eD;&~4BH05-uQXnBg86>TvzO4E7E6(S+}*rEmxmi<5Kl90B*-_IOET4!&6q2
zoS{VHO%z~&!8tVNnr6bW%b?!izGGmSHF-i6XtG0T@7U^RLOtjq&A&Jm|5u%+?q@9|
zuxAN3O<GZ2&+p%LX=ba0-TMPAO@-OFWt$*Rns@Qp!=IK>7x^1kgB|KcY%Zh!Xwq|S
zmd}Pbq3&X%GM(g`M-`0F$9wIj*yWQ>drC%O%-qt2ezj~VhGgtnu&hkAB10}C*Q6Yg
z1!Ht-#b~F5<*3X4f~Y(Cwo8$cJ=%AFb%Ns8k~`u^<kXp!v>YSMe=n+zHSBA7R+q^~
z7>b{)!8zz2D0<d{nlLZw|9}5qssa1B*||h+#HS5MuM9`_Ol?7;bhtEo&)bb?uQa;4
zqg`#m>*?@d!DX6qhHpA@0cGKyLF~oa^sv}0_%rEU7%PJ%U~qpf+G;?H(sU1S<z$B{
z?LvXf(;X{JoS(1(qjmV#ENmGvbp$`&tY*`BPUG0v;t?dk1PC<!saMYM!#;G_=b-pi
z28~v{>3x6oY8Y^q28<Sme$ZmOTN*uX*Nbq1ts8el61P1k*i$LQj2(V0@ci|Bf|5eW
z1^L9Qm8XYIN09sL?}i!dhxaDb`9w#GNfWj&=&l_pCmmj-P!_W#Qx<dlzjN7~@(ToA
zAoS#GvGQ4;%Af5QnO$sD6yfr&4huGjN$%3g#olET;WkUs`{We#icl>&<MR`~oHYL4
z@@L|(c7oFGpuYU$ZQf}+umD}&lZZinpCPjVn^Z^hXoArC>A8GP{KIAa{6xOkUk{UW
z`Ts`OS9#r00-hf8U0nKs=hRW^wFJgNS*+1OCw}Gc*i1g#TqrUxHz6A%-3`iXRH?pM
z%w`&#+o<;ELGD=md5$T&0$cl*`^fEDrHHU+=ocf>f0*u7wm!j|k|4)do&$(~_~u0L
z1SR=cTE%EbwIWSNIBwEMS13qE0=Ae$F{;f?T>8Dy9K9}od5?8nF#Hl~8tfs(cUPcr
z+9e!|B{^F2Y-SWt=J|FwJ6evIy|uh6KOyv5Da0CEz0vGvd}@T2o_NuvVNiY9d-PQL
z(FN2LJMIG#dOGP6E)M&VL=j#wK^(0Bxwy}8=Nj}N3Zzel9H0R!vM2M&!~ZKg-iJdw
zCM7P}Ear|2PP^y6B*F+ye>sJQdr|u3VyUjyJsKOtnCm$3GW`ZZ3fgY^TE8J;XWZw0
z9VGK%(YE3Q)X7XSK0mYD&NbLHT-fLqK-sX>3B4HuCoab$luORcrvg|)HDwx_1y$Vq
z*`*!_jWz?Q{ZHBh)gOy;Z$23;(0MVH$?cltuL|Ia#-UQTV6E@q&(7?hk~J)l!N<*I
zG69c!U!LaHm|DvA@t<7VZ|S`Se#L&CQO!DlY9%~jT+E$!7Qek14erIw?7Pj(E``sC
zG*7s4F=o=heCtqYT8&qyHOZ7uKZHZ=`084(X7|snR=loWLO1PB;GV<o>Oo?;x2T>R
zfA>PU+-JCJuK>;c^b^(Zd%s13rdLrAO05!#*b-G;z0<!lMGt<Zdc>DShH`lXO*;9&
zB>h6d5KJC^w5s72^N!;LM1{N-|Jw*sYooEpldc8XjXCJcIz4T@c|67nT|Z!(Gm&p2
z57N+idAEZrSmB}g;N#$~`+X@h#$c8k;7KZ|4Z(Su>y^SzNQ;bxUph$IrTGMS0wZc*
z(!9m=HfuG+C6U2QbYbqsZ2WLv-|W`DS-JCT`(5jvS28MIJmN8a%=e(hH!u4+C&~PO
znL21mz7Ci%LmLG{Nc(m(jIdpTmpf2DRUez`N@^#Gq{gKi&|;tSv@Lxzx*ZooFq%1b
zH3<|a2Q-JA_TOFH*$dDMytT#S?h!=GskTcS3OUEA9r7@;Rql!_L0h8T?gkBtG^IUG
zrlc5uUc~oX{7m300NO%rR|{nYbE;Fmo1_+I2VT}&$?qgYd_dhk?k-h^-UJ7VGa5AR
z1!xIq9bCVSyyku+Z63gb9B~k39*eETB#wktSAaDAz7ZyV2H{5bCFQQAN)h$UMSs`r
zCPU38kUV~~Mh{jLCXpMN5<qm!j`Gw}^f%vP<<WvSuU8)wgJKR+5(|DgeAEZLS?Q7_
z8f^Z_3H+sI4fbVQuNw5;Wy`FFAgf4~<Ke^^whB-`)W8=bVD(;ky9$Sw@{{DxE-JEY
zI=EyIcJpaN401MMR3x1bwk}1DQU?gBUV({MUoY@^dR_a@U~utCoxP3vY?@{Q%xQfK
zPIr5NGnrC*q7F1SP|Wf&TkcM?e@(9Y-1}4Lt(XO+r!R9K*y?A2mYsteHw1u<_X^@r
zFlTRjXT1Br-{nh*mLvmU7IZa`*17zMEhjd_)-J`%d?{Z0iPq<xJA=8$6h9;9Z%|QC
z=eWC5K6(5pnCE;Z^Z~<-n2t{6RgD6|ea=;pE1W-laLh_qBlzTy?S{woQl^fxZf{Cx
z@m-=2>_pNzNjwaB{5A2r4a2njX@*GN7wCEe!Wr&3*YA7g?SWs$E?_l^6mLNA3^rt>
z(tThUzU>w0OcB6y9sIx2F<DFiCngqxwI$Kx@5N@>|2{JRD&xCZt`*}mGoIoXc|07Q
zu0XlF4O4FPd7k8WC4OrHsVkBB^3_fHk4W?=Dz(_#&I9nXxSLtc-m~A*R$yrefsI)9
z&`Ea<8{LS+Z@1lGF=>A<oq^_`zKAxb3%NBl1pv?Mm(U0HA3)2py6v*&lRP(ejqTcP
zWAURK!Q`5g>j%;{qz4|p>dT2kC+&UIEsjxp@$Drn;I5B0rjYXz$&UzSXcA0b2G*gm
z)xiTz>RoOi^Yh$PL`L4~3TV_{5NfaV(1f;*OCGV2`xzF3ViI+k_MNjOB-H>sd5`)z
zPxP?3n=wH%txq>OCgHY?uE|8u;%~;95VXQaHEqC?$Y((1dvM~1su{>s67-ppyZU?g
z9;A!UcXSG6m0kvNHW5h?1cVcpYG)uRPE4g6oNPgW+-M;~Jc$}YDr96z*;8;_IrCfj
z11idxl~pjYtS~Ziy!a`&G73&g^>crABXdh(`yoRDwMlAqL+E|#++dc1RPY(MGCRJl
zyl(wz1MTlaRc4<!V~P1kDgYj}UgTTJgD+O(7}1akN__mPdW2Y1+~i+?kRIWKT5Q!o
za|0hWt#s9nWsV=-o(#;O;(mGigXbr?I^Bs%$jC@zAY}8^|F-b=KZv~_6$$Zz+64Ks
z2;lGfl}@&`PC?{B{H$PnM>Twfo#74v^zDBf<deT*ffkd&Gm5V8^S_RK^eF`IHgI;t
zVvFRWCs{+o_?_b((T&b8*0VGF;UTw3gQ2F2ZwHU%Qz$8tl_BozDxz8t7Ir_@P{&oF
zfUXqDfyM(<OUP9;NI!xZOx^%Pp`#cr!7TA>rGoQEHO7K7Qa7!^HSKU>@f0nHe!>Tk
z&dYq^W8F~sQ?v%N+Z(}|;VGdZP9QAihlTp4jz5jc%6@my;r^vtzEi05s%0|e+c6`^
zWqIqmi+D?}4liPH#~{>bprwp48arHDyHHv>_AybdX$SER!<drj+hCRpvPhg^v!-rG
z9R~%+C^2)OZr_vbX^V{=b+EUNO$ZeLE8YqkC^PM>AN394zdACRHIW94<`P|%MAbHd
zq;YJ=&hE0zsUYdZT8L`GWrM(`G+3A#V}u=$W&crkWEJ#`Mgu+liNqBzS#q`6%pTB`
zH>>q{$%PdzZdEUMfRuLj2z0MsXaX_4gti1xg&{Mm^Hi{PfZ=yA?~6*Oq?l`duYU*O
zqac|o>Hsk>d-l*-V`>U#Bh5?`q5vr@Wy=AJ<zj=ebH_Sox>nsy%a@%?Zm$!^-8)e&
zog3(9(ua`hUPI}^+BMwwtl@LWti4z;95o#18jk&~b=r_ZN%H<92rMRh1nk!U{FItM
zF52Df>OpRbQK#@`RlxlwsgIctzUY9}7IB*5YBVR25;_2P9z$hj3Dm@wK;!|Yx37R6
z0Pc#olfm~4A5e*y47T)f&<$V9Bojd=ZUKZr(yhcBP?-WotGtP<S^pQo^67)F2|EE)
zlb4Asqg!gvA1lSe@PTxzbJ8THt`)TZH??#v=*qi#f24~VxYGJlUbO7!T7Uu|+#{IO
z1;`t7s0?ipkBpi~d`;5Drd|S?V;UddVFsh&dj8Cfcc8`A=kwm~44KtSvcNG+&0y5e
zai;G~rQ}e*DT0fUlKIQWk%Ydy&!=M@-J0}q4eZ=$htfA<oRcZfOF!aIG_d#+=t2L~
z^CB&@TuTPX+79fyD8_}bvSi-ok#AYmGNJL0P_K*vHdxtxislnr(56~=$Z)6RBM3Jg
zq7GRjMs2z_3C){`XMhSM@Qd_@@to{Labsa?TCvT26CK4wXvxOuoc4g>NOOM`WGx;I
z@-1BjX+m3a6Zf*5ye*i6uFh$;kA3s`n%6gZ5!0BiSImX-BeULtjZd`2jtxU0r|LiV
zG2V&DNjR}8k5n<c`uEn$HA9~_YKD+lwaH&W#hqz-5QUGWAh0S+d16%kDBXmUysy&a
zD%i2fJ_T=y$`B`St(_R8u;uQR`xI^5h(LPvg=ynfLZ_34y>jBKBhD=Ox!R~6gDnrJ
z-D=P)-MnRITkSljdg=HA`Qunn%)%FQWaNVzUYy{<uH)p8EG;>&{OZ8_$AKCU{8{j4
z>>8_Y_9vuE74OQrw+s(MAJj&HB-vHazW%&K6Wd<M=N7~IQS|XARBA=NI;`|a5y=mI
zZ*-bM`P3nKcg$I(;@iplCBBr)0{hay4<l6R!xM>n^WOsOo9@XpH{6oGe9?FqA{6+z
zX8c~ojOX|7{I^Ai8DHTK0pAp?^&4{<ZD&sWo$tH|Bw_B+o7cGwpSlw$PELgZ3EiK<
zVJa3@!0z!6mkL;>(i09j_jdLq``j?%qHiSdn$dTMGWMco52S3u;_Cs1e#~zwzDR(#
zKVzbx_Zg1f`GVaaZ;|5S8P)sx{R+4(UhR(EG=udhfkUlgPbIyyDJgd&m*fB;;;<0~
z-6{cKFAoPoX}zhp_m%yD)2H$<i`$gth<k8yFkNvmXb^e=+6~qj6ZRp5s*xZcDJVke
zz4iFyYLByR0!>25oZ>;H*YG1BLrtTfh5MXv(KGaH28&rB8R?R%3CjA9ajTzhR#Wmd
z#L!Dywv2Xw^rm~+rrc6ojTIo*?zG)fz6F($yBeSPZS<VO>xJ+)SCO~J+(l)MaIrrJ
zC!BwY1sPrXR)1OsXPxio{LB2CILYOMU?d|aKEFGT1gko1?b!-13<q`o(n^oMI>nlV
ztHH&j@ThMWTAE`(eDRj=1~Y9jL&(+LizrLD*!_ZMD)NFSc9(aDEBl(@CJy`O9`uk}
z*}yA<L|{m!Wd%3oU=eNZN8!XWXYEz{_3Q64g!)uHGx3;FgU#o1`9|+vq~L?su>fM_
zpl`}Xj}Cr(cA1vV^{KNNC+sKmM9iGW8)Qbumh7r9`B`6aG5UE?KS_Uum#(Z;G)R@c
zE*#pkGBEIox%QMfdTyLU%<y+3=E+KE`f6mWr!u6A2Y|rR3>Lf(Zg}IZ04V`15`b&Z
zx<g%E<KwsNLJg=KeQwhZQobu9q+9!p`?W`g{O}RvTGX!AYv)6fhyb6KKBHNiTrGoV
z<aN?;WQ8CP2flsd-hZK3@N;n-X<3&O{Ul5cqR7jIQ1_H?KIirGSxVEQntOl{I@7Af
z*uQ!vf(lAq;2&yGyWYvqj)=IE?CNN17K@^ZQ;S|&zr1e>EJP-6eVHxNu4`hc5<pnH
zE}azoE(Ujv-3%LCZiuIwesD(Lj<IPoU?)wdN4;sR0O6~sR|`kxOa|`IFo^7~^msPN
zwdxXgcKCdMLzF?zrJ~Q#Z&%(o-%Mk-rSLkqmV6%&1H#u?J9DgDA6D@rmaa@|1~WLg
zfFkzYuDMY%P3z|$aUd(pM5VL#!c8w(^3miJ1kaJ~ZeT6u(E7?S5e9Tbonwoxj=XCz
zHCHN*M)oLgH_^K{8uP5C<{vJuADG`NVu*?lrDvFWm(iOoS|lH5JfYuz9A?IkN*;8*
zs{}aryrUEyJ9unXBPhsU?TIfbP?+DlwTmiNTi57U3V0N8nqRT7e{^<Ij+om#ToCOz
z>!&sZpIz~v#cU%GQEz(eW^?@aCltCx9dJC;-r}3#A?CanBN`aCavl*9nhY*p#1Gm)
z&P}@-%Jv!b(7=zFJ%m2wr265kz3$P`2Bup`a{!h1{tXqIGktm72>L?k{T50oB-_XK
zuC)%y&H9^I@n|{ZIdiBdGA24jMmG*rjE5NmlPQ~S!Z`)m&C=}OurV+IdskL|Ev10W
z@a^?n0&lC23_`hYf(aE2{;g_UrXg71M4pczwviuMM)1~ODK>-9iG<)5pHkSB`q$vy
zIvS!ODuM?Wb*s9euIGlM7GNR;K~uT1j^oXObJm(N;=F*qX(hqkrQV&XjYnICWj#RQ
za|yDBcfEy6wp7Zit8E*f`MO;;nfj);`4f8)V#W#Y%1a^C;h?Rmkpa>#ew%~1b&%zj
zz3i)%kj(0-HcOWHj?>>No({cNVQIb-TFYd(^6e<vYC4|G>wUXN^<0wKt9O|@C!)id
z>?Z~Zg3J<%Vy>aOepc=@sH_Dk08Z|E>*zcG|L^}(HE?)fW`1;9R}^M`5)R0*VjLsP
zab1au-S0a35kU<H(emvp)n)(Hyk!S7TKAn8ycUFCQhpx_m`#wG0*wPCu35JPp)E-H
z`_+(d1M_s?&!f}`03K7zw&Y&AFNp}Ehf(1A`h9Op{s|>7YCL%RFHomu{4cOq<ZqDw
zZ?H$Ya|~Z}C;t5!Pwi1p)rKPhEE&orN50>GBobkTr@QOSE=iiRWtTx8>HVkYpYY7d
zMr?!E^ol}(QFoW*!(F6kMG)GWIr0*h&?>m-gljl7Igo_Y0as&+MX_DIroK8q5p16I
zp4bY{Ne;{fi-en3U#=r)H4ci*hgUM$bDkS0t&O^NX5)<MkO*rxkGrXX^mz18^isLJ
z3Ei1GzBEdaPMY&v<YLIRIXnlaZ-Ojpo_$x&!b2r8>TsHaaJhPXSyBbB6Zda}%cLzo
z*DYyO1e_=Ubna&b-_PPo9+XYiqJ^Va;br7VFb(p@bug3^{NfsPo(TH*6=>%bC>QTO
z|0%-<R+@a*t|U91Ex)sMoX&PI7EdE;^SX$>w0FF~%6D3|j#j4zzEX<eyv%K+U+zPv
z`9xlNhlU23`2;5OJsiutDbQKis>Jb#cXE}zmiozU85r%#IkNmPx2AckW}Sa-Q7Pnj
z2^9N94nZ3Qwxl%s(NfcE53>A-&bxYPAKkV+xz0a2*MB{jK}>TkFW|TXwwb=9UBai*
z3AT)%6AiDs-*fq5CGL=k-R=wJID6l84d+gWRev+zFDdW$g^lepX{H6ipD<7z32uam
zzM)CH`X1V~SKi|^phB)TZSiC4pur?E?>VgB1e@3N2r83HX~}+`wI%iOCBmro^KB(i
z6A`$HzO&z<uD5kh%&fM}G|9}0cnco=d3S5|UV62FN7TEV=1gl<&V6ykTCZM=+>WCg
z$%7ibq=K=sr$TxLX<9jFT-F&Tz$bJ9rDJsOt)b>6Ac36EO9@bqm`ZNs+sl_5#QtI4
zhh9N075<l}^sQ5+&3Cjpvevi|-!0VFh>?czA$;<RZ4T()-K`#T=-(5)I@%t47tVmr
zLQK#Xkg86*whYFL68hGm@Ik{t5GU-&d}l?s<pN*o<Lp?(QpLrOw1HX1GmWb6-Ozzx
ze)k`XJ#%1(Os=1<nQUVo@IO0RhKbv=Lfg?93FQ;yo9XjKQ@r4e?-5|jL8mO>4~v&C
z{VEtAzw7tqGq<*A*zx}q?3)C_RdkX)%HlDr-+j?1A0h|c>FGsCm$~d|s8q}-r*1Jr
z&--Pf$xv;k88|;qzTm)A@I%flkou15XBJ&FbQVtMo;RWlea*N)_H_Ck$mLgMqoUR}
zG_sW`6~qmn(p%FUiDK`<>I3c#7wA(Z&giEgwstsv8h&Z1Q_O<J7svPr?0iEw&^Y%v
z<*YsW{cb7VCjGpnv(#e^X~=n8br<f-kgmcKb;`9P)s13-M!sW`d1S}LU()2}EAFfb
zP=d1ZR%y$;&^pjr2`s9}QuoH2R5*BHexPPV)&gKsCTOA*CIW0-aph2bPj#E);;ZTI
zCjU&kmAoAP{L{y-sZHGtCq^&%m<;__YU)<u<w&WMZG$i2Ch?}9Q*Pe-!}8lZCT%8H
z7gDN$#%;@h>|dp-8J=EwAu?xNEIZM6maGdyEX0scAL1+3+1eFM)o*U@N|BdgKr~59
zebW_$7?3r-PQ7K$60sjcbbcD~9iI#v1FKZ5X*sFndx?g>?X8kbaA81-SXjoT-=i+l
z_*v7G`FIbg$O&lu<bh;<YNKKj=*5mP*D{2Q6M?EdrPs~b^e#iaElOBMQ+HbW){;Gc
zzNiv6j#W7+_VosbviWkGThEn%7?eB?VU8PH98^YNVoul%*AyV+*J}Ada4IKo`@_8}
z^zGX*sxT?a5vtGBNtF3k{{2&DtL45wJSwS{r&^t-;0PfV2q4lUZRwhf=AdsFlk4zt
zGT#&PfI7cea;Uj~nEJy-TMgC^9t#Ugs@!v@r4zP|dvj3p%ov(|=V2MolDbIF4vmDB
z<J7I{^UvIs2Zav(8P1iY<mK@}!EF3$_p`!hwe5syJ;4GyyXG!Y=Q>Y(87)|cqaimh
zljs*KOBNcv6jnc^>3?Zl+T;bM4tgm(1Y~t`UeU7=Y8DW#xO3ZA*uDlAs1h-ThRQ?{
zF+Q&x{{{jt^s(-60@vos-UjW!u6p|&zl1Iq@s~O!2bevDP=>pDh25rd&KE8=;5fWf
zJE?!6$z8(KTx4KQm%7Cvqo0|)H88sQ!4=TOw}X6$3mP*m$nXm}Q}gG+aaceG_dsiW
zyF}p`>v09@Y^}8;rve<nY5n$m41_Kb{Bh&9pjQ7B#(p=S<g0#2!;p~*xS-$ziUPc?
z1=(2z)tXBE+&TW5R!w)K<R<{AA}wr==mep?KFV25x|*C`1Vic%vi`UXZ{%jBw=clr
z5mXdvro()w2NHW<_>OB=P1T#W)A+pwr16wr8Tum_;wd}Li_T$4=99VH7*Dn20t_0k
zofzUYbeWiGwI(0y)}Lw8A4~Cv%&c#w^6hi6K&A42wp&^vH_PdgX#MKiyU6cWygzk(
z*O2D(UZQPFR~+|N0W#9|gL>`pIhgLVIaeE5@gDnO&2?{hhB>nSphA%icpsQDWCY@g
z0!;Oby@YgB(yOs47R~!o{=BUiR6FF8fqyfAJ#KCVArRv^&z6N}R<dRItTK)OSHv;n
zeK3KrSHoqW(Ts@r{<0+R0&DAFs+6rfWWZAub;z%0$AK5O@BT4>Q0qAU9(S*XT~(zA
zSKmy{X13%Yh^t5KfS`jCQTf`uOhMOaJ_pxz{>MDdJO$QdP~ziJAo|dP=!R9?g-*ti
zTI`n)EM#vri2CT|3uj>D!7tmIkK6`KN888)@#;c}Do?KsYmG2^3)Y8!xP`}jhe=%z
z&wZH_UL;NVu38=7-q{~6^qD2Wqp3&v`Q*l^On6CDs?~A(kUZ`s_KH3Ma-D5;6Vsq+
zv3Y%`g-1voIe;`I4$5fv5xWc8%p>hIT;mVt+j#U@1|zcW*eoR?X!yJS!;?;{^+yQK
zKG+g~6s`WHUJ*tqK!XqX*0tsT-srDR4ptP0qZr|3R7fyYZgg(F#h>3PH${K|ksxzC
zAToOG-+z+0YYT<Uq7S{V^3LZf>$W=EN6wCCE<=XH9Vsl?AF(5bkE536LV_;CebiWl
z+rVc_mtg{)H_|)8f3#XH$%dCcE%;n{45w>TGFF1b>?#NFs#rXqyj9AFDERE!EO_~)
z^8>OiX{lN&s`}R=BW~Df=ln7$^bo%y8Od8U8MqZ({4ZTz+;`LGlPE_>?bw61I-ntB
zkhN85U)-*Bnex1&){OWsNq`cH11|Q}!ke)MnSAr(TF{a1PP@c!hq`?2-V)Mv1LKxl
zf|Lu>pUm;Josd%R)ESlpEJ;pA40uWJ6>Y`0wGWBWwUzBCvSOefJMEq3VFr3r0xx0o
zr_CqNd`OW=AO`e|0BBTYKiJsx!x){?7Z75_oGYw+Cv_ULs|CP<7I&XpSzmOpK-kA&
zwqk0QR%^Kwp=(PK5MTW1j_!P}c2kpB%q`Ay1=Hc(`K2fHnKZb7!eBeJ+UPfhtpa3@
z=cOJb^Edo>c<U`?I8m@4x5uT$!6J!-r7T_OyfxG=XUm<Q1RQ6IMQ$p^9{%>55P+I}
zIj3)FVoM(63tg0c1FRjR`xXHTTUkE|Tftq*>W%6tK`%~6%k-=G!tOmO6WBZuk$4ES
zkOi~&_L^^c^e*ZSvjv`bjx8SGsHmgy?_N;tj&8?zZEC}PD{o(I4*Hg)8V3W8v8p_X
zN;1!=ntxC*hIvD=2jN+@%0IV_YCYpzI36Hkomg*kg{_yZ_xo0*x{=NyzVdMbpYz;<
z9?)iKx##b|<Y$G)b#*6;ve16m>d$*Pckb^gAtUD<CSNv`fMi7AP)fs&y@wr!ovo#Q
zTIHZ~^~c-ES{di;&`H6cp#XO}bqM=y##OfVO^!un$Wffk!))l8)3V7aR+Ad)OrK=9
z#*Tlvk?cT&2i7VO5hUOj-A55>$qwZpiSd3f{EG?FH!GgqC8r_aXmv%kI=rf0gHG1D
z*BQb@QHs!xfn#=aqHj@;85@)#Un~MO#@X^dlaV^5fmNGIQ<TFawcOR|Q(R-=QE(jZ
zwddS4gPeNk9p=Th8{bpJ>|#K*d7g#lpCA>R1VCNrNKn3_^)h}|v_y<yE3MDD@#n2a
zdf{y}-=l&p`GqNR@}!i{AI<PvE~q{qTn|X;D8<0VR6fW6v!Lbf@pImE&6q9D+7v#^
zA<+C5Q-Lak-O$<iQd_r1c#{l^q<EVXGjWZTngUFfq6F@5(N59X8LQWa%)7&s1mKE_
z_|e+|_1x1}Q}=vdM}rHR>JI4+MAm(j388Oty-IKsh^?y%(M>WYR29d6N!sy0NovyX
zGdS2cf|9BJN_wK4O6a#U<A)mmQ!<|a3G06){fe;!H`jTxh8?WA&dsN?1I<3CL%WoU
zmMF)ipl+)9oP{3bFP|Hf@LkVw9hkQ%sq01eqBJd3nr`b^FV2&D&j}+R9F7BynosJ0
zzx}>Cx<pdG^wj(3;k(^ReXOp*bG3(g#VZw+=vfVhHl<`}`4AO-8>j9U3W1&}iMj?a
zZfTf%9^Mz>Afq)k%7qc0qXOrGg}~z$yq8A~i8*@IkG1ARcgSYR>$>y#MB|h3oIJjx
zjGI|iV+ca`&TkAjZ-KK?0S`3q-<?c+M;o<OFWjDCW(Awv0*flq_$|2OQ`tYL)r@sv
zs$BWFXuLtyO0B!M(J?82e?~0${N)Bk0Z(}QSXS$IMsm37TB(Kel}+e7yi(rQy`iUu
z!b8$2L%~db9&C@AObX^G6xVV^>gI*tsgwvXX4c+xfbe=#)K>n37oJRQfS85A&sA7T
z)`O7de`;?UlWl__#PPu*IZ#}@-}e^taX@nL%S=yQ&InN1FVi&=%23{(g^g~#qD6A=
z(ob*`Y7a+`1MD61l?L&MlN#vOT5>f<{=odU1Yt89v>e;CAnu#`lN_88qnvXnX7TfX
zHRBFCrGw_%pXM0-*2bAsUpmje`f>kI`%TfAe{*>DpA3Jgc~^%|pVgtTe@`?VUq}E>
z6jeoD7zTTozdHDP<b*j|9$z3pogT}J`$OR}@rEMdR@IjWyWtI-@u$Z{SEpsB!A`Rg
zBJ78E${cYuB$m(Sc{mP9c{mUM{74mFtmyS0<@caX4dmi<!mS#`*8ATm9Af*HGDP?v
z`-2`U@``Bqr*nmmIX@|$5~q*qGam<AA4e&Bup|CPAR;94SU~8pfUu;okdTzHxRme{
zKKz4_(2SDW^uIuG_jGvS6!@<rRI=e*@CZEr%)#e{yQ8;{t-II1j)S9EYW_1nO;ueL
Jh>}g@e*-+3*}nh)

literal 0
HcmV?d00001


From acf06c8e8030288410435e00ad612c6dc67c35dd Mon Sep 17 00:00:00 2001
From: hyndavi <38397846+hyndavi17@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:59:29 -0400
Subject: [PATCH 17/28] Update README.md

---
 README.md | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/README.md b/README.md
index 928ba422c..e6de23211 100644
--- a/README.md
+++ b/README.md
@@ -414,19 +414,6 @@ here's the CUDA code
     }
 #endif // TTG_HAVE_CUDA
 ```
-| CPU Implementation Feature          | GPU Implementation Feature (CUDA)                                                                                                                                                                                                                     |
-|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Standard C++20 features             | Limited C++20 features due to CUDA restrictions                                                                                                                                                                                                       |
-| Task execution on CPU               | Task execution split between CPU (host) and GPU (device)                                                                                                                                                                                              |
-| Direct data passing between tasks   | Use of `ttg::Buffer` for managing data transfer between host and device memory                                                                                                                                                                        |
-| Simple TT task creation             | `make_tt<ES>` specifies that the task is meant for execution on an Execution Space (`ES`), which can be CUDA                                                                                                                                          |
-| Synchronous task execution          | Asynchronous task execution with `co_await`, waiting for GPU computation to complete, and data to be brought back to the host                                                                                                                         |
-| No explicit device code             | Separate device code (`cuda_kernel code`) and host/device memory management; Device code uses CUDA kernels, e.g., `cu_next_value`, to compute the next Fibonacci number on the GPU                                                                    |
-| Standard C++ serialization          | Custom serialization that aborts execution; in GPU code, serialization is not needed as the data does not leave the device memory space until computation is complete                                                                                 |
-| TTG manages data locality           | Explicit data locality management due to device memory requirements; use of `ttg::device::select`, `ttg::device::wait`, and `ttg::device::forward` to handle data between host and device                                                             |
-| Classic CPU memory model            | Complex memory model involving host and device memory; `Fn` struct now includes `std::unique_ptr` and `ttg::Buffer` to accommodate CUDA memory management                                                                                             |
-| TTG-based computation flow          | Computation flow is managed partially by TTG on the host side and by CUDA on the device side; Task continuation and flow control is achieved through the use of CUDA co-routines. |
-| TTG task invocation                 | Task invocation involves sending data to the GPU and managing the lifecycle of the computation on the device; use of `ttg::device::send` and `ttg::device::sendv`                                                                                     |
 
 
 

From a50cb40a577b572587c9f569a5384bc24dce15fc Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 22 Apr 2024 11:27:52 -0400
Subject: [PATCH 18/28] fix F_n < N CPU example

---
 doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
index 1a03149b2..fce6d9954 100644
--- a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
@@ -26,7 +26,7 @@ auto make_ttg_fib_lt(const int64_t F_n_max =1000) {
         if (next_f_n < F_n_max) {
           ttg::send<0>(n + 1, f_n);
         } else {
-          ttg::send<1>(n, f_n);
+          ttg::sendv<1>(f_n);
         }
       },
       ttg::edges(f2f), ttg::edges(f2f, f2p), "fib");

From aeebda803189bef1ee16e1ba200f7cb7384657cf Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 22 Apr 2024 11:29:37 -0400
Subject: [PATCH 19/28] more static asserts in In::send*

---
 ttg/ttg/terminal.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h
index 85c6ca5f2..57c287ed5 100644
--- a/ttg/ttg/terminal.h
+++ b/ttg/ttg/terminal.h
@@ -204,6 +204,9 @@ namespace ttg {
     template <typename Key = keyT, typename Value = valueT>
     std::enable_if_t<meta::is_none_void_v<Key, Value>, void>
     send(const Key &key, Value &&value) {
+      static_assert(meta::is_none_void_v<keyT, valueT>, "ttg::send<>() sending to a terminal expecting void key and value; use ttg::send<>() instead");
+      static_assert(!meta::is_void_v<keyT>, "ttg::send<>(key,value) sending to a terminal expecting void key; use ttg::sendv(value) instead");
+      static_assert(!meta::is_void_v<valueT>, "ttg::send<>(key,value) sending to a terminal expecting void value; use ttg::sendk(key) instead");
       constexpr auto value_is_rvref = !std::is_reference_v<Value>;
       if constexpr (value_is_rvref) {
         if (!move_callback) throw std::runtime_error("move callback not initialized");
@@ -216,6 +219,9 @@ namespace ttg {
 
     template <typename Key = keyT>
     std::enable_if_t<!meta::is_void_v<Key>, void> sendk(const Key &key) {
+      static_assert(!meta::is_void_v<keyT> && meta::is_void_v<valueT>, "ttg::sendk<>(key) sending to a terminal expecting void key and nonvoid value; use ttg::sendv<>(value) instead");
+      static_assert(!meta::is_void_v<keyT>, "ttg::sendk<>(key) sending to a terminal expecting void key; use ttg::send() instead");
+      static_assert(meta::is_void_v<valueT>, "ttg::sendk<>(key) sending to a terminal expecting nonvoid value; use ttg::send(key,value) instead");
       if (!send_callback) throw std::runtime_error("send callback not initialized");
       send_callback(key);
     }
@@ -223,6 +229,9 @@ namespace ttg {
     template <typename Value = valueT>
     std::enable_if_t<!meta::is_void_v<Value>, void> sendv(
         Value &&value) {
+      static_assert(meta::is_void_v<keyT> && !meta::is_void_v<valueT>, "ttg::sendv<>(value) sending to a terminal expecting nonvoid key and void value; use ttg::sendk<>(key) instead");
+      static_assert(meta::is_void_v<keyT>, "ttg::sendv<>(value) sending to a terminal expecting nonvoid key; use ttg::send(key, value) instead");
+      static_assert(!meta::is_void_v<valueT>, "ttg::sendv<>(value) sending to a terminal expecting void value; use ttg::send() instead");
       constexpr auto value_is_rvref = !std::is_reference_v<Value>;
       if constexpr (value_is_rvref) {
         if (!move_callback) throw std::runtime_error("move callback not initialized");
@@ -235,6 +244,9 @@ namespace ttg {
     }
 
     void send() {
+      static_assert(!meta::is_none_void_v<keyT, valueT>, "ttg::send<>() sending to a terminal expecting nonvoid key and value; use ttg::send<>(key,value) instead");
+      static_assert(meta::is_void_v<keyT>, "ttg::send<>() sending to a terminal expecting nonvoid key; use ttg::sendk<>(key) instead");
+      static_assert(meta::is_void_v<valueT>, "ttg::send<>() sending to a terminal expecting nonvoid value; use ttg::sendv<>(value) instead");
       if (!send_callback) throw std::runtime_error("send callback not initialized");
       send_callback();
     }

From 380f3250dfbf8207af9cbf88ed27b03d3ff2ceaf Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 22 Apr 2024 11:29:50 -0400
Subject: [PATCH 20/28] more static asserts in ttg::send*

---
 ttg/ttg/func.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h
index 28b75ace9..4273e7c66 100644
--- a/ttg/ttg/func.h
+++ b/ttg/ttg/func.h
@@ -211,6 +211,8 @@ namespace ttg {
   template <typename keyT, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline std::enable_if_t<meta::is_none_void_v<keyT, std::decay_t<valueT>>, void> send(size_t i, const keyT &key,
                                                                                        valueT &&value) {
+    // to avoid mixups due to value being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<valueT>> && !meta::is_output_terminal_tuple_v<std::decay_t<valueT>>, "ttg::send(i, key, x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. sendk<i>(key, x)?");
     detail::value_copy_handler<Runtime> copy_handler;
     auto *terminal_ptr = detail::get_out_terminal<keyT, valueT>(i, "ttg::send(i, key, value)");
     terminal_ptr->send(key, copy_handler(std::forward<valueT>(value)));
@@ -226,6 +228,8 @@ namespace ttg {
   template <size_t i, typename keyT, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline std::enable_if_t<meta::is_none_void_v<keyT, std::decay_t<valueT>>, void> send(const keyT &key,
                                                                                        valueT &&value) {
+    // to avoid mixups due to value being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<valueT>> && !meta::is_output_terminal_tuple_v<std::decay_t<valueT>>, "ttg::send<i>(key, x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. sendk<i>(key, x)?");
     send(i, key, std::forward<valueT>(value));
   }
 
@@ -248,6 +252,8 @@ namespace ttg {
   // clang-format on
   template <typename keyT>
   inline std::enable_if_t<!meta::is_void_v<keyT>, void> sendk(std::size_t i, const keyT &key) {
+    // to avoid mixups due to key being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<keyT>> && !meta::is_output_terminal_tuple_v<std::decay_t<keyT>>, "ttg::sendk(i, x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. send<i>(x)?");
     auto *terminal_ptr = detail::get_out_terminal<keyT, void>(i, "ttg::sendk(i, key)");
     terminal_ptr->sendk(key);
   }
@@ -260,6 +266,8 @@ namespace ttg {
   // clang-format on
   template <size_t i, typename keyT>
   inline std::enable_if_t<!meta::is_void_v<keyT>, void> sendk(const keyT &key) {
+    // to avoid mixups due to key being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<keyT>> && !meta::is_output_terminal_tuple_v<std::decay_t<keyT>>, "ttg::sendk<i>(x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. send<i>(x)?");
     sendk(i, key);
   }
 
@@ -284,6 +292,8 @@ namespace ttg {
   // clang-format on
   template <typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline std::enable_if_t<!meta::is_void_v<std::decay_t<valueT>>, void> sendv(std::size_t i, valueT &&value) {
+    // to avoid mixups due to key being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<valueT>> && !meta::is_output_terminal_tuple_v<std::decay_t<valueT>>, "ttg::sendv(i, x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. send<i>(x)?");
     detail::value_copy_handler<Runtime> copy_handler;
     auto *terminal_ptr = detail::get_out_terminal<void, valueT>(i, "ttg::sendv(i, value)");
     terminal_ptr->sendv(copy_handler(std::forward<valueT>(value)));
@@ -297,6 +307,8 @@ namespace ttg {
   // clang-format on
   template <size_t i, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline std::enable_if_t<!meta::is_void_v<std::decay_t<valueT>>, void> sendv(valueT &&value) {
+    // to avoid mixups due to key being a terminal tuple
+    static_assert(!meta::is_output_terminal_v<std::decay_t<valueT>> && !meta::is_output_terminal_tuple_v<std::decay_t<valueT>>, "ttg::sendv<i>(x) - invalid invocation, x cannot be a terminal or a tuple of terminals; did you mean to call another version of send, e.g. send<i>(x)?");
     sendv(i, std::forward<valueT>(value));
   }
 

From 5b1381b70c9ee61776e29096c78ecba820a55aca Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 22 Apr 2024 11:31:03 -0400
Subject: [PATCH 21/28] cleanup fibonacci dev example cmake harness

---
 doc/dox/dev/devsamp/main/CMakeLists.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/dox/dev/devsamp/main/CMakeLists.txt b/doc/dox/dev/devsamp/main/CMakeLists.txt
index ba2593927..6e7777462 100644
--- a/doc/dox/dev/devsamp/main/CMakeLists.txt
+++ b/doc/dox/dev/devsamp/main/CMakeLists.txt
@@ -5,11 +5,12 @@ find_package(ttg REQUIRED)
 
 add_ttg_executable(test test.cpp NOT_EXCLUDE_FROM_ALL)
 
-add_ttg_executable(fibonacci fibonacci.cc LINK_LIBRARIES std::coroutine RUNTIMES "parsec")
+add_ttg_executable(fibonacci fibonacci/fibonacci.cc NOT_EXCLUDE_FROM_ALL)
 # Fib device test
 if (TTG_HAVE_CUDA)
-    add_ttg_executable(fibonacci_device fibonacci_device.cc
-            fibonacci_cuda_kernel.h
-            fibonacci_cuda_kernel.cu
-            LINK_LIBRARIES std::coroutine RUNTIMES "parsec")
+    add_ttg_executable(fibonacci/fibonacci_device
+            fibonacci/fibonacci_device.cc
+            fibonacci/fibonacci_cuda_kernel.h
+            fibonacci/fibonacci_cuda_kernel.cu
+            LINK_LIBRARIES std::coroutine RUNTIMES "parsec" NOT_EXCLUDE_FROM_ALL)
 endif()
\ No newline at end of file

From a6a48ac1ee5f1084bef565ad6ea89e38088be2ab Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 22 Apr 2024 11:35:22 -0400
Subject: [PATCH 22/28] [cosmetic] fibonacci CPU devsamp

---
 doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
index fce6d9954..d2d829c45 100644
--- a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
+++ b/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
@@ -14,7 +14,7 @@ struct Fn {
     ar & F;
   }
 };
-auto make_ttg_fib_lt(const int64_t F_n_max =1000) {
+auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
   ttg::Edge<int64_t, Fn> f2f;
   ttg::Edge<void, Fn> f2p;
 
@@ -45,8 +45,7 @@ auto make_ttg_fib_lt(const int64_t F_n_max =1000) {
 
 int main(int argc, char* argv[]) {
   ttg::initialize(argc, argv, -1);
-  int64_t N = 1000;
-  if (argc > 1) N = std::atol(argv[1]);
+  int64_t N = (argc > 1) ? std::atol(argv[1]) : 1000;
 
   auto fib = make_ttg_fib_lt(N);
   ttg::make_graph_executable(fib.get());

From 653fc72a8c887941b92816382220b67709ecbf2b Mon Sep 17 00:00:00 2001
From: hyndavi <38397846+hyndavi17@users.noreply.github.com>
Date: Tue, 4 Jun 2024 09:19:24 -0400
Subject: [PATCH 23/28] README.md - Added Image Source

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e6de23211..425663321 100644
--- a/README.md
+++ b/README.md
@@ -322,7 +322,7 @@ This function creates a TTG composed of multiple TTs, whereas before we had disp
 - The fib task sends either a new Fn object to itself for further computation or to the print task if the condition next_f_n < F_n_max is not met.The print task simply outputs the received Fibonacci number and completes the computation.
 
 ![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png)
-
+Image Source - https://lucid.app/lucidchart/a42b4027-156d-4cd0-8167-7d4acd205996/edit?viewport_loc=-1180%2C-522%2C2023%2C1165%2C0_0&invitationId=inv_bf9ace23-880a-4b36-a51c-d49c63913fad
 ###  CUDA Version
 
 First show complete example, split into host and device code (single source cannot work since CUDA does not support C++20 and probably cannot handle TTG hyper-C++ anyway).

From 36b793b4e40b6ce5e43d1a99d03da19451bf04a7 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 4 Jun 2024 17:38:11 -0400
Subject: [PATCH 24/28] [dox] update shell codeblock in INSTALL.md to use
 standard annotator

---
 INSTALL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.md b/INSTALL.md
index bb459f618..b8974f927 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,6 +1,6 @@
 # synopsis
 
-```.sh
+```sh
 $ git clone https://github.com/TESSEorg/ttg.git
 $ cmake -S ttg -B ttg/build -DCMAKE_INSTALL_PREFIX=/path/to/ttg/install [optional cmake args]
 $ cmake --build ttg/build

From e87a2c280592077312d53579cfaabb4c92fab218 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 4 Jun 2024 17:38:40 -0400
Subject: [PATCH 25/28] [dox] updating README.md for GPU ... WIP

---
 README.md | 128 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 57 deletions(-)

diff --git a/README.md b/README.md
index e6de23211..c842add8f 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,13 @@ This is the C++ API for the Template Task Graph (TTG) programming model for flow
 
 # Why TTG?
 
-- TTG marries the idea of flowgraph programming models with the key innovations in the PARSEC runtime for compact specification of DAGs (PTG).
-- TTG can be used to efficiently compose and execute irregular computation patterns which are poorly served by the current programming and execution models.
-- TTG has strong support for distributed hybrid architectures for running modern scientific algorithms efficiently on current and near-future supercomputers.
+TTG might be for you if you want fine-grained parallel execution of complex (especially, data-dependent) algorithms on distributed-memory heterogeneous machines, for these reasons:
+
+  - programming models that target fine-grained parallelism, like native language tools (threads, async) and programming models/libraries (OpenMP, TaskFlow, Cilk, etc.) deal only with control flow, and thus are poorly suited for dealing with data-dependent execution
+  - such models do not deal with distributed memory anyway
+  - and specialized runtimes like HPC, UPC++, StarPU, MADNESS, PaRSEC, etc., are still relatively low-level abstractions for expressing complex data-dependent task flows across modern distributed heterogeneous machines.
+
+The development of TTG was motivated by _irregular_ scientific applications like adaptive multiresolution numerical calculus and data-sparse tensor algebra which have lacked tools to keep up with the evolution of HPC platforms, especially toward heterogeneity. But TTG is far more widely applicable than that; it is a general-purpose programming model.
 
 # Installation
 
@@ -19,6 +23,7 @@ This is the C++ API for the Template Task Graph (TTG) programming model for flow
 ## TL;DR: A "Hello, World" TTG Program
 
 `helloworld.cpp`
+
 ```cpp
 #include <ttg.h>
 
@@ -38,7 +43,8 @@ int main(int argc, char *argv[]) {
 ```
 
 `CMakeLists.txt`
-~~~~~~~~~~~~~{.cmake}
+
+```cmake
 cmake_minimum_required(VERSION 3.19)
 project(TTG-HW CXX)
 
@@ -52,10 +58,11 @@ endif()
 add_executable(hw-parsec helloworld.cpp)
 target_link_libraries(hw-parsec PRIVATE ttg-parsec)
 target_compile_definitions(hw-parsec PRIVATE TTG_USE_PARSEC=1)
-~~~~~~~~~~~~~
+```
 
 Configure + build:
-```shell
+
+```sh
 > cmake -S . -B build && cmake --build build --target hw-parsec
 ```
 
@@ -92,25 +99,31 @@ Let's go over each of these steps using the "Hello, World!" example.
 
 TTG C++ implementation is currently supported by 2 backends providing task scheduling, data transfer, and resource management.
 While it is possible to use specific TTG backend explicitly, by using the appropriate namespaces, it is recommended to write backend-neutral programs that can be specialized to a particular backend as follows.
-1. By defining one (and only one) of the following macros, via the command-line argument to the compiler (recommended) or as an explicit `#define` statement in the source code:
-   - `TTG_USE_PARSEC`: selects the PaRSEC backend as the default;
-   - `TTG_USE_MADNESS`: selects the MADNESS backend as the default (expert-use only).
 
-   Following the definition of this macro it is safe to include the top-level TTG header file:
-```cpp
+  1. By defining one (and only one) of the following macros, via the command-line argument to the compiler (recommended) or as an explicit `#define` statement in the source code:
+    - `TTG_USE_PARSEC`: selects the PaRSEC backend as the default;
+    - `TTG_USE_MADNESS`: selects the MADNESS backend as the default (expert-use only).
+
+      Following the definition of this macro it is safe to include the top-level TTG header file:
+
+      ```cpp
 #include <ttg.h>
-```
-2. By including the corresponding backend-specific header directly:
+      ```
+
+  2. By including the corresponding backend-specific header directly:
     - to use PaRSEC backend only, add:
-```cpp
+
+      ```cpp
 #include <ttg/parsec/ttg.h>
-```
+      ```
+
     - to use the MADNESS backend only, add:
-```cpp
+
+      ```cpp
 #include <ttg/madness/ttg.h>
-```
+      ```
 
-  This approach does not require inclusion of the top-level TTG header or definition of a backend selection macro.
+   This approach does not require inclusion of the top-level TTG header or definition of a backend selection macro.
 
 ### Initialize
 
@@ -121,6 +134,7 @@ To initialize TTG runtime invoke `ttg::initialize(argc, argv)`; there are severa
 To make a TTG create and connect one or more TTs. The simplest TTG consists of a single TT.
 
 The "Hello, World!" example contains a single TT that executes a single task (hence, task ID can be omitted, i.e., void) that does not take and produce any data. The easiest way to make such a TT  is by wrapping a callable (e.g., a lambda) with `ttg::make_tt`:
+
 ```cpp
   auto tt = ttg::make_tt([]() { std::cout << "Hello, World!"; });
 ```
@@ -128,22 +142,26 @@ The "Hello, World!" example contains a single TT that executes a single task (he
 ## Execute TTG
 
 To execute a TTG we must make it executable (this will declare the TTG complete). To execute the TTG its root TT must receive at least one message; since in this case the task does not receive either task ID or data the message is empty (i.e., void):
+
 ```cpp
   ttg::make_graph_executable(tt);
   ttg::execute();
   if (ttg::get_default_world().rank() == 0)
       tt->invoke();
 ```
+
 Note that we must ensure that only one such message must be generated. Since TTG execution uses the Single Program Multiple Data (SPMD) model,
 when launching the TTG program as multiple processes only the first process (rank) gets to send the message.
 
 ## Finalize TTG
 Since TTG program is executed asynchronously, we must ensure that all tasks are finished:
+
 ```cpp
   ttg::fence();
 ```
 
 Before exiting `main()` the TTG runtime should be finalized:
+
 ```cpp
   ttg::finalize();
 ```
@@ -163,9 +181,10 @@ Although the example lacks opportunity for parallelism,  the point here is not p
 
 This example illustrates how to compute a particular element of the Fibonacci sequence
 defined by recurrence
-<img src="https://latex.codecogs.com/svg.image?F_N&space;=&space;F_{N-1}&space;&plus;&space;F_{N-2},&space;\quad&space;F_0=0,&space;\quad&space;F_1=1" title="https://latex.codecogs.com/svg.image?F_N = F_{N-1} + F_{N-2}, \quad F_0=0, \quad F_1=1" />.
+$F_N = F_{N-1} + F_{N-2}, F_0=0, F_1=1$.
 
 `nth-fibonacci.cpp`
+
 ```cpp
 #include <ttg.h>
 
@@ -203,36 +222,27 @@ int main(int argc, char *argv[]) {
 
 The TTG consists of 2 TTs, one (`fib`) that implements the Fibonacci recurrence and another (`print`) that prints the result to
 `std::cout`:
-- `fib` computes <img src="https://render.githubusercontent.com/render/math?math=F_{n}"> from <img src="https://render.githubusercontent.com/render/math?math=F_{n-1}"> and <img src="https://render.githubusercontent.com/render/math?math=F_{n-2}">
-  and either sends <img src="https://render.githubusercontent.com/render/math?math=F_{n}"> and <img src="https://render.githubusercontent.com/render/math?math=F_{n-1}"> to the next (`n+1`)
-  instance of `fib`, or, if `n==N`, sends <img src="https://render.githubusercontent.com/render/math?math=F_{n}"> to `print`. Thus `fib`
-  needs 2 input terminals and 3 output terminals (for better efficiency instead of
-  sending individual  Fibonacci numbers, each over an individual edge, it is better to send
-  a pair of Fibonacci numbers over a single edge).
-- `print` receives a single unannotated datum and produces no data, so it needs a single input terminal and no output terminals.
-
-Execution of the program starts by explicitly instantiating `fib` for `n=2`.
-In total 20 tasks will be executed: 19 instances of `fib` with `n=2..20` and the single instance of `print`.
-
-Note that unlike typical task-based implementations in the literature which construct tasks _recursively_,
-i.e., the task for
-computing <img src="https://render.githubusercontent.com/render/math?math=F_{n}">
-is created before the task computing <img src="https://render.githubusercontent.com/render/math?math=F_{n-1}">,
-the TTG implementation constructs the tasks in the order of increasing `n`. This is because
-parametric dataflow of TTG naturally expresses inductive (push) computation patterns rather than
-recursive (pull) computation patterns. However, it is easy to implement proper recursion by
-separating the downward flow of control (task creation,
-<img src="https://render.githubusercontent.com/render/math?math=F_{n} \to F_{n-1},F_{n-2}">)
+  
+  - `fib` computes $F_{n}$ from $F_{n-1}$ and $F_{n-2}$ and either sends $F_{n}$ and $F_{n-1}$ to the next ($n+1$)
+  instance of `fib`, or, if $n=N$, sends $F_{n}$ to `print`. Thus `fib`
+  needs 2 input terminals and 3 output terminals (for better efficiency instead of sending individual  Fibonacci numbers, each over an individual edge, it is better to send a pair of Fibonacci numbers over a single edge).
+  - `print` receives a single unannotated datum and produces no data, so it needs a single input terminal and no output terminals.
+
+Execution of the program starts by explicitly instantiating `fib` for $n=2$.
+In total 20 tasks will be executed: 19 instances of `fib` with $n=2\dots20$ and the single instance of `print`.
+
+Note that unlike typical task-based implementations in the literature which construct tasks _recursively_, i.e., the task for
+computing $F_{n}$ is created before the task computing $F_{n-1}$, the TTG implementation constructs the tasks in the order of increasing $n$. This is because parametric dataflow of TTG naturally expresses inductive (push) computation patterns rather than recursive (pull) computation patterns. However, it is easy to implement proper recursion by separating the downward flow of control (task creation, $F_{n} \to F_{n-1},F_{n-2}$)
 from the upward flow of data (task evaluation,
-<img src="https://render.githubusercontent.com/render/math?math=F_{n-1},F_{n-2} \to F_{n}">).
+$F_{n-1},F_{n-2} \to F_{n}$).
 
 ## Data-Dependent Example : Largest Fibonacci Number < N
 
-To illustrate the real power of TTG let's tweak the problem slightly: instead of computing first N Fibonacci numbers let's find the largest Fibonacci number smaller than some N. The key difference in the latter case is that, unlike the former, the number of tasks is NOT known a priori; furthermore, to make a decision whether we need to compute next Fibonacci number we must examine the value returned by the previous task. This is an example of data-dependent tasking, where the decision which (if any) task to execute next depends on the values produced by previous tasks. The ability to compose regular as well as data-dependent task graphs is a distinguishing strength of TTG.
+To illustrate the real power of TTG let's tweak the problem slightly: instead of computing first $N$ Fibonacci numbers let's find the largest Fibonacci number smaller than some $N$. The key difference in the latter case is that, unlike the former, the number of tasks is NOT known a priori; furthermore, to make a decision whether we need to compute next Fibonacci number we must examine the value returned by the previous task. This is an example of data-dependent tasking, where the decision which (if any) task to execute next depends on the values produced by previous tasks. The ability to compose regular as well as data-dependent task graphs is a distinguishing strength of TTG.
 
 To make things even more interesting, we will demonstrate how to implement such program both for execution on CPUs as well as on accelerators (GPUs).
 
-### Here's  CPU Version
+### The CPU Version
 
 ```cpp
 #include <ttg.h>
@@ -251,6 +261,7 @@ struct Fn {
     ar & F;
   }
 };
+
 auto make_ttg_fib_lt(const int64_t) {
   ttg::Edge<int64_t, Fn> f2f;
   ttg::Edge<void, Fn> f2p;
@@ -304,24 +315,20 @@ int main(int argc, char* argv[]) {
 
 [//]: # (- `make_ttg_fib_lt` creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other &#40;i.e. there was no explicit graph object&#41;. This allows to support composition of multiple TTGs together, as described in Herault et al DOI 10.1109/PAW-ATM56565.2022.00008)
 
-### Utility of Fn struct
-Fn aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently.This arrangement allows each task to access and modify both current and previous Fibonacci values without the need for separate data fields or additional communication overhead.
+#### Utility of `Fn` struct
+`Fn` aggregates 2 pieces of data that were separate before in preparation for aggregating datums into single continguous chunks that can be allocated on GPU more efficiently.This arrangement allows each task to access and modify both current and previous Fibonacci values without the need for separate data fields or additional communication overhead.
 
-- F[0] and F[1] store the current and previous Fibonacci numbers, respectively.
-- The default constructor initializes the sequence starting values, with F[0] as 1 (first Fibonacci number) and F[1] as 0 (base case).
+- `F[0]` and `F[1]` store the current ($F_n$) and previous ($F_{n-1}$) Fibonacci numbers, respectively.
+- The default constructor starts the iteration by initializing `F[0]=1` and `F[1]=0`.
 
-##### Serialization Functions in Fn
-Serialize functions are useful to communicate the struct among the tasks. TTG leverages these functions to serialize and deserialize the data as it is sent and received through the task graph.
+Because `Fn` is now a user-defined type, for TTG to be able to copy/move it between tasks it needs to know how to serialize and deseralize it.
+ functions are useful to communicate the struct among the tasks. TTG leverages these functions to serialize and deserialize the data as it is sent and received through the task graph.
 
-### Utility of make_ttg_fib_lt
-This function creates a TTG composed of multiple TTs, whereas before we had disparate TTs connected to each other (i.e. there was no explicit graph object). This allows to support composition of multiple TTGs together
+#### Why `make_ttg_fib_lt`?
 
-- The function make_ttg_fib_lt constructs the task graph that calculates Fibonacci numbers. It defines two template tasks (fib and print) and their connections via edges.
-- Task 1 - fib Task: Computes the next Fibonacci number and decides whether to continue the sequence or send the result to the print task.Receives an Fn object containing the current and previous Fibonacci numbers.Sends updated Fn objects either back to itself for the next computation or to the print task if the maximum is reached or exceeded.
-- Task2 - print Task: Outputs the largest Fibonacci number less than F_n_max.Receives an Fn object from the fib task. No output terminals, as its sole purpose is to display the result.Just uses std::out and prints the largest fibonacci until N.
-- The fib task sends either a new Fn object to itself for further computation or to the print task if the condition next_f_n < F_n_max is not met.The print task simply outputs the received Fibonacci number and completes the computation.
+Until now we have constructed individual TTs and linked them together; i.e.,  TTGs until now was implicit. Function `make_ttg_fib_lt` instead explicitly creates a graph of TTs (a TTG). This seemingly small step will greatly improve composability by allowing to use entire TTGs as a component of other graphs by stitching it with TTs or TTGs together.
 
-![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png)
+[//]: ![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png)
 
 ###  CUDA Version
 
@@ -383,7 +390,10 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
 }
 ```
 
+`Fn` 
+
 [//]: # (Walk through the key differences ... potentially we could show both side by side ... not sure how to do that in Markdown though ...)
+
 ### Differences in the Code Implementation 
 | Aspect                             | CPU Implementation Code                         | GPU Implementation Code (CUDA)                                                                                                                                |
 |------------------------------------|-------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -401,8 +411,9 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
 | Task Invocation                    | Task invocation within TTG: `fib->invoke(...);` | Task invocation with host-device communication: `fib->template in<0>()->send(1, Fn{});`                                                                       |
 
 here's the CUDA code
+
 ```cpp
- #include "fibonacci_cuda_kernel.h"
+#include "fibonacci_cuda_kernel.h"
 #ifdef TTG_HAVE_CUDA
     __global__ void cu_next_value(int64_t* fn_and_fnm1) {
       int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1];
@@ -481,7 +492,9 @@ For example, executing the Fibonacci program described above using 2 MPI process
 ![Fibonacci_traces_example](doc/images/nth-fib-trace-2proc-2thr.png) 
 
 # TTG reference documentation
-TTG API documentation is available for the following versions:0
+
+TTG API documentation is available for the following versions:
+
 - [master branch](https://tesseorg.github.io/ttg/dox-master) .
 
 # Cite
@@ -492,5 +505,6 @@ When referring to TTG in an academic setting please cite the following publicati
 # Acknowledgment
 
 The development of TTG was made possible by:
+
 - [The EPEXA project](https://tesseorg.github.io/), currently supported by the National Science Foundation under grants [1931387](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931387) at Stony Brook University, [1931347](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931347) at Virginia Tech, and [1931384](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931384) at the University of Tennesse, Knoxville.
 - The TESSE project, supported by the National Science Foundation under grants [1450344](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1450344) at Stony Brook University, [1450262](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1450262) at Virginia Tech, and [1450300](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1450300) at the University of Tennesse, Knoxville.

From e99f2e73732b283ad1d1665b35417ef77f4552ff Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 9 Jun 2024 11:14:21 -0400
Subject: [PATCH 26/28] split off fibonacci devsample

---
 .github/workflows/cmake.yml                        |  6 ++++--
 doc/dox/dev/devsamp/fibonacci/CMakeLists.txt       | 14 ++++++++++++++
 .../dev/devsamp/{main => }/fibonacci/fibonacci.cc  |  0
 .../{main => }/fibonacci/fibonacci_cuda_kernel.cu  |  0
 .../{main => }/fibonacci/fibonacci_cuda_kernel.h   |  0
 .../{main => }/fibonacci/fibonacci_device.cc       |  0
 doc/dox/dev/devsamp/main/CMakeLists.txt            | 12 +-----------
 7 files changed, 19 insertions(+), 13 deletions(-)
 create mode 100644 doc/dox/dev/devsamp/fibonacci/CMakeLists.txt
 rename doc/dox/dev/devsamp/{main => }/fibonacci/fibonacci.cc (100%)
 rename doc/dox/dev/devsamp/{main => }/fibonacci/fibonacci_cuda_kernel.cu (100%)
 rename doc/dox/dev/devsamp/{main => }/fibonacci/fibonacci_cuda_kernel.h (100%)
 rename doc/dox/dev/devsamp/{main => }/fibonacci/fibonacci_device.cc (100%)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 5e311df1f..ae87e0f63 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -110,8 +110,10 @@ jobs:
       working-directory: ${{github.workspace}}/build
       shell: bash
       run: |
-        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp/CMakeFiles/CMakeOutput.log && cat test_install_devsamp/CMakeFiles/CMakeError.log)
-        cmake --build test_install_devsamp
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp_main -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_main/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_main/CMakeFiles/CMakeError.log)
+        cmake --build test_install_devsamp_main
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log)
+        cmake --build test_install_devsamp_fibonacci
         cmake -E make_directory test_install_userexamples
         cat > test_install_userexamples/CMakeLists.txt <<EOF
         cmake_minimum_required(VERSION 3.14)
diff --git a/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt b/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt
new file mode 100644
index 000000000..e5058cb43
--- /dev/null
+++ b/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.14)
+project(ttg-devsample-fibonacci)
+
+find_package(ttg REQUIRED)
+
+add_ttg_executable(fibonacci fibonacci.cc NOT_EXCLUDE_FROM_ALL)
+# Fib device test
+if (TTG_HAVE_CUDA)
+    add_ttg_executable(fibonacci_cuda
+            fibonacci_device.cc
+            fibonacci_cuda_kernel.h
+            fibonacci_cuda_kernel.cu
+            LINK_LIBRARIES std::coroutine RUNTIMES "parsec" NOT_EXCLUDE_FROM_ALL)
+endif()
\ No newline at end of file
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/fibonacci/fibonacci.cc
similarity index 100%
rename from doc/dox/dev/devsamp/main/fibonacci/fibonacci.cc
rename to doc/dox/dev/devsamp/fibonacci/fibonacci.cc
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu b/doc/dox/dev/devsamp/fibonacci/fibonacci_cuda_kernel.cu
similarity index 100%
rename from doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.cu
rename to doc/dox/dev/devsamp/fibonacci/fibonacci_cuda_kernel.cu
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h b/doc/dox/dev/devsamp/fibonacci/fibonacci_cuda_kernel.h
similarity index 100%
rename from doc/dox/dev/devsamp/main/fibonacci/fibonacci_cuda_kernel.h
rename to doc/dox/dev/devsamp/fibonacci/fibonacci_cuda_kernel.h
diff --git a/doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc b/doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc
similarity index 100%
rename from doc/dox/dev/devsamp/main/fibonacci/fibonacci_device.cc
rename to doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc
diff --git a/doc/dox/dev/devsamp/main/CMakeLists.txt b/doc/dox/dev/devsamp/main/CMakeLists.txt
index 6e7777462..5a127cd97 100644
--- a/doc/dox/dev/devsamp/main/CMakeLists.txt
+++ b/doc/dox/dev/devsamp/main/CMakeLists.txt
@@ -1,16 +1,6 @@
 cmake_minimum_required(VERSION 3.14)
-project(test)
+project(ttg-devsample-main)
 
 find_package(ttg REQUIRED)
 
 add_ttg_executable(test test.cpp NOT_EXCLUDE_FROM_ALL)
-
-add_ttg_executable(fibonacci fibonacci/fibonacci.cc NOT_EXCLUDE_FROM_ALL)
-# Fib device test
-if (TTG_HAVE_CUDA)
-    add_ttg_executable(fibonacci/fibonacci_device
-            fibonacci/fibonacci_device.cc
-            fibonacci/fibonacci_cuda_kernel.h
-            fibonacci/fibonacci_cuda_kernel.cu
-            LINK_LIBRARIES std::coroutine RUNTIMES "parsec" NOT_EXCLUDE_FROM_ALL)
-endif()
\ No newline at end of file

From 0f498bce2116ede8fd156b71532599d21fb138fa Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 9 Jun 2024 12:06:50 -0400
Subject: [PATCH 27/28] typo

---
 ttg/ttg/util/env.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttg/ttg/util/env.h b/ttg/ttg/util/env.h
index 4e07bff7b..f78e03640 100644
--- a/ttg/ttg/util/env.h
+++ b/ttg/ttg/util/env.h
@@ -21,7 +21,7 @@ namespace ttg {
     /// using the unofficial extension MPIX_Query_cuda_support). However, since not all MPI implementations
     /// support this extension, users can force the use of device buffers in communication by setting
     /// `TTG_FORCE_DEVICE_COMM` to a non-negative number.
-    /// @return true if the user wants to force the use of device-side buffers in communicaton.
+    /// @return true if the user wants to force the use of device-side buffers in communication.
     bool force_device_comm();
 
   }  // namespace detail

From d614c92ed7e3676d4015445ff3abc308bcb987fa Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 9 Jun 2024 12:07:41 -0400
Subject: [PATCH 28/28] [skip ci] README: device example completed

---
 README.md | 70 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index fd99b01dc..ed043fe67 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ TTG might be for you if you want fine-grained parallel execution of complex (esp
 
   - programming models that target fine-grained parallelism, like native language tools (threads, async) and programming models/libraries (OpenMP, TaskFlow, Cilk, etc.) deal only with control flow, and thus are poorly suited for dealing with data-dependent execution
   - such models do not deal with distributed memory anyway
-  - and specialized runtimes like HPC, UPC++, StarPU, MADNESS, PaRSEC, etc., are still relatively low-level abstractions for expressing complex data-dependent task flows across modern distributed heterogeneous machines.
+  - and specialized runtimes like StarPU, PaRSEC, MADNESS, HPX, UPC++, etc., are still relatively low-level abstractions for expressing complex data-dependent task flows across modern distributed heterogeneous machines.
 
 The development of TTG was motivated by _irregular_ scientific applications like adaptive multiresolution numerical calculus and data-sparse tensor algebra which have lacked tools to keep up with the evolution of HPC platforms, especially toward heterogeneity. But TTG is far more widely applicable than that; it is a general-purpose programming model.
 
@@ -77,6 +77,7 @@ The basic model of computation is built around a Template Task Graph (TTG). A TT
 Thus, task creation is a byproduct of messages traveling through one or more TTGs. What makes the model powerful is the ability to encode large DAGs of tasks compactly.
 
 Before proceeding further, let's refine the few concepts used to define the programming model above:
+
 - `TaskId` (aka `Key`): A unique identifier for each task. It must be _perfectly_ hashable.
 - `Terminal`: A port for receiving (input)  and sending (output) messages. Each message consists of a (potentially void) `TaskId` and an (optional) datum. Terminals are strongly-typed. An {in,out}put terminal can be connected to one or more {out,in}put terminal (as long as the `TaskId` and datum types match). Input terminals are programmable (e.g., incoming messages can be optionally reduced).
 - `TemplateTask` (aka `TT`): This is a _template_ for creating tasks. Task template creates a task associated with a given `TaskId` when every input terminal received messages for the given `TaskId`.
@@ -87,6 +88,7 @@ Due to its simplicity only template tasks appear in the "Hello, World!" program.
 ## Structure of a Minimal TTG Program
 
 Every TTG program must:
+
 - select the TTG backend,
 - initialize the TTG runtime,
 - construct a TTG by declaring its constituent nodes,
@@ -326,13 +328,17 @@ Because `Fn` is now a user-defined type, for TTG to be able to copy/move it betw
 
 #### Why `make_ttg_fib_lt`?
 
-Until now we have constructed individual TTs and linked them together; i.e.,  TTGs until now was implicit. Function `make_ttg_fib_lt` instead explicitly creates a graph of TTs (a TTG). This seemingly small step will greatly improve composability by allowing to use entire TTGs as a component of other graphs by stitching it with TTs or TTGs together.
+Until now we have constructed individual TTs and linked them together; i.e.,  TTGs until now was implicit. Function `make_ttg_fib_lt` instead explicitly creates a graph of TTs (a TTG). This seemingly small step helps improve composability by allowing to use entire TTGs as a component of other graphs by stitching it with TTs or TTGs together.
+
+[//]: ![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png)
 
-[//]: ![Fibonacci_TTG_example](doc/images/fibonacci_ttg.png) Image Source - https://lucid.app/lucidchart/a42b4027-156d-4cd0-8167-7d4acd205996/edit?viewport_loc=-1180%2C-522%2C2023%2C1165%2C0_0&invitationId=inv_bf9ace23-880a-4b36-a51c-d49c63913fad
+###  Device Version
 
-###  CUDA Version
+It is currently not possible to have a general-purpose task runtime execute purely on device, hence TTG and the underlying runtimes execute tasks on the host (CPU), and these tasks launch device _kernels_. For technical reasons it is necessary to split the code into the host-only part, which looks remarkably like the CPU-only version above, and the device-specific part that implements the core part of the computation on the device. In the future it _may_ become possible to have single-source programs that contain both host and device parts contain in the same source file.
 
-First show complete example, split into host and device code (single source cannot work since CUDA does not support C++20 and probably cannot handle TTG hyper-C++ anyway).
+####  Host-side Code
+
+The host-only part is completely independent of the type of the device programming model.
 
 ```cpp
 struct Fn : public ttg::TTValue<Fn> {
@@ -390,27 +396,43 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) {
 }
 ```
 
-`Fn` 
+Although the structure of the device-capable program is nearly identical to the CPU version, there are important differences:
+
+  - `Fn`'s data must exist on the host side (where the task is executed). To automate moving of the data between host and device memories `Fn` is implemented with the help of helper classes `TTValue` and `Buffer`.
+  - task functions become _coroutines_ (as indicated by their return type `device::Task`) to deal with the asynchrony of the host-device interactions (kernel launch, memory allocation and transfers)
+  - the target execution space is specified as a template argument of type `ExecutionSpace` to `make_tt`
+
+##### `TTValue`
+
+For optimal performance low-level runtime that manages the data motion across the memory hierarchy (host-to-host (i.e., between MPI ranks), host-to-device, and device-to-device) must be able to _track_ each datum as it orchestrates the computation. For example, when a TTG task `send`'s a datum to an output terminal connected to multiple consumers the runtime may avoid unnecessary copies, e.g. by recognizing that all consumers will only need read-only access to the data, hence reference to the same datum can be passed to all consumers. This requires being able to map pointer to a C++ object to the control block that describes that object to the runtime. Deriving C++ type `T` from `TTValue<T>` makes it possible to track objects `T` by embedding the control block into each object. This is particularly important for the data that has to travel to the device.
+
+##### `Buffer`
+`Buffer<T>` is a view of a contiguous sequence of objects of type `T` in the host memory that can be automatically moved by the runtime to/from the device memory. Here `Fn::b` is a view of the 2-element sequence pointed to by `Fn::F`; once it's constructed the content of `Fn::F` will be moved to/from the device by the runtime. The subsequent actions of `Fn::b` cause the automatic transfers of data to (`device::select(f_n.b)`) and from (`ttg::device::wait(f_n.b)`) the device.
+
+##### `device::Task`
+
+The key challenge of device programming models is that they are fundamentally _asynchronous_ to hide the large latency of interacting with the device. Kernel launches, unlike function calls on CPU, take 1000s of CPU cycles to occur, and the asynchrony helps amortize these costs by overlapping kernels launch and execution. Task programming models are a seemingly good match for device programming, but the key challenge is how to make device-capable task code look most like standard host-only task code. TTG ability to use _C++ coroutines_ as task bodies allows it to deal with asynchronous calls inside the tasks (the use of coroutines is the primary reason why TTG requires C++20 support by the C++ compiler). Roughly speaking, coroutines are resumable functions; they can return to the caller via a `co_await` statement and resumed at that point once some condition (typically, completion of submitted actions) has been satisdied. Device tasks `co_await` at every point where further progress requires completion of preceding device tasks:
+
+  - First `co_await` ensures that contents of `f_n.F[]` are available on the device. During the first invocation the data resides on the host, hence this allocates memory on the device and transfers the contents of `f_n.F[]` from host to device. During subsequent invocations the contents of `f_n.F[]` are likely already available on the device (unless the runtime decides to compute $F_{n+1}$ on a different device than $F_n$), thus this `co_await` may become a no-op.
+  - Second `co_await` ensures that the kernel launched by `next_value` has completed and the contents of `f_n.F[]` changed by that kernel are available on the host. This always causes device-to-host transfer.
+  - Third set of `co_await`'s ensures that the corresponding `device::send`, which sends the data located in the device memory, has completed. Since `device::send` within a task will typically return a local variable exit from coroutine would destroy such variables prematurely, hence instead of a `co_return` the coroutine concludes by waiting for the `device::send` to complete before exiting.
+
+##### `ExecutionSpace`
+
+TTG and its underlying runtime needs to be told in which _execution space_ the task code will operate. The current choices are denoted by the `ExecutionSpace` enumeration:
+
+- `ExecutionSpace::Host`: host processor (default)
+- `ExecutionSpace::CUDA`: an NVIDIA CUDA device
+- `ExecutionSpace::HIP`: an AMD HIP device
+- `ExecutionSpace::L0`: an Intel L0 device
+
+
 
 [//]: # (Walk through the key differences ... potentially we could show both side by side ... not sure how to do that in Markdown though ...)
 
-### Differences in the Code Implementation 
-| Aspect                             | CPU Implementation Code                         | GPU Implementation Code (CUDA)                                                                                                                                |
-|------------------------------------|-------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| C++ Features                       | Full use of C++20                               | Limited C++20 due to `__global__` and other CUDA specifics                                                                                                     |
-| Execution Space                    | CPU cores                                       | Defined by `#define ES ttg::ExecutionSpace::CUDA`                                                                                                              |
-| Data Transfer                      | Direct C++ object passing                       | Use of `ttg::Buffer` and CUDA memory operations                                                                                                                |
-| Task Creation                      | `auto fib = ttg::make_tt(...);`                 | `auto fib = ttg::make_tt<ES>(...);`                                                                                                                            |
-| Task Execution Synchronization    | Synchronous execution within TTG runtime        | Asynchronous execution using CUDA and coroutines: `co_await ttg::device::wait(f_n.b);`                                                                        |
-| Device-Specific Code               | Not applicable                                  | CUDA kernels: `__global__ void cu_next_value(int64_t* fn_and_fnm1);`                                                                                           |
-| Serialization                      | ` void serialize(Archive& ar) {ar & F;}`        | `void serialize(Archive& ar) { ttg::ttg_abort(); }`                                                                                                            |
-| Memory Management                  | Managed by C++ runtime and Managed by Fn struct                     | Managed by `Fn` struct with `ttg::Buffer`: `Fn() : F(std::make_unique<int64_t[]>(2)), b(F.get(), 2) {...}`                                                     |
-| Task Continuation and Data Locality| Managed implicitly by TTG                       | Managed explicitly: `co_await ttg::device::select(f_n.b); next_value(f_n.b.current_device_ptr()); co_await ttg::device::wait(f_n.b);`                         |
-| Memory Model                       | Implicit, managed by C++ runtime                | Explicit memory model involving CUDA memory management: `std::unique_ptr<int64_t[]> F; ttg::Buffer<int64_t> b;`                                                |
-| Task Flow and Control              | Implicit control flow based on TTG              | Explicit control flow using CUDA streams and coroutines: `if (f_n.F[0] < F_n_max) {...} else {...}`                                                             |
-| Task Invocation                    | Task invocation within TTG: `fib->invoke(...);` | Task invocation with host-device communication: `fib->template in<0>()->send(1, Fn{});`                                                                       |
-
-here's the CUDA code
+####  Device Kernel
+
+Here's the CUDA version of the device kernel and its host-side wrapper; ROCm and SYCL/Level0 variants will be very similar to the CUDA version:
 
 ```cpp
 #include "fibonacci_cuda_kernel.h"
@@ -426,7 +448,9 @@ here's the CUDA code
 #endif // TTG_HAVE_CUDA
 ```
 
+`cu_next_value` is the device kernel that evaluates $F_{n+1}$ from $F_{n}$ and $F_{n-1}$. `next_value` is a host function that launches `cu_next_value`; this is the function called in the `fib` task.
 
+The complete example, including the CMake build harness, can be found in [dox examples](https://github.com/TESSEorg/ttg/tree/master/doc/dox/dev/devsamp/fibonacci).
 
 ## Debugging TTG Programs