From 5979c708dc709347256de99aa8e631a3d355c6cd Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 11 Nov 2024 21:18:48 -0500
Subject: [PATCH 01/14] Bump actions/checkout version to v4

See https://github.blog/changelog/2022-09-22-github-actions-all-actions-will-begin-running-on-node16-instead-of-node12/

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 .github/workflows/cmake.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 8a1d151e6..2368563ec 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -41,7 +41,7 @@ jobs:
         -DCMAKE_CXX_STANDARD=20
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Install prerequisite MacOS packages
       if: ${{ matrix.os == 'macos-latest' }}

From b0d93d3ccbf505589b8dda6f27e2c52522879286 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 12 Nov 2024 05:54:32 -0500
Subject: [PATCH 02/14] CI: Bump actions/cache to v3

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 .github/workflows/cmake.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 2368563ec..4915c779b 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -72,7 +72,7 @@ jobs:
         message("::set-output name=timestamp::${current_date}")
 
     - name: Setup ccache cache files
-      uses: actions/cache@v1.1.0
+      uses: actions/cache@v3
       with:
         path: ${{github.workspace}}/build/.ccache
         key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }}

From 24fadbdb76afba72a3bf602d794289bfbccc8d2b Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 12 Nov 2024 08:05:23 -0500
Subject: [PATCH 03/14] bump `actions/cache` to v4

---
 .github/workflows/cmake.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 4915c779b..68a36b4d0 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -72,7 +72,7 @@ jobs:
         message("::set-output name=timestamp::${current_date}")
 
     - name: Setup ccache cache files
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ${{github.workspace}}/build/.ccache
         key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }}

From 879605981a7a0ce2ed4ec5b26696570a01ce9219 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 11:41:26 -0400
Subject: [PATCH 04/14] ttg::Buffer: allow specifying a scope

Similar to device scratch, we may want to specify
whether the buffer should be synchronized to the device or not.
This can help reduce superfluous transfers in cases where the
data is overwritten on the device anyway.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/buffer.h        | 67 +++++++++++++++++++++++++++++-----
 ttg/ttg/parsec/ttg_data_copy.h | 19 +++++++---
 2 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 6e609a158..8ca836cd6 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -79,7 +79,15 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
   Buffer() : Buffer(nullptr, 0)
   { }
 
-  Buffer(std::size_t n)
+  /**
+   * Allocates n elements, unitialized
+   * By default, data is synchronized to the device, allowing codes
+   * to fill the buffer before making it available on the device.
+   * Passing ttg::scope::Allocate will prevent the initial synchronization.
+   * Subsequent data transfers behave as expected (i.e., data is transferred
+   * to the host and other devices as needed).
+   */
+  Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn)
   : ttg_parsec_data_wrapper_t()
   , allocator_type()
   , m_host_data(allocate(n))
@@ -90,13 +98,19 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     //          << m_count << "(" << m_host_data << ") ttg_copy "
     //          << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
-    this->reset_parsec_data(m_host_data, n*sizeof(element_type));
+    this->reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn));
   }
 
-  /* Constructing a buffer using application-managed memory.
+  /**
+   * Constructing a buffer using application-managed memory.
    * The memory pointed to by ptr must be accessible during
-   * the life-time of the buffer. */
-  Buffer(element_type* ptr, std::size_t n = 1)
+   * the life-time of the buffer.
+   *
+   * Passing ttg::scope::Allocate will prevent the initial synchronization.
+   * Subsequent data transfers behave as expected (i.e., data is transferred
+   * to the host and other devices as needed).
+   */
+  Buffer(element_type* ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn)
   : ttg_parsec_data_wrapper_t()
   , allocator_type()
   , m_host_data(ptr)
@@ -107,7 +121,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     //          << m_count << "(" << m_host_data << ") ttg_copy "
     //          << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
-    this->reset_parsec_data(m_host_data, n*sizeof(element_type));
+    this->reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn));
   }
 
   virtual ~Buffer() {
@@ -281,7 +295,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
   }
 
   /* Reallocate the buffer with count elements */
-  void reset(std::size_t n) {
+  void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) {
     /* TODO: can we resize if count is smaller than m_count? */
 
     if (m_owned) {
@@ -296,7 +310,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
       m_host_data = allocate(n);
       m_owned = true;
     }
-    reset_parsec_data(m_host_data, n*sizeof(element_type));
+    reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn));
     //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get()
     //          << " ttg_copy " << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
@@ -304,7 +318,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
   }
 
   /* Reset the buffer to use the ptr to count elements */
-  void reset(T* ptr, std::size_t n = 1) {
+  void reset(T* ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn) {
     /* TODO: can we resize if count is smaller than m_count? */
     if (n == m_count) {
       return;
@@ -323,12 +337,34 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
       m_count = n;
       m_owned = false;
     }
-    reset_parsec_data(m_host_data, n*sizeof(element_type));
+    reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn));
     //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get()
     //          << " ttg_copy " << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
   }
 
+  /**
+   * Resets the scope of the buffer.
+   * If scope is SyncIn then the next time
+   * the buffer is made available on a device the host
+   * data will be copied from the host.
+   * If scope is Allocate then no data will be moved.
+   */
+  void reset_scope(ttg::scope scope) {
+    if (scope == ttg::scope::Allocate) {
+      m_data->device_copies[0]->version = 0;
+    } else {
+      m_data->device_copies[0]->version = 1;
+      /* reset all other copies to force a sync-in */
+      for (int i = 0; i < parsec_nb_devices; ++i) {
+        if (m_data->device_copies[i] != nullptr) {
+          m_data->device_copies[i]->version = 0;
+        }
+      }
+      m_data->owner_device = 0;
+    }
+  }
+
   void prefer_device(ttg::device::Device dev) {
     /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */
     if (dev.is_device() && this->parsec_data()->owner_device == 0) {
@@ -337,6 +373,17 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     }
   }
 
+  void add_device(ttg::device::Device dev, T* ptr, bool is_current = false) {
+    if (is_valid_on(dev)) {
+      throw std::runtime_error("Unable to add device that has already a buffer set!");
+    }
+    add_copy(detail::ttg_device_to_parsec_device(dev), ptr);
+    if (is_current) {
+      // mark the data as being current on the new device
+      parsec_data()->owner_device = detail::ttg_device_to_parsec_device(dev);
+    }
+  }
+
   /* serialization support */
 
 #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST
diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h
index a4b4575fa..70a5df715 100644
--- a/ttg/ttg/parsec/ttg_data_copy.h
+++ b/ttg/ttg/parsec/ttg_data_copy.h
@@ -37,12 +37,13 @@ namespace ttg_parsec {
 
       friend ttg_data_copy_t;
 
-      static parsec_data_t* create_parsec_data(void *ptr, size_t size) {
+      static parsec_data_t* create_parsec_data(void *ptr, size_t size, bool sync_to_device) {
         parsec_data_t *data = parsec_data_create_with_type(nullptr, 0, ptr, size,
                                                           parsec_datatype_int8_t);
         data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED;
         data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
-        data->device_copies[0]->version = 1;
+        // if we don't want to synchronize data to the device we set the version to 0
+        data->device_copies[0]->version = (sync_to_device) ? 1 : 0;
         return data;
       }
 
@@ -84,7 +85,7 @@ namespace ttg_parsec {
       void remove_from_owner();
 
       /* add the data to the owning data copy */
-      void reset_parsec_data(void *ptr, size_t size);
+      void reset_parsec_data(void *ptr, size_t size, bool sync_to_device);
 
       ttg_parsec_data_wrapper_t();
 
@@ -102,6 +103,14 @@ namespace ttg_parsec {
       void set_owner(ttg_data_copy_t& new_copy) {
         m_ttg_copy = &new_copy;
       }
+
+      /* add a new copy to the data on the give device backed by ptr */
+      void add_copy(int parsec_dev, void *ptr) {
+        parsec_data_copy_t* copy = parsec_data_copy_new(m_data.get(), parsec_dev,
+                                                        parsec_datatype_int8_t,
+                                                        PARSEC_DATA_FLAG_PARSEC_MANAGED);
+        copy->device_private = ptr;
+      }
     };
 
 
@@ -528,13 +537,13 @@ namespace ttg_parsec {
     }
 
     inline
-    void ttg_parsec_data_wrapper_t::reset_parsec_data(void *ptr, size_t size) {
+    void ttg_parsec_data_wrapper_t::reset_parsec_data(void *ptr, size_t size, bool sync_to_device) {
       if (ptr == m_data.get()) return;
 
       if (nullptr == ptr) {
         m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data);
       } else {
-        m_data = parsec_data_ptr(create_parsec_data(ptr, size), &delete_parsec_data);
+        m_data = parsec_data_ptr(create_parsec_data(ptr, size, sync_to_device), &delete_parsec_data);
       }
     }
 

From 4a98658654394defd90c3838ab894c46273c2e1b Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 14:59:03 -0400
Subject: [PATCH 05/14] Allow const T on ttg::Buffer

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/buffer.h        |  4 +++-
 ttg/ttg/parsec/buffer.h | 44 ++++++++++++++++++++++-------------------
 ttg/ttg/parsec/fwd.h    |  2 +-
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h
index 4d998c29a..3de6bcecd 100644
--- a/ttg/ttg/buffer.h
+++ b/ttg/ttg/buffer.h
@@ -1,11 +1,13 @@
 #ifndef TTG_BUFFER_H
 #define TTG_BUFFER_H
 
+#include <memory>
+
 #include "ttg/fwd.h"
 
 namespace ttg {
 
-template<typename T, typename Allocator = std::allocator<T>>
+template<typename T, typename Allocator = std::allocator<std::decay_t<T>>>
 using Buffer = TTG_IMPL_NS::Buffer<T, Allocator>;
 
 } // namespace ttg
diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 8ca836cd6..929ea50ca 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -40,6 +40,10 @@ template<typename T, typename Allocator>
 struct Buffer : public detail::ttg_parsec_data_wrapper_t
               , private Allocator {
 
+  /* TODO: add overloads for T[]? */
+  using value_type = std::remove_all_extents_t<T>;
+  using pointer_type = std::add_pointer_t<value_type>;
+  using const_pointer_type = const std::remove_const_t<value_type>*;
   using element_type = std::decay_t<T>;
 
   using allocator_traits = std::allocator_traits<Allocator>;
@@ -110,10 +114,10 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
    * Subsequent data transfers behave as expected (i.e., data is transferred
    * to the host and other devices as needed).
    */
-  Buffer(element_type* ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn)
+  Buffer(pointer_type ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn)
   : ttg_parsec_data_wrapper_t()
   , allocator_type()
-  , m_host_data(ptr)
+  , m_host_data(const_cast<element_type*>(ptr))
   , m_count(n)
   , m_owned(false)
   {
@@ -191,55 +195,55 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
   }
 
   /* Get the pointer on the currently active device. */
-  element_type* current_device_ptr() {
+  pointer_type current_device_ptr() {
     assert(is_valid());
     int device_id = detail::ttg_device_to_parsec_device(ttg::device::current_device());
-    return static_cast<element_type*>(m_data->device_copies[device_id]->device_private);
+    return static_cast<pointer_type>(m_data->device_copies[device_id]->device_private);
   }
 
   /* Get the pointer on the currently active device. */
-  const element_type* current_device_ptr() const {
+  const_pointer_type current_device_ptr() const {
     assert(is_valid());
     int device_id = detail::ttg_device_to_parsec_device(ttg::device::current_device());
-    return static_cast<element_type*>(m_data->device_copies[device_id]->device_private);
+    return static_cast<const_pointer_type>(m_data->device_copies[device_id]->device_private);
   }
 
   /* Get the pointer on the owning device.
    * @note: This may not be the device assigned to the currently executing task.
    *        See \ref ttg::device::current_device for that. */
-  element_type* owner_device_ptr() {
+  pointer_type owner_device_ptr() {
     assert(is_valid());
-    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+    return static_cast<pointer_type>(m_data->device_copies[m_data->owner_device]->device_private);
   }
 
   /* get the current device pointer */
-  const element_type* owner_device_ptr() const {
+  const_pointer_type owner_device_ptr() const {
     assert(is_valid());
-    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+    return static_cast<const_pointer_type>(m_data->device_copies[m_data->owner_device]->device_private);
   }
 
   /* get the device pointer at the given device
    */
-  element_type* device_ptr_on(const ttg::device::Device& device) {
+  pointer_type device_ptr_on(const ttg::device::Device& device) {
     assert(is_valid());
     int device_id = detail::ttg_device_to_parsec_device(device);
-    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
+    return static_cast<pointer_type>(parsec_data_get_ptr(m_data.get(), device_id));
   }
 
   /* get the device pointer at the given device
    */
-  const element_type* device_ptr_on(const ttg::device::Device& device) const {
+  const_pointer_type device_ptr_on(const ttg::device::Device& device) const {
     assert(is_valid());
     int device_id = detail::ttg_device_to_parsec_device(device);
-    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
+    return static_cast<const_pointer_type>(parsec_data_get_ptr(m_data.get(), device_id));
   }
 
-  element_type* host_ptr() {
-    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), 0));
+  pointer_type host_ptr() {
+    return static_cast<pointer_type>(parsec_data_get_ptr(m_data.get(), 0));
   }
 
-  const element_type* host_ptr() const {
-    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), 0));
+  const_pointer_type host_ptr() const {
+    return static_cast<const_pointer_type>(parsec_data_get_ptr(m_data.get(), 0));
   }
 
   bool is_valid_on(const ttg::device::Device& device) const {
@@ -318,7 +322,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
   }
 
   /* Reset the buffer to use the ptr to count elements */
-  void reset(T* ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn) {
+  void reset(pointer_type ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn) {
     /* TODO: can we resize if count is smaller than m_count? */
     if (n == m_count) {
       return;
@@ -373,7 +377,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     }
   }
 
-  void add_device(ttg::device::Device dev, T* ptr, bool is_current = false) {
+  void add_device(ttg::device::Device dev, pointer_type ptr, bool is_current = false) {
     if (is_valid_on(dev)) {
       throw std::runtime_error("Unable to add device that has already a buffer set!");
     }
diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h
index 0cd798e87..de7996962 100644
--- a/ttg/ttg/parsec/fwd.h
+++ b/ttg/ttg/parsec/fwd.h
@@ -55,7 +55,7 @@ namespace ttg_parsec {
   static void ttg_broadcast(ttg::World world, T &data, int source_rank);
 
   /* device definitions */
-  template<typename T, typename Allocator = std::allocator<T>>
+  template<typename T, typename Allocator = std::allocator<std::decay_t<T>>>
   struct Buffer;
 
   template<typename T>

From 3e7789d4b0575530f1c89c487dc3052e700a8c02 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:05:36 -0400
Subject: [PATCH 06/14] Add device-task broadcastk bindings

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/device/task.h | 93 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
index 8e2d14cfc..e592e99a2 100644
--- a/ttg/ttg/device/task.h
+++ b/ttg/ttg/device/task.h
@@ -46,7 +46,7 @@ namespace ttg::device {
 
     template <typename... Ts>
     struct wait_kernel_t {
-      std::tuple<Ts &...> ties;
+      std::tuple<std::add_lvalue_reference_t<Ts>...> ties;
 
       /* always suspend */
       constexpr bool await_ready() const noexcept { return false; }
@@ -270,7 +270,7 @@ namespace ttg::device {
     /* overload for iterable types that extracts the type of the first element */
     template<typename T>
     struct broadcast_keylist_trait<T, std::enable_if_t<ttg::meta::is_iterable_v<T>>> {
-      using key_type = decltype(*std::begin(std::get<0>(std::declval<T>())));
+      using key_type = decltype(*std::begin(std::declval<T>()));
     };
 
     template <size_t KeyId, size_t I, size_t... Is, typename... RangesT, typename valueT,
@@ -364,6 +364,71 @@ namespace ttg::device {
         ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward<std::decay_t<decltype(value)>>(value));
       }
     }
+
+    /**
+     * broadcastk
+     */
+
+    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT,
+              typename... out_keysT, typename... out_valuesT>
+    inline void broadcastk(const std::tuple<RangesT...> &keylists,
+                           std::tuple<ttg::Out<out_keysT, out_valuesT>...> &t) {
+      std::get<I>(t).broadcast(std::get<KeyId>(keylists));
+      if constexpr (sizeof...(Is) > 0) {
+        detail::broadcastk<KeyId+1, Is...>(keylists, t);
+      }
+    }
+
+    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT,
+              typename... out_keysT, typename... out_valuesT>
+    inline void broadcastk(const std::tuple<RangesT...> &keylists) {
+      using key_t = typename broadcast_keylist_trait<
+                      std::tuple_element_t<KeyId, std::tuple<std::remove_reference_t<RangesT>...>>
+                    >::key_type;
+      auto *terminal_ptr = ttg::detail::get_out_terminal<key_t, void>(I, "ttg::device::broadcastk(keylists)");
+      terminal_ptr->broadcast(std::get<KeyId>(keylists));
+      if constexpr (sizeof...(Is) > 0) {
+        ttg::device::detail::broadcastk<KeyId+1, Is...>(keylists);
+      }
+    }
+
+    /* overload with explicit terminals */
+    template <size_t I, size_t... Is, typename RangesT,
+              typename... out_keysT, typename... out_valuesT,
+              ttg::Runtime Runtime = ttg::ttg_runtime>
+    inline send_coro_state
+    broadcastk_coro(RangesT &&keylists,
+                    std::tuple<ttg::Out<out_keysT, out_valuesT>...> &t) {
+      RangesT kl = std::forward<RangesT>(keylists); // capture the keylist(s)
+      if constexpr (ttg::meta::is_tuple_v<RangesT>) {
+        // treat as tuple
+        co_await ttg::Void{}; // we'll come back once the task is done
+        ttg::device::detail::broadcastk<0, I, Is...>(kl, t);
+      } else if constexpr (!ttg::meta::is_tuple_v<RangesT>) {
+        // create a tie to the captured keylist
+        co_await ttg::Void{}; // we'll come back once the task is done
+        ttg::device::detail::broadcastk<0, I, Is...>(std::tie(kl), t);
+      }
+    }
+
+    /* overload with implicit terminals */
+    template <size_t I, size_t... Is, typename RangesT,
+              ttg::Runtime Runtime = ttg::ttg_runtime>
+    inline send_coro_state
+    broadcastk_coro(RangesT &&keylists) {
+      RangesT kl = std::forward<RangesT>(keylists); // capture the keylist(s)
+      if constexpr (ttg::meta::is_tuple_v<RangesT>) {
+        // treat as tuple
+        static_assert(sizeof...(Is)+1 == std::tuple_size_v<RangesT>,
+                      "Size of keylist tuple must match the number of output terminals");
+        co_await ttg::Void{}; // we'll come back once the task is done
+        ttg::device::detail::broadcastk<0, I, Is...>(kl);
+      } else if constexpr (!ttg::meta::is_tuple_v<RangesT>) {
+        // create a tie to the captured keylist
+        co_await ttg::Void{}; // we'll come back once the task is done
+        ttg::device::detail::broadcastk<0, I, Is...>(std::tie(kl));
+      }
+    }
   }  // namespace detail
 
   /* overload with explicit terminals and keylist passed by const reference */
@@ -389,11 +454,33 @@ namespace ttg::device {
                                             std::move(copy_handler))};
   }
 
+  /* overload with explicit terminals and keylist passed by const reference */
+  template <size_t I, size_t... Is, typename rangeT, typename... out_keysT, typename... out_valuesT,
+            ttg::Runtime Runtime = ttg::ttg_runtime>
+  [[nodiscard]]
+  inline detail::send_t broadcastk(rangeT &&keylist,
+                                   std::tuple<ttg::Out<out_keysT, out_valuesT>...> &t) {
+    ttg::detail::value_copy_handler<Runtime> copy_handler;
+    return detail::send_t{
+            detail::broadcastk_coro<I, Is...>(std::forward<rangeT>(keylist), t)};
+  }
+
+  /* overload with implicit terminals and keylist passed by const reference */
+  template <size_t i, typename rangeT,
+            ttg::Runtime Runtime = ttg::ttg_runtime>
+  inline detail::send_t broadcastk(rangeT &&keylist) {
+    if constexpr (std::is_rvalue_reference_v<decltype(keylist)>) {
+      return detail::send_t{detail::broadcastk_coro<i>(std::forward<rangeT>(keylist))};
+    } else {
+      return detail::send_t{detail::broadcastk_coro<i>(std::tie(keylist))};
+    }
+  }
+
   template<typename... Args, ttg::Runtime Runtime = ttg::ttg_runtime>
   [[nodiscard]]
   std::vector<device::detail::send_t> forward(Args&&... args) {
     // TODO: check the cost of this!
-    return std::vector{std::forward<Args>(args)...};
+    return std::vector<device::detail::send_t>{std::forward<Args>(args)...};
   }
 
   /*******************************************

From b6c467efa5d6bacfc1b32d5ae9197dfce4e8e1fd Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:13:32 -0400
Subject: [PATCH 07/14] Be more careful about when to enqueue transfers in
 ttg::device::wait

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/devicefunc.h | 80 +++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h
index d4a488673..ff8452034 100644
--- a/ttg/ttg/parsec/devicefunc.h
+++ b/ttg/ttg/parsec/devicefunc.h
@@ -35,38 +35,47 @@ namespace ttg_parsec {
       parsec_data_t* data = detail::get_parsec_data(view);
       /* TODO: check whether the device is current */
 
-      auto access = PARSEC_FLOW_ACCESS_RW;
-      if constexpr (std::is_const_v<view_type>) {
-        // keep the flow at RW if it was RW to make sure we pull the data back out eventually
-        //if (flows[I].flow_flags != PARSEC_FLOW_ACCESS_RW) {
+
+      if (nullptr != data) {
+        auto access = PARSEC_FLOW_ACCESS_RW;
+        if constexpr (std::is_const_v<view_type>) {
+          // keep the flow at RW if it was RW to make sure we pull the data back out eventually
           access = PARSEC_FLOW_ACCESS_READ;
-        //}
-      } else if constexpr (ttg::meta::is_devicescratch_v<view_type>) {
-        if (view.scope() == ttg::scope::Allocate) {
-          access = PARSEC_FLOW_ACCESS_WRITE;
+        } else if constexpr (ttg::meta::is_devicescratch_v<view_type>) {
+          if (view.scope() == ttg::scope::Allocate) {
+            access = PARSEC_FLOW_ACCESS_WRITE;
+          }
         }
-      }
-
-      //std::cout << "register_device_memory task " << detail::parsec_ttg_caller << " data " << I << " "
-      //          << data << " size " << data->nb_elts << std::endl;
-
-      /* build the flow */
-      /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
-      flows[I] = parsec_flow_t{.name = nullptr,
-                               .sym_type = PARSEC_SYM_INOUT,
-                               .flow_flags = static_cast<uint8_t>(access),
-                               .flow_index = I,
-                               .flow_datatype_mask = ~0 };
 
-      gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
-      gpu_task->flow[I] = &flows[I];
-
-      /* set the input data copy, parsec will take care of the transfer
-       * and the buffer will look at the parsec_data_t for the current pointer */
-      //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
-      assert(nullptr != data->device_copies[0]->original);
-      caller->parsec_task.data[I].data_in = data->device_copies[0];
-      caller->parsec_task.data[I].source_repo_entry = NULL;
+        /* build the flow */
+        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
+        flows[I] = parsec_flow_t{.name = nullptr,
+                                .sym_type = PARSEC_SYM_INOUT,
+                                .flow_flags = static_cast<uint8_t>(access),
+                                .flow_index = I,
+                                .flow_datatype_mask = ~0 };
+
+        gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
+        gpu_task->flow[I] = &flows[I];
+
+        /* set the input data copy, parsec will take care of the transfer
+        * and the buffer will look at the parsec_data_t for the current pointer */
+        //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
+        assert(nullptr != data->device_copies[0]->original);
+        caller->parsec_task.data[I].data_in = data->device_copies[0];
+        caller->parsec_task.data[I].source_repo_entry = NULL;
+
+      } else {
+        /* ignore the flow */
+        flows[I] = parsec_flow_t{.name = nullptr,
+                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
+                                 .flow_flags = 0,
+                                 .flow_index = I,
+                                 .flow_datatype_mask = ~0 };
+        gpu_task->flow[I] = &flows[I];
+        gpu_task->flow_nb_elts[I] = 0; // size in bytes
+        caller->parsec_task.data[I].data_in = nullptr;
+      }
 
       if constexpr (sizeof...(Is) > 0) {
         is_current |= register_device_memory(views, std::index_sequence<Is...>{});
@@ -118,12 +127,13 @@ namespace ttg_parsec {
       parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;
 
       /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */
-      parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
-      device_module->memcpy_async(device_module, stream,
-                                  data->device_copies[0]->device_private,
-                                  data->device_copies[data->owner_device]->device_private,
-                                  data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
-
+      if (data->owner_device != 0) {
+        parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
+        device_module->memcpy_async(device_module, stream,
+                                    data->device_copies[0]->device_private,
+                                    data->device_copies[data->owner_device]->device_private,
+                                    data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
+      }
       if constexpr (sizeof...(Is) > 0) {
         // recursion
         mark_device_out(views, std::index_sequence<Is...>{});

From 25bf2728cd8bf9dac86f1e0ed8e80497f224d117 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:19:54 -0400
Subject: [PATCH 08/14] PaRSEC: Add missing invoke_evaluate

Also more aggressive nulling of data_out pointers

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/task.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 656117b94..58ee528ad 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -162,7 +162,10 @@ namespace ttg_parsec {
         parsec_task.priority = 0;
 
         // TODO: can we avoid this?
-        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+          this->parsec_task.data[i].data_in  = nullptr;
+          this->parsec_task.data[i].data_out = nullptr;
+        }
       }
 
       parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
@@ -184,7 +187,10 @@ namespace ttg_parsec {
         parsec_task.chore_mask = 1<<0;
 
         // TODO: can we avoid this?
-        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+          this->parsec_task.data[i].data_in  = nullptr;
+          this->parsec_task.data[i].data_out = nullptr;
+        }
       }
 
     public:
@@ -308,6 +314,15 @@ namespace ttg_parsec {
         }
       }
 
+      template<ttg::ExecutionSpace Space>
+      parsec_hook_return_t invoke_evaluate() {
+        if constexpr (Space == ttg::ExecutionSpace::Host) {
+          return PARSEC_HOOK_RETURN_DONE;
+        } else {
+          return TT::template device_static_evaluate<Space>(&this->parsec_task);
+        }
+      }
+
       parsec_key_t pkey() { return 0; }
     };
 

From 32d2f9abff7bfa3ee6a12278279423fea7f3618f Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:36:05 -0400
Subject: [PATCH 09/14] Defer error on non-copyable types as long as possible

Try to defer tasks whenever possible if a conflict is detected
and the type is not copyable. Only throw an exception
if we run out of options (e.g., if there are two competing tasks
mutating a value).

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/buffer.h        |  2 --
 ttg/ttg/parsec/ttg.h           | 44 +++++++++++++++-------------------
 ttg/ttg/parsec/ttg_data_copy.h | 23 +++++++-----------
 3 files changed, 27 insertions(+), 42 deletions(-)

diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 929ea50ca..713317848 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -51,8 +51,6 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
 
   static_assert(std::is_trivially_copyable_v<element_type>,
                 "Only trivially copyable types are supported for devices.");
-  static_assert(std::is_default_constructible_v<element_type>,
-                "Only default constructible types are supported for devices.");
 
 private:
   using delete_fn_t = std::function<void(element_type*)>;
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 2fc62bd91..c0081c553 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -702,12 +702,13 @@ namespace ttg_parsec {
     inline ttg_data_copy_t *create_new_datacopy(Value &&value) {
       using value_type = std::decay_t<Value>;
       ttg_data_copy_t *copy;
-      if constexpr (std::is_base_of_v<ttg::TTValue<value_type>, value_type>) {
+      if constexpr (std::is_base_of_v<ttg::TTValue<value_type>, value_type> &&
+                    std::is_constructible_v<value_type, decltype(value)>) {
         copy = new value_type(std::forward<Value>(value));
-      } else if constexpr (std::is_rvalue_reference_v<decltype(value)> ||
-                           std::is_copy_constructible_v<std::decay_t<Value>>) {
+      } else if constexpr (std::is_constructible_v<ttg_data_value_copy_t<value_type>, decltype(value)>) {
         copy = new ttg_data_value_copy_t<value_type>(std::forward<Value>(value));
       } else {
+        /* we have no way to create a new copy from this value */
         throw std::logic_error("Trying to copy-construct data that is not copy-constructible!");
       }
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
@@ -939,9 +940,12 @@ namespace ttg_parsec {
       ttg_data_copy_t *copy_res = copy_in;
       bool replace = false;
       int32_t readers = copy_in->num_readers();
-
       assert(readers != 0);
 
+      /* try hard to defer writers if we cannot make copies
+       * if deferral fails we have to bail out */
+      bool defer_writer = (!std::is_copy_constructible_v<std::decay_t<Value>>) || task->defer_writer;
+
       if (readonly && !copy_in->is_mutable()) {
         /* simply increment the number of readers */
         readers = copy_in->increment_readers();
@@ -980,7 +984,7 @@ namespace ttg_parsec {
          *       (current task) or there are others, in which we case won't
          *       touch it.
          */
-        if (1 == copy_in->num_readers() && !task->defer_writer) {
+        if (1 == copy_in->num_readers() && !defer_writer) {
           /**
            * no other readers, mark copy as mutable and defer the release
            * of the task
@@ -990,9 +994,10 @@ namespace ttg_parsec {
           std::atomic_thread_fence(std::memory_order_release);
           copy_in->mark_mutable();
         } else {
-          if (task->defer_writer && nullptr == copy_in->get_next_task()) {
+          if (defer_writer && nullptr == copy_in->get_next_task()) {
             /* we're the first writer and want to wait for all readers to complete */
             copy_res->set_next_task(&task->parsec_task);
+            task->defer_writer = true;
           } else {
             /* there are writers and/or waiting already of this copy already, make a copy that we can mutate */
             copy_res = NULL;
@@ -3505,7 +3510,7 @@ namespace ttg_parsec {
     // Registers the callback for the i'th input terminal
     template <typename terminalT, std::size_t i>
     void register_input_callback(terminalT &input) {
-      using valueT = typename terminalT::value_type;
+      using valueT = std::decay_t<typename terminalT::value_type>;
       if (input.is_pull_terminal) {
         num_pullins++;
       }
@@ -3517,20 +3522,10 @@ namespace ttg_parsec {
           set_arg<i, keyT, valueT>(key, std::forward<valueT>(value));
         };
         auto send_callback = [this](const keyT &key, const valueT &value) {
-          if constexpr (std::is_copy_constructible_v<valueT>) {
-            set_arg<i, keyT, const valueT &>(key, value);
-          }
-          else {
-            throw std::logic_error(std::string("TTG::PaRSEC: send_callback is invoked on datum of type ") + boost::typeindex::type_id<valueT>().pretty_name() + " which is not copy constructible, std::move datum into send statement");
-          }
+          set_arg<i, keyT, const valueT &>(key, value);
         };
         auto broadcast_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
-          if constexpr (std::is_copy_constructible_v<valueT>) {
-            broadcast_arg<i, keyT, valueT>(keylist, value);
-          }
-          else {
-            throw std::logic_error(std::string("TTG::PaRSEC: broadcast_callback is invoked on datum of type ") + boost::typeindex::type_id<valueT>().pretty_name() + " which is not copy constructible, broadcast is not possible with move-only type");
-          }
+          broadcast_arg<i, keyT, valueT>(keylist, value);
         };
         auto prepare_send_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
             prepare_send<i, keyT, valueT>(keylist, value);
@@ -4380,15 +4375,16 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
   template <typename Value>
   inline std::conditional_t<std::is_reference_v<Value>,Value,Value&&> operator()(Value &&value) {
     constexpr auto value_is_rvref = std::is_rvalue_reference_v<decltype(value)>;
+    using value_type = std::remove_reference_t<Value>;
     static_assert(value_is_rvref ||
                   std::is_copy_constructible_v<std::decay_t<Value>>,
                   "Data sent without being moved must be copy-constructible!");
 
     auto caller = ttg_parsec::detail::parsec_ttg_caller;
     if (nullptr == caller) {
-      ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n");
+      throw std::runtime_error("ERROR: ttg::send or ttg::broadcast called outside of a task!");
     }
-    using value_type = std::remove_reference_t<Value>;
+
     ttg_parsec::detail::ttg_data_copy_t *copy;
     copy = ttg_parsec::detail::find_copy_in_task(caller, &value);
     value_type *value_ptr = &value;
@@ -4421,7 +4417,7 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
   inline std::add_lvalue_reference_t<Value> operator()(ttg_parsec::detail::persistent_value_ref<Value> vref) {
     auto caller = ttg_parsec::detail::parsec_ttg_caller;
     if (nullptr == caller) {
-      ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n");
+      throw std::runtime_error("ERROR: ttg::send or ttg::broadcast called outside of a task!");
     }
     ttg_parsec::detail::ttg_data_copy_t *copy;
     copy = ttg_parsec::detail::find_copy_in_task(caller, &vref.value_ref);
@@ -4439,11 +4435,9 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
 
   template <typename Value>
   inline const Value &operator()(const Value &value) {
-    static_assert(std::is_copy_constructible_v<std::decay_t<Value>>,
-                  "Data sent without being moved must be copy-constructible!");
     auto caller = ttg_parsec::detail::parsec_ttg_caller;
     if (nullptr == caller) {
-      ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n");
+      throw std::runtime_error("ERROR: ttg::send or ttg::broadcast called outside of a task!");
     }
     ttg_parsec::detail::ttg_data_copy_t *copy;
     copy = ttg_parsec::detail::find_copy_in_task(caller, &value);
diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h
index 70a5df715..351bc7a2c 100644
--- a/ttg/ttg/parsec/ttg_data_copy.h
+++ b/ttg/ttg/parsec/ttg_data_copy.h
@@ -65,14 +65,11 @@ namespace ttg_parsec {
           data->device_copies[0]->flags ^= TTG_PARSEC_DATA_FLAG_REGISTERED;
         }
 #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT
-        //std::fprintf(stderr, "parsec_data_destroy %p device_copy[0] %p\n", data, data->device_copies[0]);
-        //parsec_data_destroy(data);
         assert(data->device_copies[0] != nullptr);
         auto copy = data->device_copies[0];
         parsec_data_copy_detach(data, data->device_copies[0], 0);
         PARSEC_OBJ_RELEASE(copy);
         PARSEC_OBJ_RELEASE(data);
-
       }
 
       static void delete_null_parsec_data(parsec_data_t *) {
@@ -81,7 +78,7 @@ namespace ttg_parsec {
 
     protected:
 
-      /* remove the the data from the owning data copy */
+      /* remove the data from the owning data copy */
       void remove_from_owner();
 
       /* add the data to the owning data copy */
@@ -461,6 +458,7 @@ namespace ttg_parsec {
       value_type m_value;
 
       template<typename T>
+      requires(std::constructible_from<ValueT, T>)
       ttg_data_value_copy_t(T&& value)
       : ttg_data_copy_container_setter(this)
       , ttg_data_copy_t()
@@ -562,18 +560,13 @@ namespace ttg_parsec {
     : m_data(std::move(other.m_data))
     , m_ttg_copy(detail::ttg_data_copy_container())
     {
-      /* the ttg_data_copy may have moved us already */
-      //if (other.m_ttg_copy != m_ttg_copy) {
-        // try to remove the old buffer from the *old* ttg_copy
-        other.remove_from_owner();
+      // try to remove the old buffer from the *old* ttg_copy
+      other.remove_from_owner();
 
-        // register with the new ttg_copy
-        if (nullptr != m_ttg_copy) {
-          m_ttg_copy->add_device_data(this);
-        }
-      //} else {
-      //  other.m_ttg_copy = nullptr;
-      //}
+      // register with the new ttg_copy
+      if (nullptr != m_ttg_copy) {
+        m_ttg_copy->add_device_data(this);
+      }
     }
 
     inline

From 72f1d3dce5454da500dea0f107a3551c379e8eec Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:36:49 -0400
Subject: [PATCH 10/14] Allow device tasks to jump directly to send or exit
 right away

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/ttg.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index c0081c553..8e7f117d6 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -1604,8 +1604,11 @@ namespace ttg_parsec {
       // get the promise which contains the views
       ttg::device::detail::device_task_promise_type& dev_data = dev_task.promise();
 
-      /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
-      assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER);
+      if (dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT ||
+          dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE) {
+        /* the task jumped directly to send or returned so we're done */
+        return PARSEC_HOOK_RETURN_DONE;
+      }
 
       parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device;
       assert(NULL != device);
@@ -3744,14 +3747,17 @@ namespace ttg_parsec {
           // get the promise which contains the views
           auto dev_data = dev_task.promise();
 
-          /* for now make sure we're waiting for the kernel to complete and the coro hasn't skipped this step */
-          assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT);
+          assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT ||
+                 dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE);
 
           /* execute the sends we stored */
           if (dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT) {
             /* set the current task, needed inside the sends */
             detail::parsec_ttg_caller = task;
-            dev_data.do_sends();
+            auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
+            task->tt->set_outputs_tls_ptr();
+            dev_data.do_sends(); // all sends happen here
+            task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
             detail::parsec_ttg_caller = nullptr;
           }
         }

From 56aca3ab4870b431c565c3f07592393e9c0fe0dc Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 11 Oct 2024 15:48:56 -0400
Subject: [PATCH 11/14] Add ttg::device::Stream encapsulating a lower-level
 stream object

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/device/device.h | 115 +++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 66 deletions(-)

diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h
index 244e9c944..6dcb3c722 100644
--- a/ttg/ttg/device/device.h
+++ b/ttg/ttg/device/device.h
@@ -4,21 +4,12 @@
 #include "ttg/execution.h"
 #include "ttg/impl_selector.h"
 #include "ttg/fwd.h"
+#include "ttg/util/meta.h"
 
 
 
 namespace ttg::device {
 
-#if defined(TTG_HAVE_CUDA)
-  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::CUDA;
-#elif defined(TTG_HAVE_HIP)
-  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::HIP;
-#elif defined(TTG_HAVE_LEVEL_ZERO)
-  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::L0;
-#else
-  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::Invalid;
-#endif
-
   /// Represents a device in a specific execution space
   class Device {
     int m_id = 0;
@@ -74,52 +65,64 @@ namespace std {
   }
 } // namespace std
 
-#if defined(TTG_HAVE_CUDA)
-#include <cuda_runtime.h>
-
 namespace ttg::device {
-  namespace detail {
-    inline thread_local ttg::device::Device current_device_ts = {};
-    inline thread_local cudaStream_t current_stream_ts = 0; // default stream
-
-    inline void reset_current() {
-      current_device_ts = {};
-      current_stream_ts = 0;
-    }
 
-    inline void set_current(int device, cudaStream_t stream) {
-      current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::CUDA);
-      current_stream_ts = stream;
-    }
+  namespace detail {
+    template<typename Stream>
+    struct default_stream {
+      static constexpr const Stream value = 0;
+    };
+    template<typename Stream>
+    constexpr const Stream default_stream_v = default_stream<Stream>::value;
   } // namespace detail
 
-  inline
-  Device current_device() {
-    return detail::current_device_ts;
-  }
-
-  inline
-  cudaStream_t current_stream() {
-    return detail::current_stream_ts;
-  }
 } // namespace ttg
 
+#if defined(TTG_HAVE_CUDA)
+#include <cuda_runtime.h>
+namespace ttg::device {
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::CUDA;
+  using Stream = cudaStream_t;
+} // namespace ttg::device
 #elif defined(TTG_HAVE_HIP)
-
 #include <hip/hip_runtime.h>
+namespace ttg::device {
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::HIP;
+  using Stream = hipStream_t;
+} // namespace ttg::device
+#elif defined(TTG_HAVE_LEVEL_ZERO)
+#include <CL/sycl.hpp>
+namespace ttg::device {
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::L0;
+  using Stream = std::add_reference_t<sycl::queue>;
+} // namespace ttg::device
+#else
+namespace ttg::device {
+  struct Stream { };
+  namespace detail {
+    template<>
+    struct default_stream<Stream> {
+      static constexpr const Stream value = {};
+    };
+  } // namespace detail
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::Host;
+} // namespace ttg::device
+#endif
 
 namespace ttg::device {
+
+#if !defined(TTG_HAVE_LEVEL_ZERO)
   namespace detail {
     inline thread_local ttg::device::Device current_device_ts = {};
-    inline thread_local hipStream_t current_stream_ts = 0; // default stream
+    inline thread_local Stream current_stream_ts = detail::default_stream_v<Stream>; // default stream
 
     inline void reset_current() {
       current_device_ts = {};
-      current_stream_ts = 0;
+      current_stream_ts = detail::default_stream_v<Stream>;
     }
 
-    inline void set_current(int device, hipStream_t stream) {
-      current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::HIP);
+    inline void set_current(int device, Stream stream) {
+      current_device_ts = ttg::device::Device(device, available_execution_space);
       current_stream_ts = stream;
     }
   } // namespace detail
@@ -130,16 +133,16 @@ namespace ttg::device {
   }
 
   inline
-  hipStream_t current_stream() {
+  Stream current_stream() {
     return detail::current_stream_ts;
   }
-} // namespace ttg
-
-#elif defined(TTG_HAVE_LEVEL_ZERO)
 
-#include <CL/sycl.hpp>
+  inline int num_devices() {
+    return TTG_IMPL_NS::num_devices();
+  }
 
-namespace ttg::device {
+#else // TTG_HAVE_LEVEL_ZERO
+  /* SYCL needs special treatment because it uses pointers/references */
   namespace detail {
     inline thread_local ttg::device::Device current_device_ts = {};
     inline thread_local sycl::queue* current_stream_ts = nullptr; // default stream
@@ -165,26 +168,6 @@ namespace ttg::device {
   sycl::queue& current_stream() {
     return *detail::current_stream_ts;
   }
-} // namespace ttg
-
-#else
+#endif // TTG_HAVE_LEVEL_ZERO
 
-namespace ttg::device {
-  inline Device current_device() {
-    return {};
-  }
-
-  template<ttg::ExecutionSpace Space = ttg::ExecutionSpace::Invalid>
-  inline const void* current_stream() {
-    static_assert(Space != ttg::ExecutionSpace::Invalid,
-                  "TTG was built without any known device support so we cannot provide a current stream!");
-    return nullptr;
-  }
 } // namespace ttg
-#endif // defined(TTG_HAVE_HIP)
-
-namespace ttg::device {
-  inline int num_devices() {
-    return TTG_IMPL_NS::num_devices();
-  }
-}

From b77b5439f78e9a98aadc4cee2dd84fbe48bf2fdd Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 13 Nov 2024 11:36:52 -0500
Subject: [PATCH 12/14] Remove superfluous template parameter packs

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/device/task.h | 6 ++----
 ttg/ttg/func.h        | 8 +++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
index e592e99a2..476119886 100644
--- a/ttg/ttg/device/task.h
+++ b/ttg/ttg/device/task.h
@@ -306,8 +306,7 @@ namespace ttg::device {
       }
     }
 
-    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT, typename valueT,
-              typename... out_keysT, typename... out_valuesT>
+    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT, typename valueT>
     inline void broadcast(const std::tuple<RangesT...> &keylists, valueT &&value) {
       using key_t = typename broadcast_keylist_trait<
                       std::tuple_element_t<KeyId, std::tuple<std::remove_reference_t<RangesT>...>>
@@ -379,8 +378,7 @@ namespace ttg::device {
       }
     }
 
-    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT,
-              typename... out_keysT, typename... out_valuesT>
+    template <size_t KeyId, size_t I, size_t... Is, typename... RangesT>
     inline void broadcastk(const std::tuple<RangesT...> &keylists) {
       using key_t = typename broadcast_keylist_trait<
                       std::tuple_element_t<KeyId, std::tuple<std::remove_reference_t<RangesT>...>>
diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h
index 4273e7c66..9e9830b0b 100644
--- a/ttg/ttg/func.h
+++ b/ttg/ttg/func.h
@@ -416,8 +416,7 @@ namespace ttg {
     std::get<i>(t).broadcast(keylist, copy_handler(std::forward<valueT>(value)));
   }
 
-  template <typename rangeT, typename valueT, typename... out_keysT, typename... out_valuesT,
-            ttg::Runtime Runtime = ttg::ttg_runtime>
+  template <typename rangeT, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline void broadcast(std::size_t i, const rangeT &keylist, valueT &&value) {
     detail::value_copy_handler<Runtime> copy_handler;
     using key_t = decltype(*std::begin(keylist));
@@ -425,8 +424,7 @@ namespace ttg {
     terminal_ptr->broadcast(keylist, copy_handler(std::forward<valueT>(value)));
   }
 
-  template <size_t i, typename rangeT, typename valueT, typename... out_keysT, typename... out_valuesT,
-            ttg::Runtime Runtime = ttg::ttg_runtime>
+  template <size_t i, typename rangeT, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
   inline void broadcast(const rangeT &keylist, valueT &&value) {
     broadcast(i, keylist, std::forward<valueT>(value));
   }
@@ -505,7 +503,7 @@ namespace ttg {
     terminal_ptr->set_size(size);
   }
 
-  template <size_t i, typename keyT, typename... out_keysT, typename... out_valuesT>
+  template <size_t i, typename keyT>
   inline std::enable_if_t<!meta::is_void_v<keyT>, void> set_size(const keyT &key, const std::size_t size) {
     set_size(i, key, size);
   }

From 3435452c0052afa03dde5ea077976fc46e46115c Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 13 Nov 2024 11:38:11 -0500
Subject: [PATCH 13/14] Remove unused variables and functions

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/devicefunc.h |  1 -
 ttg/ttg/parsec/ttg.h        | 44 ++++++-------------------------------
 ttg/ttg/util/dot.h          |  3 +--
 3 files changed, 8 insertions(+), 40 deletions(-)

diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h
index ff8452034..080660205 100644
--- a/ttg/ttg/parsec/devicefunc.h
+++ b/ttg/ttg/parsec/devicefunc.h
@@ -123,7 +123,6 @@ namespace ttg_parsec {
 
       /* get_parsec_data is overloaded for buffer and devicescratch */
       parsec_data_t* data = detail::get_parsec_data(view);
-      parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task;
       parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;
 
       /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 8e7f117d6..87994dd91 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -701,7 +701,7 @@ namespace ttg_parsec {
     template <typename Value>
     inline ttg_data_copy_t *create_new_datacopy(Value &&value) {
       using value_type = std::decay_t<Value>;
-      ttg_data_copy_t *copy;
+      ttg_data_copy_t *copy = nullptr;
       if constexpr (std::is_base_of_v<ttg::TTValue<value_type>, value_type> &&
                     std::is_constructible_v<value_type, decltype(value)>) {
         copy = new value_type(std::forward<Value>(value));
@@ -1472,36 +1472,6 @@ namespace ttg_parsec {
       return rc;
     }
 
-    static void
-    static_device_stage_in(parsec_gpu_task_t        *gtask,
-                         uint32_t                  flow_mask,
-                         parsec_gpu_exec_stream_t *gpu_stream) {
-      /* register any memory that hasn't been registered yet */
-      for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-        if (flow_mask & (1<<i)) {
-          task_t *task = (task_t*)gtask->ec;
-          parsec_data_copy_t *copy = task->parsec_task.data[i].data_in;
-          if (0 == (copy->flags & TTG_PARSEC_DATA_FLAG_REGISTERED)) {
-#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
-            // register host memory for faster device access
-            cudaError_t status;
-            //status = cudaHostRegister(copy->device_private, gtask->flow_nb_elts[i], cudaHostRegisterPortable);
-            //assert(cudaSuccess == status);
-#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT
-            //copy->flags |= TTG_PARSEC_DATA_FLAG_REGISTERED;
-          }
-        }
-      }
-    }
-
-    static int
-    static_device_stage_in_hook(parsec_gpu_task_t        *gtask,
-                                uint32_t                  flow_mask,
-                                parsec_gpu_exec_stream_t *gpu_stream) {
-      static_device_stage_in(gtask, flow_mask, gpu_stream);
-      return parsec_default_gpu_stage_in(gtask, flow_mask, gpu_stream);
-    }
-
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {
 
@@ -1515,7 +1485,7 @@ namespace ttg_parsec {
         PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
         gpu_task->ec = parsec_task;
         gpu_task->task_type = 0; // user task
-        gpu_task->last_data_check_epoch = -1; // used internally
+        gpu_task->last_data_check_epoch = 0; // used internally
         gpu_task->pushout = 0;
         gpu_task->submit = &TT::device_static_submit<Space>;
 
@@ -1624,7 +1594,7 @@ namespace ttg_parsec {
           if constexpr (Space == ttg::ExecutionSpace::CUDA) {
             /* TODO: we need custom staging functions because PaRSEC looks at the
              *       task-class to determine the number of flows. */
-            gpu_task->stage_in  = static_device_stage_in_hook;
+            gpu_task->stage_in  = parsec_default_gpu_stage_in;
             gpu_task->stage_out = parsec_default_gpu_stage_out;
             return parsec_device_kernel_scheduler(&device->super, es, gpu_task);
           }
@@ -1633,7 +1603,7 @@ namespace ttg_parsec {
 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
         case PARSEC_DEV_HIP:
           if constexpr (Space == ttg::ExecutionSpace::HIP) {
-            gpu_task->stage_in  = static_device_stage_in_hook;
+            gpu_task->stage_in  = parsec_default_gpu_stage_in;
             gpu_task->stage_out = parsec_default_gpu_stage_out;
             return parsec_device_kernel_scheduler(&device->super, es, gpu_task);
           }
@@ -1642,7 +1612,7 @@ namespace ttg_parsec {
 #if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
         case PARSEC_DEV_LEVEL_ZERO:
           if constexpr (Space == ttg::ExecutionSpace::L0) {
-            gpu_task->stage_in  = static_device_stage_in_hook;
+            gpu_task->stage_in  = parsec_default_gpu_stage_in;
             gpu_task->stage_out = parsec_default_gpu_stage_out;
             return parsec_device_kernel_scheduler(&device->super, es, gpu_task);
           }
@@ -2399,7 +2369,9 @@ namespace ttg_parsec {
       auto &reducer = std::get<i>(input_reducers);
       bool release = false;
       bool remove_from_hash = true;
+#if defined(PARSEC_PROF_GRAPHER)
       bool discover_task = true;
+#endif
       bool get_pull_data = false;
       bool has_lock = false;
       /* If we have only one input and no reducer on that input we can skip the hash table */
@@ -2795,7 +2767,6 @@ namespace ttg_parsec {
           num_iovecs = std::distance(std::begin(iovs), std::end(iovs));
           /* pack the metadata */
           auto metadata = descr.get_metadata(*const_cast<decvalueT *>(value_ptr));
-          size_t metadata_size = sizeof(metadata);
           pos = pack(metadata, msg->bytes, pos);
           //std::cout << "set_arg_impl splitmd num_iovecs " << num_iovecs << std::endl;
           handle_iovec_fn(iovs);
@@ -2970,7 +2941,6 @@ namespace ttg_parsec {
           ttg::SplitMetadataDescriptor<decvalueT> descr;
           /* pack the metadata */
           auto metadata = descr.get_metadata(value);
-          size_t metadata_size = sizeof(metadata);
           pos = pack(metadata, msg->bytes, pos);
           auto iovs = descr.get_data(*const_cast<decvalueT *>(&value));
           num_iovs = std::distance(std::begin(iovs), std::end(iovs));
diff --git a/ttg/ttg/util/dot.h b/ttg/ttg/util/dot.h
index 5e0dea7f6..f562dcb49 100644
--- a/ttg/ttg/util/dot.h
+++ b/ttg/ttg/util/dot.h
@@ -57,13 +57,12 @@ namespace ttg {
 
     void ttfunc(TTBase *tt) {
       std::string ttnm = nodename(tt);
-      bool is_ttg = true;
 
       const TTBase *ttc = reinterpret_cast<const TTBase*>(tt);
       build_ttg_hierarchy(ttc);
       if(!tt->is_ttg()) {
         std::stringstream ttss;
-        
+
         ttss << "        " << ttnm << " [shape=record,style=filled,fillcolor=gray90,label=\"{";
 
         size_t count = 0;

From a9bb37a03af9b5a05e7e924355cf8990b5931cc4 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 13 Nov 2024 11:39:00 -0500
Subject: [PATCH 14/14] Fix sign mismatch in assignments

Instead of initializing an unsigned value to -1 just
use numeric_limits::max().

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/ttg.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 87994dd91..7b49dcb4b 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -146,8 +146,8 @@ namespace ttg_parsec {
       MSG_SET_ARGSTREAM_SIZE = 1,
       MSG_FINALIZE_ARGSTREAM_SIZE = 2,
       MSG_GET_FROM_PULL = 3 } fn_id_t;
-    uint32_t taskpool_id = -1;
-    uint64_t op_id = -1;
+    uint32_t taskpool_id = std::numeric_limits<uint32_t>::max();
+    uint64_t op_id = std::numeric_limits<uint64_t>::max();
     std::size_t key_offset = 0;
     fn_id_t fn_id = MSG_INVALID;
     std::int8_t num_iovecs = 0;
@@ -334,7 +334,7 @@ namespace ttg_parsec {
     void create_tpool() {
       assert(nullptr == tpool);
       tpool = PARSEC_OBJ_NEW(parsec_taskpool_t);
-      tpool->taskpool_id = -1;
+      tpool->taskpool_id = std::numeric_limits<uint32_t>::max();
       tpool->update_nb_runtime_task = parsec_add_fetch_runtime_task;
       tpool->taskpool_type = PARSEC_TASKPOOL_TYPE_TTG;
       tpool->taskpool_name = strdup("TTG Taskpool");