From e642a59492a0024bf4370cd4f2bf75f2c9c5c9d1 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Tue, 27 Feb 2024 18:11:11 -0500
Subject: [PATCH 01/57] Implement simple sequenced keys constraint

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 examples/spmm/spmm.cc |   5 +
 ttg/CMakeLists.txt    |   1 +
 ttg/ttg/constraint.h  | 221 ++++++++++++++++++++++++++++++++++++++++++
 ttg/ttg/parsec/task.h |  14 +++
 ttg/ttg/parsec/ttg.h  | 139 ++++++++++++++++++++++++--
 ttg/ttg/util/meta.h   |  15 +++
 6 files changed, 385 insertions(+), 10 deletions(-)
 create mode 100644 ttg/ttg/constraint.h
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 3760c9946..a8da3e03f 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -292,11 +292,16 @@ class SpMM25D {
       , parallel_bcasts_(parallel_bcasts) {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>();
     bcast_a_ = std::make_unique<BcastA>(a, a_ctl, a_rowctl, local_a_ijk_, a_rows_of_col_, a_cols_of_row_, b_cols_of_row_,
                                         ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    // add constraint with external mapper: key[1] represents `k`
+    bcast_a_->add_constraint(constraint, [](const Key<2>& key){ return key[1]; });
     local_bcast_a_ = std::make_unique<LocalBcastA>(local_a_ijk_, a_ijk_, b_cols_of_row_, ijk_keymap_);
     bcast_b_ = std::make_unique<BcastB>(b, b_ctl, b_colctl, local_b_ijk_, a_rows_of_col_, b_cols_of_row_, b_rows_of_col_,
                                         ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    // add constraint with external mapper: key[0] represents `k`
+    bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
                                                  b_rows_of_col_, mTiles, nTiles, ijk_keymap_);
diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index b1fa72947..61036bdf0 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -45,6 +45,7 @@ configure_file(
 set(ttg-impl-headers
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/constraint.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
new file mode 100644
index 000000000..e7b3193c5
--- /dev/null
+++ b/ttg/ttg/constraint.h
@@ -0,0 +1,221 @@
+#ifndef TTG_CONSTRAINT_H
+#define TTG_CONSTRAINT_H
+
+#include <functional>
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <map>
+
+namespace ttg {
+
+  // TODO: do we need a (virtual) base class?
+
+  template<typename Key>
+  struct ConstraintBase {
+    using key_type = Key;
+    using listener_t = std::function<void(const std::span<key_type>&)>;
+
+    ConstraintBase()
+    { }
+
+    virtual ~ConstraintBase() = default;
+
+    void add_listener(listener_t l, ttg::TTBase *tt) {
+      auto g = this->lock_guard();
+      m_listeners.insert_or_assign(tt, std::move(l));
+    }
+
+    void notify_listener(const std::span<key_type>& keys, ttg::TTBase* tt) {
+      auto& release = m_listeners[tt];
+      release(keys);
+    }
+
+  protected:
+
+    auto lock_guard() {
+      return std::lock_guard{m_mtx};
+    }
+
+  private:
+    std::map<ttg::TTBase*, listener_t> m_listeners;
+    std::mutex m_mtx;
+  };
+
+  template<typename Key,
+           typename Ordinal = std::size_t,
+           typename Compare = std::less<Ordinal>,
+           typename Mapper = ttg::Void>
+  struct SequencedKeysConstraint : public ConstraintBase<Key> {
+
+    using key_type = std::conditional_t<ttg::meta::is_void_v<Key>, ttg::Void, Key>;
+    using ordinal_type = Ordinal;
+    using keymap_t = std::function<Ordinal(const key_type&)>;
+    using compare_t = Compare;
+    using base_t = ConstraintBase<Key>;
+
+  private:
+    struct sequence_elem_t {
+      std::map<ttg::TTBase*, std::vector<key_type>> m_keys;
+
+      sequence_elem_t() = default;
+
+      void add_key(const key_type& key, ttg::TTBase* tt) {
+        auto it = m_keys.find(tt);
+        if (it == m_keys.end()) {
+          m_keys.insert(std::make_pair(tt, std::vector<key_type>{key}));
+        } else {
+          it->second.push_back(key);
+        }
+      }
+    };
+
+    void release_next() {
+      // trigger the next sequence
+      sequence_elem_t elem;
+      {
+        // extract the next sequence
+        auto g = this->lock_guard();
+        auto it = m_sequence.begin(); // sequence is ordered by ordinal
+        if (it == m_sequence.end()) {
+          return; // nothing to be done
+        }
+        m_current = it->first;
+        elem = std::move(it->second);
+        m_sequence.erase(it);
+      }
+
+      for (auto& seq : elem.m_keys) {
+        // account for the newly active keys
+        m_active.fetch_add(seq.second.size(), std::memory_order_relaxed);
+        this->notify_listener(std::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
+      }
+    }
+
+    bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      if (m_order(ord, m_current)) {
+        // key should be executed
+        m_active.fetch_add(1, std::memory_order_relaxed);
+        // reset current to the lower ordinal
+        m_current = ord;
+        return true;
+      } else if (m_sequence.empty() && 0 == m_active.load(std::memory_order_relaxed)) {
+        // there are no keys (active or blocked) so we execute to avoid a deadlock
+        // we don't change the current ordinal because there may be lower ordinals coming in later
+        m_active.fetch_add(1, std::memory_order_relaxed);
+        return true;
+      } else {
+        // key should be deferred
+        auto g = this->lock_guard();
+        if (m_order(ord, m_current)) {
+          // someone released this ordinal while we took the lock
+          return true;
+        }
+        auto it = m_sequence.find(ord);
+        if (it == m_sequence.end()) {
+          auto [iter, success] = m_sequence.insert(std::make_pair(ord, sequence_elem_t{}));
+          assert(success);
+          it = iter;
+        }
+        it->second.add_key(key, tt);
+        return false;
+      }
+    }
+
+    void complete_key_impl() {
+      auto active = m_active.fetch_sub(1, std::memory_order_relaxed) - 1;
+      if (0 == active) {
+        release_next();
+      }
+    }
+
+
+  public:
+
+    /**
+     * Used for external key mapper.
+     */
+    SequencedKeysConstraint()
+    : base_t()
+    { }
+
+    template<typename Mapper_>
+    SequencedKeysConstraint(Mapper_&& map)
+    : base_t()
+    , m_map(std::forward<Mapper_>(map))
+    { }
+
+    ~SequencedKeysConstraint() = default;
+
+    /* Check whether the key may be executed.
+     * Returns true if the key may be executed.
+     * Otherwise, returns false and  */
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
+    check(const key_type& key, ttg::TTBase *tt) {
+      ordinal_type ord = m_map(key);
+      return check_key_impl(key, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
+    check(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      return check_key_impl(key, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
+    check(ttg::TTBase *tt) {
+      return check_key_impl(ttg::Void{}, m_map(), tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
+    check(ordinal_type ord, ttg::TTBase *tt) {
+      return check_key_impl(ttg::Void{}, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
+    complete(const key_type& key, ttg::TTBase *tt) {
+      complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
+    complete(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
+    complete(Ordinal ord, ttg::TTBase *tt) {
+      complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
+    complete(ttg::TTBase *tt) {
+      complete_key_impl();
+    }
+
+  private:
+    std::map<ordinal_type, sequence_elem_t, compare_t> m_sequence;
+    std::atomic<std::size_t> m_active;
+    ordinal_type m_current;
+    [[no_unique_address]]
+    Mapper m_map;
+    [[no_unique_address]]
+    compare_t m_order;
+  };
+
+
+
+  template<typename Constraint, typename... Args>
+  std::shared_ptr<Constraint> make_shared_constraint(Args&&... args) {
+    return std::make_shared<Constraint>(new Constraint(std::forward<Args>(args)...));
+  }
+
+} // namespace ttg
+
+#endif // TTG_CONSTRAINT_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 656117b94..8a01e517b 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -92,6 +92,7 @@ namespace ttg_parsec {
       int32_t data_count = 0;      //< number of data elements in the copies array
       ttg_data_copy_t **copies;    //< pointer to the fixed copies array of the derived task
       parsec_hash_table_item_t tt_ht_item = {};
+      std::atomic<uint32_t> constraint_blocks;
 
       struct stream_info_t {
         std::size_t goal;
@@ -143,6 +144,19 @@ namespace ttg_parsec {
         release_task_cb(this);
       }
 
+      /* add a constraint to the task
+       * returns true if this is the first constraint */
+      bool add_constraint() {
+        return (0 == constraint_blocks.fetch_add(1, std::memory_order_relaxed));
+      }
+
+      /* remove a constraint from the task
+       * returns true if this is the last constraint */
+      bool release_constaint() {
+        /* return true if this was the last constraint*/
+        return 1 == constraint_blocks.fetch_sub(1, std::memory_order_relaxed);
+      }
+
      protected:
       /**
        * Protected constructors: this class should not be instantiated directly
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 0a0ddefcb..e63f91903 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -22,6 +22,7 @@
 #include "ttg/base/keymap.h"
 #include "ttg/base/tt.h"
 #include "ttg/base/world.h"
+#include "ttg/constraint.h"
 #include "ttg/edge.h"
 #include "ttg/execution.h"
 #include "ttg/func.h"
@@ -1150,6 +1151,7 @@ namespace ttg_parsec {
      protected:
       //  static std::map<int, ParsecBaseTT*> function_id_to_instance;
       parsec_hash_table_t tasks_table;
+      parsec_hash_table_t task_constraint_table;
       parsec_task_class_t self;
     };
 
@@ -1319,6 +1321,9 @@ namespace ttg_parsec {
 
     bool m_defer_writer = TTG_PARSEC_DEFER_WRITER;
 
+    std::vector<ttg::meta::detail::constraint_callback_t<keyT>> constraints_check;
+    std::vector<ttg::meta::detail::constraint_callback_t<keyT>> constraints_complete;
+
    public:
     ttg::World get_world() const override final { return world; }
 
@@ -2570,6 +2575,73 @@ namespace ttg_parsec {
       }
     }
 
+    bool check_constraints(task_t *task) {
+      bool release = true;
+      for (auto& c : constraints_check) {
+        bool constrained = false;
+        if constexpr (ttg::meta::is_void_v<keyT>) {
+          constrained = !c();
+        } else {
+          constrained = !c(task->key);
+        }
+        if (constrained) {
+          if (task->add_constraint()) {
+            if constexpr (!ttg::meta::is_void_v<keyT>) {
+            }
+            parsec_hash_table_insert(&task_constraint_table, &task->tt_ht_item);
+          }
+          release = false;
+        }
+      }
+      return release;
+    }
+
+    template<typename Key = keyT>
+    std::enable_if_t<ttg::meta::is_void_v<Key>, void> release_constraint(const std::span<Key>& keys) {
+      task_t *task;
+      parsec_key_t hk = 0;
+      parsec_hash_table_lock_bucket(&task_constraint_table, hk);
+      task = (task_t *)parsec_hash_table_nolock_find(&task_constraint_table, hk);
+      if (task->release_constaint()) {
+        parsec_hash_table_nolock_remove(&task_constraint_table, hk);
+        auto &world_impl = world.impl();
+        parsec_execution_stream_t *es = world_impl.execution_stream();
+        parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
+        __parsec_schedule_vp(es, vp_task_rings, 0);
+      }
+      parsec_hash_table_unlock_bucket(&task_constraint_table, hk);
+    }
+
+    template<typename Key = keyT>
+    std::enable_if_t<!ttg::meta::is_void_v<Key>, void> release_constraint(const std::span<Key>& keys) {
+      parsec_task_t *task_ring = nullptr;
+      for (auto& key : keys) {
+        task_t *task;
+        auto hk = reinterpret_cast<parsec_key_t>(&key);
+        parsec_hash_table_lock_bucket(&task_constraint_table, hk);
+        task = (task_t *)parsec_hash_table_nolock_find(&task_constraint_table, hk);
+        assert(task != nullptr);
+        if (task->release_constaint()) {
+          parsec_hash_table_nolock_remove(&task_constraint_table, hk);
+          if (task_ring == nullptr) {
+            /* the first task is set directly */
+            task_ring = &task->parsec_task;
+          } else {
+            /* push into the ring */
+            parsec_list_item_ring_push_sorted(&task_ring->super, &task->parsec_task.super,
+                                              offsetof(parsec_task_t, priority));
+          }
+        }
+        parsec_hash_table_unlock_bucket(&task_constraint_table, hk);
+      }
+      if (nullptr != task_ring) {
+        auto &world_impl = world.impl();
+        parsec_execution_stream_t *es = world_impl.execution_stream();
+        parsec_task_t *vp_task_rings[1] = { task_ring };
+        __parsec_schedule_vp(es, vp_task_rings, 0);
+      }
+    }
+
     void release_task(task_t *task,
                       parsec_task_t **task_ring = nullptr) {
       constexpr const bool keyT_is_Void = ttg::meta::is_void_v<keyT>;
@@ -2599,16 +2671,19 @@ namespace ttg_parsec {
           }
         }
         if (task->remove_from_hash) parsec_hash_table_remove(&tasks_table, hk);
-        if (nullptr == task_ring) {
-          parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
-          __parsec_schedule_vp(es, vp_task_rings, 0);
-        } else if (*task_ring == nullptr) {
-          /* the first task is set directly */
-          *task_ring = &task->parsec_task;
-        } else {
-          /* push into the ring */
-          parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super,
-                                            offsetof(parsec_task_t, priority));
+
+        if (check_constraints(task)) {
+          if (nullptr == task_ring) {
+            parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
+            __parsec_schedule_vp(es, vp_task_rings, 0);
+          } else if (*task_ring == nullptr) {
+            /* the first task is set directly */
+            *task_ring = &task->parsec_task;
+          } else {
+            /* push into the ring */
+            parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super,
+                                              offsetof(parsec_task_t, priority));
+          }
         }
       } else if constexpr (!ttg::meta::is_void_v<keyT>) {
         if ((baseobj->num_pullins + count == numins) && baseobj->is_lazy_pull()) {
@@ -3772,6 +3847,14 @@ namespace ttg_parsec {
         detail::release_data_copy(copy);
         task->copies[i] = nullptr;
       }
+
+      for (auto& c : task->tt->constraints_complete) {
+        if constexpr(std::is_void_v<keyT>) {
+          c();
+        } else {
+          c(task->key);
+        }
+      }
       return PARSEC_HOOK_RETURN_DONE;
     }
 
@@ -3926,6 +4009,9 @@ namespace ttg_parsec {
 
       parsec_hash_table_init(&tasks_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts,
                              NULL);
+
+      parsec_hash_table_init(&task_constraint_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts,
+                             NULL);
     }
 
     template <typename keymapT = ttg::detail::default_keymap<keyT>,
@@ -4295,6 +4381,39 @@ namespace ttg_parsec {
     /// @return the device map
     auto get_devicemap() { return devicemap; }
 
+    /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(std::shared_ptr<Constraint> c) {
+      c->add_listener(&release_constraint, this);
+      if constexpr(ttg::meta::is_void_v<keyT>) {
+        c->add_listener([this](){ this->release_constraint(); }, this);
+        constraints_check.push_back([c, this](){ return c->check(this); });
+        constraints_complete.push_back([c, this](const keyT& key){ c->complete(this); return true; });
+      } else {
+        c->add_listener([this](const std::span<keyT>& keys){ this->release_constraint(keys); }, this);
+        constraints_check.push_back([c, this](const keyT& key){ return c->check(key, this); });
+        constraints_complete.push_back([c, this](const keyT& key){ c->complete(key, this); return true; });
+      }
+    }
+
+    /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key, map(key))`
+    /// ths overload can be used to provide different key mapping functions for each TT
+    template<typename Constraint, typename Mapper>
+    void add_constraint(std::shared_ptr<Constraint> c, Mapper&& map) {
+      static_assert(std::is_same_v<typename Constraint::key_type, keyT>);
+      if constexpr(ttg::meta::is_void_v<keyT>) {
+        c->add_listener([this](){ this->release_constraint(); }, this);
+        constraints_check.push_back([map, c, this](){ return c->check(map(), this); });
+        constraints_complete.push_back([map, c, this](){ c->complete(map(), this); return true; });
+      } else {
+        c->add_listener([this](const std::span<keyT>& keys){ this->release_constraint(keys); }, this);
+        constraints_check.push_back([map, c, this](const keyT& key){ return c->check(key, map(key), this); });
+        constraints_complete.push_back([map, c, this](const keyT& key){ c->complete(key, map(key), this); return true; });
+      }
+    }
+
     // Register the static_op function to associate it to instance_id
     void register_static_op_function(void) {
       int rank;
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index f3af03152..c595369b0 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -912,6 +912,21 @@ namespace ttg {
       template <typename Key, typename Value>
       using prepare_send_callback_t = typename prepare_send_callback<Key, Value>::type;
 
+      template<typename Key, typename Enabler = void>
+      struct constraint_callback;
+
+      template<typename Key>
+      struct constraint_callback<Key, std::enable_if_t<!is_void_v<Key>>> {
+        using type = std::function<bool(const Key&)>;
+      };
+
+      template<typename Key>
+      struct constraint_callback<Key, std::enable_if_t<is_void_v<Key>>> {
+        using type = std::function<bool()>;
+      };
+
+      template<typename Key>
+      using constraint_callback_t = typename constraint_callback<Key>::type;
 
     }  // namespace detail
 

From 6bbd722c45ecd40b5e782c03f5fdef48ef64fb38 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Wed, 28 Feb 2024 11:05:27 -0500
Subject: [PATCH 02/57] Chain constraint in the order in which they were added

Constraints should be checked in the order in which they were added.
This avoids deadlocks if all constraints ensure progress (e.g., they
don't wait for a specific task to appear) but may lead to different
outcomes (execution orders) if the order of constraints changes.

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 ttg/ttg/constraint.h  |  2 --
 ttg/ttg/parsec/task.h | 14 --------
 ttg/ttg/parsec/ttg.h  | 81 ++++++++++++++++++++++++-------------------
 3 files changed, 46 insertions(+), 51 deletions(-)

diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index e7b3193c5..66dad57c6 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -9,8 +9,6 @@
 
 namespace ttg {
 
-  // TODO: do we need a (virtual) base class?
-
   template<typename Key>
   struct ConstraintBase {
     using key_type = Key;
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 8a01e517b..656117b94 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -92,7 +92,6 @@ namespace ttg_parsec {
       int32_t data_count = 0;      //< number of data elements in the copies array
       ttg_data_copy_t **copies;    //< pointer to the fixed copies array of the derived task
       parsec_hash_table_item_t tt_ht_item = {};
-      std::atomic<uint32_t> constraint_blocks;
 
       struct stream_info_t {
         std::size_t goal;
@@ -144,19 +143,6 @@ namespace ttg_parsec {
         release_task_cb(this);
       }
 
-      /* add a constraint to the task
-       * returns true if this is the first constraint */
-      bool add_constraint() {
-        return (0 == constraint_blocks.fetch_add(1, std::memory_order_relaxed));
-      }
-
-      /* remove a constraint from the task
-       * returns true if this is the last constraint */
-      bool release_constaint() {
-        /* return true if this was the last constraint*/
-        return 1 == constraint_blocks.fetch_sub(1, std::memory_order_relaxed);
-      }
-
      protected:
       /**
        * Protected constructors: this class should not be instantiated directly
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index e63f91903..54f51856c 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -2576,53 +2576,64 @@ namespace ttg_parsec {
     }
 
     bool check_constraints(task_t *task) {
-      bool release = true;
-      for (auto& c : constraints_check) {
-        bool constrained = false;
+      bool constrained = false;
+      if (constraints_check.size() > 0) {
         if constexpr (ttg::meta::is_void_v<keyT>) {
-          constrained = !c();
+          constrained = !constraints_check[0]();
         } else {
-          constrained = !c(task->key);
-        }
-        if (constrained) {
-          if (task->add_constraint()) {
-            if constexpr (!ttg::meta::is_void_v<keyT>) {
-            }
-            parsec_hash_table_insert(&task_constraint_table, &task->tt_ht_item);
-          }
-          release = false;
+          constrained = !constraints_check[0](task->key);
         }
       }
-      return release;
+      if (constrained) {
+        // store the task so we can later access it once it is released
+        parsec_hash_table_insert(&task_constraint_table, &task->tt_ht_item);
+      }
+      return !constrained;
     }
 
     template<typename Key = keyT>
-    std::enable_if_t<ttg::meta::is_void_v<Key>, void> release_constraint(const std::span<Key>& keys) {
-      task_t *task;
-      parsec_key_t hk = 0;
-      parsec_hash_table_lock_bucket(&task_constraint_table, hk);
-      task = (task_t *)parsec_hash_table_nolock_find(&task_constraint_table, hk);
-      if (task->release_constaint()) {
-        parsec_hash_table_nolock_remove(&task_constraint_table, hk);
+    std::enable_if_t<ttg::meta::is_void_v<Key>, void> release_constraint(std::size_t cid) {
+      // check the next constraint, if any
+      assert(cid < constraints_check.size());
+      bool release = true;
+      for (std::size_t i = cid+1; i < constraints_check.size(); i++) {
+        if (!constraints_check[i]()) {
+          release = false;
+          break;
+        }
+      }
+      if (release) {
+        // no constraint blocked us
+        task_t *task;
+        parsec_key_t hk = 0;
+        task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk);
+        assert(task != nullptr);
         auto &world_impl = world.impl();
         parsec_execution_stream_t *es = world_impl.execution_stream();
         parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
         __parsec_schedule_vp(es, vp_task_rings, 0);
       }
-      parsec_hash_table_unlock_bucket(&task_constraint_table, hk);
     }
 
     template<typename Key = keyT>
-    std::enable_if_t<!ttg::meta::is_void_v<Key>, void> release_constraint(const std::span<Key>& keys) {
+    std::enable_if_t<!ttg::meta::is_void_v<Key>, void> release_constraint(std::size_t cid, const std::span<Key>& keys) {
+      assert(cid < constraints_check.size());
       parsec_task_t *task_ring = nullptr;
       for (auto& key : keys) {
         task_t *task;
-        auto hk = reinterpret_cast<parsec_key_t>(&key);
-        parsec_hash_table_lock_bucket(&task_constraint_table, hk);
-        task = (task_t *)parsec_hash_table_nolock_find(&task_constraint_table, hk);
-        assert(task != nullptr);
-        if (task->release_constaint()) {
-          parsec_hash_table_nolock_remove(&task_constraint_table, hk);
+        bool release = true;
+        for (std::size_t i = cid+1; i < constraints_check.size(); i++) {
+          if (!constraints_check[i](key)) {
+            release = false;
+            break;
+          }
+        }
+
+        if (release) {
+          // no constraint blocked this task, so go ahead and release
+          auto hk = reinterpret_cast<parsec_key_t>(&key);
+          task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk);
+          assert(task != nullptr);
           if (task_ring == nullptr) {
             /* the first task is set directly */
             task_ring = &task->parsec_task;
@@ -2632,7 +2643,6 @@ namespace ttg_parsec {
                                               offsetof(parsec_task_t, priority));
           }
         }
-        parsec_hash_table_unlock_bucket(&task_constraint_table, hk);
       }
       if (nullptr != task_ring) {
         auto &world_impl = world.impl();
@@ -4385,13 +4395,13 @@ namespace ttg_parsec {
     /// the constraint must provide a valid override of `check_key(key)`
     template<typename Constraint>
     void add_constraint(std::shared_ptr<Constraint> c) {
-      c->add_listener(&release_constraint, this);
+      std::size_t cid = constraints_check.size();
       if constexpr(ttg::meta::is_void_v<keyT>) {
-        c->add_listener([this](){ this->release_constraint(); }, this);
+        c->add_listener([this, cid](){ this->release_constraint(cid); }, this);
         constraints_check.push_back([c, this](){ return c->check(this); });
         constraints_complete.push_back([c, this](const keyT& key){ c->complete(this); return true; });
       } else {
-        c->add_listener([this](const std::span<keyT>& keys){ this->release_constraint(keys); }, this);
+        c->add_listener([this, cid](const std::span<keyT>& keys){ this->release_constraint(cid, keys); }, this);
         constraints_check.push_back([c, this](const keyT& key){ return c->check(key, this); });
         constraints_complete.push_back([c, this](const keyT& key){ c->complete(key, this); return true; });
       }
@@ -4403,12 +4413,13 @@ namespace ttg_parsec {
     template<typename Constraint, typename Mapper>
     void add_constraint(std::shared_ptr<Constraint> c, Mapper&& map) {
       static_assert(std::is_same_v<typename Constraint::key_type, keyT>);
+      std::size_t cid = constraints_check.size();
       if constexpr(ttg::meta::is_void_v<keyT>) {
-        c->add_listener([this](){ this->release_constraint(); }, this);
+        c->add_listener([this, cid](){ this->release_constraint(cid); }, this);
         constraints_check.push_back([map, c, this](){ return c->check(map(), this); });
         constraints_complete.push_back([map, c, this](){ c->complete(map(), this); return true; });
       } else {
-        c->add_listener([this](const std::span<keyT>& keys){ this->release_constraint(keys); }, this);
+        c->add_listener([this, cid](const std::span<keyT>& keys){ this->release_constraint(cid, keys); }, this);
         constraints_check.push_back([map, c, this](const keyT& key){ return c->check(key, map(key), this); });
         constraints_complete.push_back([map, c, this](const keyT& key){ c->complete(key, map(key), this); return true; });
       }

From 403350fe4b9dcf13a9882623ab2132dfd7610982 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Wed, 28 Feb 2024 18:35:50 -0500
Subject: [PATCH 03/57] Constraints: Add first test and improve impl details

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 tests/unit/CMakeLists.txt |   2 +
 tests/unit/constraints.cc |  51 ++++++++++++
 ttg/ttg/constraint.h      | 165 +++++++++++++++++++++++++++++++-------
 ttg/ttg/parsec/ttg.h      |  19 ++++-
 4 files changed, 206 insertions(+), 31 deletions(-)
 create mode 100644 tests/unit/constraints.cc

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index e1fb7d685..a33bc3c94 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -21,6 +21,8 @@ if (CXXStdCoroutine_FOUND)
     list(APPEND ut_libs std::coroutine)
 endif(CXXStdCoroutine_FOUND)
 
+list(APPEND ut_src constraints.cc)
+
 add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" )
 
 # serialization test: probes serialization via all supported serialization methods (MADNESS, Boost::serialization) that are available
diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc
new file mode 100644
index 000000000..0255b51e3
--- /dev/null
+++ b/tests/unit/constraints.cc
@@ -0,0 +1,51 @@
+#include <catch2/catch_all.hpp>
+#include <ctime>
+
+#include "ttg.h"
+
+#include "ttg/serialization/std/pair.h"
+#include "ttg/util/hash/std/pair.h"
+#include "ttg/util/multiindex.h"
+
+using Key = ttg::MultiIndex<2>;
+
+TEST_CASE("constraints", "") {
+
+  SECTION("sequenced") {
+    ttg::Edge<Key, int> e;
+    auto world = ttg::default_execution_context();
+    std::atomic<int> last_ord = world.rank();
+    auto tt = ttg::make_tt([&](const Key& key, const int& value){
+      int check_ord = last_ord;
+      CHECK(((key[1] == check_ord) || (key[1] == check_ord+1)));
+      last_ord = key[1];
+    }, ttg::edges(e), ttg::edges());
+    // every process executes all tasks
+    tt->set_keymap([&](const Key&){ return world.rank(); });
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>([](const Key& k){ return k[1]; });
+    tt->add_constraint(constraint);
+    constraint->stop();
+
+    auto bcast = ttg::make_tt([&](){
+      std::vector<Key> keys;
+      // loop iteration order intentionally reversed
+      for (int i = 10; i > 0; --i) {
+        for (int j = 10; j > world.rank(); --j) {
+          keys.push_back(Key{i, j});
+        }
+      }
+      ttg::broadcast<0>(std::move(keys), 0);
+
+      // explicit start here to ensure absolute order
+      constraint->start();
+    }, ttg::edges(), ttg::edges(e));
+    bcast->set_keymap([&](){ return world.rank(); });
+
+    make_graph_executable(bcast);
+    ttg::execute(ttg::default_execution_context());
+    bcast->invoke();
+
+    ttg::ttg_fence(ttg::default_execution_context());
+
+  }
+}  // TEST_CASE("streams")
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index 66dad57c6..f624b1ed8 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -7,6 +7,12 @@
 #include <mutex>
 #include <map>
 
+#ifdef TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS
+#include <ttg/external/boost/callable_traits.hpp>
+#else
+#include <boost/callable_traits.hpp>
+#endif
+
 namespace ttg {
 
   template<typename Key>
@@ -17,6 +23,21 @@ namespace ttg {
     ConstraintBase()
     { }
 
+    ConstraintBase(ConstraintBase&& cb)
+    : m_listeners(std::move(cb.m_listeners))
+    { }
+
+    ConstraintBase(const ConstraintBase& cb)
+    : m_listeners(cb.m_listeners)
+    {}
+
+    ConstraintBase& operator=(ConstraintBase&& cb) {
+      m_listeners = std::move(cb.m_listeners);
+    }
+    ConstraintBase& operator=(const ConstraintBase& cb) {
+      m_listeners = cb.m_listeners;
+    }
+
     virtual ~ConstraintBase() = default;
 
     void add_listener(listener_t l, ttg::TTBase *tt) {
@@ -69,6 +90,10 @@ namespace ttg {
     };
 
     void release_next() {
+      if (m_stopped) {
+        // don't release tasks if we're stopped
+        return;
+      }
       // trigger the next sequence
       sequence_elem_t elem;
       {
@@ -91,33 +116,34 @@ namespace ttg {
     }
 
     bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
-      if (m_order(ord, m_current)) {
-        // key should be executed
-        m_active.fetch_add(1, std::memory_order_relaxed);
-        // reset current to the lower ordinal
-        m_current = ord;
-        return true;
-      } else if (m_sequence.empty() && 0 == m_active.load(std::memory_order_relaxed)) {
-        // there are no keys (active or blocked) so we execute to avoid a deadlock
-        // we don't change the current ordinal because there may be lower ordinals coming in later
-        m_active.fetch_add(1, std::memory_order_relaxed);
-        return true;
-      } else {
-        // key should be deferred
-        auto g = this->lock_guard();
+      if (!m_stopped) {
         if (m_order(ord, m_current)) {
-          // someone released this ordinal while we took the lock
+          // key should be executed
+          m_active.fetch_add(1, std::memory_order_relaxed);
+          // reset current to the lower ordinal
+          m_current = ord;
+          return true;
+        } else if (m_sequence.empty() && 0 == m_active.load(std::memory_order_relaxed)) {
+          // there are no keys (active or blocked) so we execute to avoid a deadlock
+          // we don't change the current ordinal because there may be lower ordinals coming in later
+          m_active.fetch_add(1, std::memory_order_relaxed);
           return true;
         }
-        auto it = m_sequence.find(ord);
-        if (it == m_sequence.end()) {
-          auto [iter, success] = m_sequence.insert(std::make_pair(ord, sequence_elem_t{}));
-          assert(success);
-          it = iter;
-        }
-        it->second.add_key(key, tt);
-        return false;
       }
+      // key should be deferred
+      auto g = this->lock_guard();
+      if (!m_stopped && m_order(ord, m_current)) {
+        // someone released this ordinal while we took the lock
+        return true;
+      }
+      auto it = m_sequence.find(ord);
+      if (it == m_sequence.end()) {
+        auto [iter, success] = m_sequence.insert(std::make_pair(ord, sequence_elem_t{}));
+        assert(success);
+        it = iter;
+      }
+      it->second.add_key(key, tt);
+      return false;
     }
 
     void complete_key_impl() {
@@ -137,13 +163,52 @@ namespace ttg {
     : base_t()
     { }
 
-    template<typename Mapper_>
+    template<typename Mapper_, typename = std::enable_if_t<std::is_invocable_v<Mapper_, Key>, Mapper_>>
     SequencedKeysConstraint(Mapper_&& map)
     : base_t()
     , m_map(std::forward<Mapper_>(map))
     { }
 
-    ~SequencedKeysConstraint() = default;
+    SequencedKeysConstraint(SequencedKeysConstraint&& skc)
+    : base_t(std::move(skc))
+    , m_sequence(std::move(skc.m_sequence))
+    , m_active(skc.m_active.load(std::memory_order_relaxed))
+    , m_current(std::move(skc.m_current))
+    , m_map(std::move(skc.m_map))
+    , m_order(std::move(skc.m_order))
+    , m_stopped(skc.m_stopped)
+    { }
+
+    SequencedKeysConstraint(const SequencedKeysConstraint& skc)
+    : base_t(skc)
+    , m_sequence(skc.m_sequence)
+    , m_active(skc.m_active.load(std::memory_order_relaxed))
+    , m_current(skc.m_current)
+    , m_map(skc.m_map)
+    , m_order(skc.m_order)
+    , m_stopped(skc.m_stopped)
+    { }
+
+    SequencedKeysConstraint& operator=(SequencedKeysConstraint&& skc) {
+      base_t::operator=(std::move(skc));
+      m_sequence = std::move(skc.m_sequence);
+      m_active = skc.m_active.load(std::memory_order_relaxed);
+      m_current = std::move(skc.m_current);
+      m_map = std::move(skc.m_map);
+      m_order = std::move(skc.m_order);
+      m_stopped = skc.m_stopped;
+    }
+    SequencedKeysConstraint& operator=(const SequencedKeysConstraint& skc) {
+      base_t::operator=(skc);
+      m_sequence = skc.m_sequence;
+      m_active = skc.m_active.load(std::memory_order_relaxed);
+      m_current = skc.m_current;
+      m_map = skc.m_map;
+      m_order = skc.m_order;
+      m_stopped = skc.m_stopped;
+    }
+
+    virtual ~SequencedKeysConstraint() = default;
 
     /* Check whether the key may be executed.
      * Returns true if the key may be executed.
@@ -197,6 +262,28 @@ namespace ttg {
       complete_key_impl();
     }
 
+    /**
+     * Stop all execution. Call \c start to resume.
+     * This constraint is not stopped by default so calls to \c start
+     * are only necessary if explictily stopped.
+     */
+    void stop() {
+      m_stopped = true;
+    }
+
+    /**
+     * Start execution.
+     * This constraint is not stopped by default so calls to \c start
+     * are only necessary if explictily stopped.
+     */
+    void start() {
+      if (m_stopped) {
+        m_stopped = false;
+        release_next();
+      }
+    }
+
+
   private:
     std::map<ordinal_type, sequence_elem_t, compare_t> m_sequence;
     std::atomic<std::size_t> m_active;
@@ -205,13 +292,31 @@ namespace ttg {
     Mapper m_map;
     [[no_unique_address]]
     compare_t m_order;
+    bool m_stopped = false;
   };
 
-
-
-  template<typename Constraint, typename... Args>
-  std::shared_ptr<Constraint> make_shared_constraint(Args&&... args) {
-    return std::make_shared<Constraint>(new Constraint(std::forward<Args>(args)...));
+  // deduction guide: take type of first argument to Mapper as the key type
+  // TODO: can we use the TTG callable_args instead?
+  template<typename Mapper, typename = std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>>>
+  SequencedKeysConstraint(Mapper&&)
+    -> SequencedKeysConstraint<
+          std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
+          std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
+          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
+          >;
+
+  template<typename Key, typename Ordinal, typename Compare, typename Mapper>
+  SequencedKeysConstraint(SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>&&)
+    -> SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>;
+
+  template<typename Key, typename Ordinal, typename Compare, typename Mapper>
+  SequencedKeysConstraint(const SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>&)
+    -> SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>;
+
+  template<template<typename...> typename Constraint, typename... Args>
+  auto make_shared_constraint(Args&&... args) {
+    return std::make_shared<decltype(Constraint(std::forward<Args>(args)...))>(Constraint(std::forward<Args>(args)...));
   }
 
 } // namespace ttg
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 54f51856c..e82cb42a1 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -4391,7 +4391,7 @@ namespace ttg_parsec {
     /// @return the device map
     auto get_devicemap() { return devicemap; }
 
-    /// add a constraint
+    /// add a shared constraint
     /// the constraint must provide a valid override of `check_key(key)`
     template<typename Constraint>
     void add_constraint(std::shared_ptr<Constraint> c) {
@@ -4408,6 +4408,14 @@ namespace ttg_parsec {
     }
 
     /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(Constraint&& c) {
+      // need to make this a shared_ptr since it's shared between different callbacks
+      this->add_constraint(std::make_shared<Constraint>(std::forward<Constraint>(c)));
+    }
+
+    /// add a shared constraint
     /// the constraint must provide a valid override of `check_key(key, map(key))`
     /// ths overload can be used to provide different key mapping functions for each TT
     template<typename Constraint, typename Mapper>
@@ -4425,6 +4433,15 @@ namespace ttg_parsec {
       }
     }
 
+    /// add a shared constraint
+    /// the constraint must provide a valid override of `check_key(key, map(key))`
+    /// ths overload can be used to provide different key mapping functions for each TT
+    template<typename Constraint, typename Mapper>
+    void add_constraint(Constraint c, Mapper&& map) {
+      // need to make this a shared_ptr since it's shared between different callbacks
+      this->add_constraint(std::make_shared<Constraint>(std::forward<Constraint>(c)), std::forward<Mapper>(map));
+    }
+
     // Register the static_op function to associate it to instance_id
     void register_static_op_function(void) {
       int rank;

From 46de34446dc2d6eba7ad8639fd1eec98f647a287 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Fri, 1 Mar 2024 15:28:04 -0500
Subject: [PATCH 04/57] SPMM: Remove control flow from BcastA/B

Replaced by constraints.

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 examples/spmm/spmm.cc | 236 +++---------------------------------------
 1 file changed, 13 insertions(+), 223 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index a8da3e03f..497f93da6 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -293,13 +293,11 @@ class SpMM25D {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
     auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>();
-    bcast_a_ = std::make_unique<BcastA>(a, a_ctl, a_rowctl, local_a_ijk_, a_rows_of_col_, a_cols_of_row_, b_cols_of_row_,
-                                        ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    bcast_a_ = std::make_unique<BcastA>(a, local_a_ijk_, b_cols_of_row_, ij_keymap_, ijk_keymap_);
     // add constraint with external mapper: key[1] represents `k`
     bcast_a_->add_constraint(constraint, [](const Key<2>& key){ return key[1]; });
     local_bcast_a_ = std::make_unique<LocalBcastA>(local_a_ijk_, a_ijk_, b_cols_of_row_, ijk_keymap_);
-    bcast_b_ = std::make_unique<BcastB>(b, b_ctl, b_colctl, local_b_ijk_, a_rows_of_col_, b_cols_of_row_, b_rows_of_col_,
-                                        ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    bcast_b_ = std::make_unique<BcastB>(b, local_b_ijk_, a_rows_of_col_, ij_keymap_, ijk_keymap_);
     // add constraint with external mapper: key[0] represents `k`
     bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
@@ -343,62 +341,6 @@ class SpMM25D {
       }
     }
 
-    /* kick off the first broadcast in each row of A
-     * this is used to enforce strict ordering within a row of A */
-    for (int i = 0; i < a_cols_of_row_.size(); ++i) {
-      for (int k : a_cols_of_row_[i]) {
-        auto key = Key<2>(i, k);
-        if (world.rank() == ij_keymap_(key)) {
-          bcast_a_->template in<1>()->send(key, 0);
-          break;
-        }
-      }
-    }
-
-    /* initial ctl input for a number of bcasts for A
-     * this is used to limit the number of concurrent bcasts */
-    int to_start = parallel_bcasts;
-    for (int k = 0;
-          0 < to_start && k < a_rows_of_col_.size();
-          ++k) {
-      for (auto i : a_rows_of_col_[k]) {
-        auto key = Key<2>(i, k);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "SPMM kick off BcastA " << key << std::endl;
-          bcast_a_->template in<2>()->sendk(key);
-          if (0 == --to_start) break;
-        }
-      }
-    }
-
-    /* kick off the first broadcast in each column of B
-     * this is used to enforce strict ordering within a column of B */
-    for (int j = 0; j < b_rows_of_col_.size(); ++j) {
-      for (int k : b_rows_of_col_[j]) {
-        auto key = Key<2>(k, j);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "BcastB kick off " << key << std::endl;
-          bcast_b_->template in<1>()->send(key, 0);
-          break;
-        }
-      }
-    }
-
-    /* initial ctl input for bcasts for B */
-    to_start = parallel_bcasts;
-    for (int k = 0;
-          0 < to_start && k < b_cols_of_row_.size();
-          ++k) {
-      for (auto j : b_cols_of_row_[k]) {
-        auto key = Key<2>(k, j);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "SPMM kick off BcastB " << key << std::endl;
-          bcast_b_->template in<2>()->sendk(key);
-          if (0 == --to_start) break;
-        }
-      }
-    }
-
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);
     TTGUNUSED(multiplyadd_);
@@ -444,24 +386,16 @@ class SpMM25D {
   };  // class LocalBcastA
 
   /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists
-  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>>, BcastA, ttg::typelist<Blk, int, void>> {
+  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<Blk>> {
    public:
     using baseT = typename BcastA::ttT;
 
-    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<2>, void> &ctl,
-           Edge<Key<2>, int> &rowctl, Edge<Key<3>, Blk> &a_ikp,
-           const std::vector<std::vector<long>> &a_rows_of_col,
-           const std::vector<std::vector<long>> &a_cols_of_row,
+    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<3>, Blk> &a_ikp,
            const std::vector<std::vector<long>> &b_cols_of_row,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap,
-           const int parallel_bcasts)
-        : baseT(edges(a_ik, rowctl, ctl), edges(a_ikp, rowctl, ctl), "SpMM25D::bcast_a", {"a_ik", "rowctl", "ctl"}, {"a_ikp", "rowctl", "ctl"}, ij_keymap)
-        , a_rows_of_col_(a_rows_of_col)
-        , a_cols_of_row_(a_cols_of_row)
+           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
+        : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap)
         , b_cols_of_row_(b_cols_of_row)
-        , ijk_keymap_(ijk_keymap)
-        , ij_keymap_(ij_keymap)
-        , parallel_bcasts_(parallel_bcasts) {
+        , ijk_keymap_(ijk_keymap) {
 
       this->set_priomap([](const Key<2>& key){
         return std::numeric_limits<int>::max() - key[0];
@@ -469,7 +403,7 @@ class SpMM25D {
     }
 
     void op(const Key<2> &ik, typename baseT::input_values_tuple_type &&a_ik,
-            std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>> &outs) {
+            std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto i = ik[0]; // row
       const auto k = ik[1]; // col
       ttg::trace("BcastA(", i, ", ", k, ")");
@@ -487,81 +421,13 @@ class SpMM25D {
           ikp_keys.emplace_back(Key<3>({i, k, p}));
           procmap[p] = true;
         }
-        // TODO: debug
-        //if (p != world.rank() && ij_keymap_(Key<2>{k, j}) != p) {
-        //  std::cout << "[" << world.rank() << "] BCAST A " << ik << " for C update " << Key<3>({i, k, p}) << " on " << p << " has B from " << ij_keymap_(Key<2>{k, j}) << std::endl;
-        //}
       }
       ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), outs);
-
-      /* enable the next broadcast on this row */
-      int row = i;
-      int col = k;
-      auto rowit = std::find(a_cols_of_row_[row].begin(), a_cols_of_row_[row].end(), col);
-      for (++rowit; rowit != a_cols_of_row_[row].end(); ++rowit) {
-        Key<2> key = {row, *rowit};
-        if (world.rank() == this->get_keymap()(key)) {
-          ::send<1>(key, std::move(baseT::template get<1>(a_ik)), outs);
-          break;
-        }
-      }
-
-
-      /* enable next broadcast through a control message
-       * we don't check whether this tile is in B here, this is
-       * done inside the next task (see above)
-       * we walk the matrix A column-major in an attempt to send from top to bottom, left to right */
-      long to_skip = parallel_bcasts_;
-
-      auto colit = std::find(a_rows_of_col_[col].begin(), a_rows_of_col_[col].end(), row);
-      ++colit; // skip to next row
-      do {
-        for (; colit != a_rows_of_col_[col].end(); ++colit) {
-          Key<2> key = {*colit, col};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              //std::cout << "BcastA sending to " << key << " from " << ik << std::endl;
-              ::sendk<2>(key, outs);
-              return;
-            }
-          }
-        }
-        /* nothing for us in this column, move on to the next column */
-        if (++col < a_rows_of_col_.size()) {
-          colit = a_rows_of_col_[col].begin();
-        } else {
-          break;
-        }
-      } while (1);
-
-#if 0
-      do {
-        for (; it != a_cols_of_row_[i].end(); ++it) {
-          Key<2> key = {i, *it};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              ::sendk<1>(key, outs);
-              return;
-            }
-          }
-        }
-        if ((i+1) < num_rows) {
-          it = a_cols_of_row_[++i].begin();
-        } else {
-          break;
-        }
-      } while (1);
-#endif // 0
     }
 
    private:
-    //const std::vector<std::vector<long>> &a_cols_of_row_;
-    const std::vector<std::vector<long>> &a_rows_of_col_;
-    const std::vector<std::vector<long>> &a_cols_of_row_;
     const std::vector<std::vector<long>> &b_cols_of_row_;
     const Keymap3 &ijk_keymap_;
-    const Keymap2 &ij_keymap_;
-    const int parallel_bcasts_;
   };  // class BcastA
 
   /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
@@ -602,22 +468,16 @@ class SpMM25D {
   };  // class LocalBcastB
 
   /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
-  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>>, BcastB, ttg::typelist<Blk, int, void>> {
+  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<Blk>> {
    public:
     using baseT = typename BcastB::ttT;
 
-    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<2>, void> ctl, Edge<Key<2>, int> colctl, Edge<Key<3>, Blk> &b_kjp,
+    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<3>, Blk> &b_kjp,
            const std::vector<std::vector<long>> &a_rows_of_col,
-           const std::vector<std::vector<long>> &b_cols_of_row,
-           const std::vector<std::vector<long>> &b_rows_of_col,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap,
-           const int parallel_bcasts)
-        : baseT(edges(b_kj, colctl, ctl), edges(b_kjp, colctl, ctl), "SpMM25D::bcast_b", {"b_kj", "colctl", "ctl"}, {"b_kjp", "colctl", "ctl"}, ij_keymap)
+           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
+        : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap)
         , a_rows_of_col_(a_rows_of_col)
-        , b_cols_of_row_(b_cols_of_row)
-        , b_rows_of_col_(b_rows_of_col)
         , ijk_keymap_(ijk_keymap)
-        , parallel_bcasts_(parallel_bcasts)
     {
       this->set_priomap([](const Key<2>& key){
         return std::numeric_limits<int>::max() - key[1];
@@ -625,7 +485,7 @@ class SpMM25D {
     }
 
     void op(const Key<2> &kj, typename baseT::input_values_tuple_type &&b_kj,
-            std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>> &outs) {
+            std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto k = kj[0]; // row
       const auto j = kj[1]; // col
       // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists
@@ -644,81 +504,11 @@ class SpMM25D {
         }
       }
       ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), outs);
-
-      /* enable the next broadcast on this row */
-      int row = k;
-      int col = j;
-      auto colit = std::find(b_rows_of_col_[col].begin(), b_rows_of_col_[col].end(), row);
-      for (++colit; colit != b_rows_of_col_[col].end(); ++colit) {
-        Key<2> key = {*colit, col};
-        if (world.rank() == this->get_keymap()(key)) {
-          //std::cout << "BcastB kick off " << key << std::endl;
-          ::send<1>(key, std::move(baseT::template get<1>(b_kj)), outs);
-          break;
-        }
-      }
-
-      /* enable next broadcast through a control message
-       * we don't check whether this tile is in A here, this is
-       * done inside the next task (see above)
-       * we run across a row to enable broadcasts */
-      long to_skip = parallel_bcasts_;
-
-      // iterator over the current row
-      auto rowit = std::find(b_cols_of_row_[row].begin(), b_cols_of_row_[row].end(), col);
-      ++rowit; // skip to next col
-      do {
-        for (; rowit != b_cols_of_row_[row].end(); ++rowit) {
-          Key<2> key = {row, *rowit};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              //std::cout << "BcastB sending to " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl;
-              ::sendk<2>(key, outs);
-              return;
-            } else {
-              //std::cout << "BcastB skipping " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl;
-            }
-          }
-        }
-        /* nothing for us in this row, move on to the next row */
-        if (++row != b_cols_of_row_.size()) {
-          rowit = b_cols_of_row_[row].begin();
-        } else {
-          break;
-        }
-      } while (1);
-
-
-#if 0
-      std::size_t num_rows = b_cols_of_row_.size();
-      auto it = std::find(b_cols_of_row_[k].begin(), b_cols_of_row_[k].end(), j);
-      ++it; // skip the current tile
-      long to_skip = parallel_bcasts_;
-      do {
-        for (; it != b_cols_of_row_[k].end(); ++it) {
-          Key<2> key = {k, *it};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              ::sendk<1>(key, outs);
-              return;
-            }
-          }
-        }
-        if ((k+1) < num_rows) {
-          it = b_cols_of_row_[++k].begin();
-        } else {
-          break;
-        }
-      } while (1);
-#endif // 0
     }
 
    private:
     const std::vector<std::vector<long>> &a_rows_of_col_;
-    const std::vector<std::vector<long>> &b_cols_of_row_;
-    const std::vector<std::vector<long>> &b_rows_of_col_;
     const Keymap3 &ijk_keymap_;
-    const int parallel_bcasts_;
   };  // class BcastB
 
   /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the

From caafc78b91bac4055d9f34d132643b94e0f8c277 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Mon, 4 Mar 2024 17:35:52 -0500
Subject: [PATCH 05/57] Extend sequence constraint to enable/disable
 auto-release

Auto-release makes sure there are no deadlocks by enabling the next
set of keys once the current ordinal is done. Without auto-release
applications must release the next set explitly and ensure there
are no deadlocks.

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 examples/spmm/spmm.cc |  48 ++++++++-
 ttg/ttg/constraint.h  | 222 ++++++++++++++++++++++++++----------------
 2 files changed, 182 insertions(+), 88 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 497f93da6..5806ce9ef 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -39,6 +39,8 @@ using namespace ttg;
 
 #include "ttg/util/bug.h"
 
+#define USE_AUTO_CONSTRAINT false
+
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 using scalar_t = double;
 using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
@@ -292,7 +294,7 @@ class SpMM25D {
       , parallel_bcasts_(parallel_bcasts) {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
-    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>();
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>(USE_AUTO_CONSTRAINT);
     bcast_a_ = std::make_unique<BcastA>(a, local_a_ijk_, b_cols_of_row_, ij_keymap_, ijk_keymap_);
     // add constraint with external mapper: key[1] represents `k`
     bcast_a_->add_constraint(constraint, [](const Key<2>& key){ return key[1]; });
@@ -302,7 +304,8 @@ class SpMM25D {
     bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
-                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_);
+                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint, k_cnt_);
+
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
       [&](Blk &c_ij, const Blk &c_ij_p) {
@@ -317,6 +320,9 @@ class SpMM25D {
     auto world = ttg::default_execution_context();
     const auto my_rank = world.rank();
     std::vector<bool> c_ij_procmask(world.size(), false);
+    std::vector<unsigned long> first_k_map(world.size(), std::numeric_limits<unsigned long>::max());
+    std::size_t max_k = a_rows_of_col_.size();
+    k_cnt_.resize(max_k+1, false);
     for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
       if (a_cols_of_row_[i].empty()) continue;
       for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) {
@@ -326,10 +332,15 @@ class SpMM25D {
           decltype(i) k;
           bool have_k;
           std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j);
+          if (have_k) {
+            k_cnt_[k] = true;
+          }
           while (have_k) {
             const auto pR = ijk_keymap_(Key<3>{i, j, k});
             assert(pR < c_ij_procmask.size());
             c_ij_procmask[pR] = true;
+            // find the first k that is needed from us by this rank
+            first_k_map[pR] = std::min(first_k_map[pR], k);
             /* get next k */
             std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k);
           }
@@ -341,6 +352,17 @@ class SpMM25D {
       }
     }
 
+    k_cnt_.push_back(true); // we always want to release the last k
+
+    // find the maximum k for which we need to release the broadcast constraint
+    unsigned long first_k = 0;
+    for (auto k : first_k_map) {
+      if (k != std::numeric_limits<unsigned long>::max()) {
+        first_k = std::max(first_k, k);
+      }
+    }
+    constraint->release(first_k);
+
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);
     TTGUNUSED(multiplyadd_);
@@ -521,11 +543,15 @@ class SpMM25D {
     MultiplyAdd(Edge<Key<3>, Blk> &a_ijk, Edge<Key<3>, Blk> &b_ijk, Edge<Key<3>, Blk> &c_ijk, Edge<Key<2>, Blk> &c,
                 const std::vector<std::vector<long>> &a_cols_of_row,
                 const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
-                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap)
+                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
+                std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
+                std::vector<bool>& k_cnt)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
         , a_cols_of_row_(a_cols_of_row)
-        , b_rows_of_col_(b_rows_of_col) {
+        , b_rows_of_col_(b_rows_of_col)
+        , k_cnt_(k_cnt)
+        , constraint(std::move(constraint)){
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
 
       // for each {i,j} determine first k that contributes AND belongs to this node,
@@ -569,6 +595,17 @@ class SpMM25D {
                  " C[",
                  i, "][", j, "]  += A[", i, "][", k, "] by B[", k, "][", j, "],  next_k? ",
                  (have_next_k ? std::to_string(next_k) : "does not exist"));
+      // release the constraint on the next round of broadcasts
+      {
+        std::size_t release_k = k;
+        while (release_k < k_cnt_.size()) {
+          ++release_k;
+          if (k_cnt_[release_k])
+            break;
+        }
+        constraint->release(release_k);
+      }
+
       // compute the contrib, pass the running total to the next flow, if needed
       // otherwise write to the result flow
       if (have_next_k) {
@@ -588,6 +625,8 @@ class SpMM25D {
    private:
     const std::vector<std::vector<long>> &a_cols_of_row_;
     const std::vector<std::vector<long>> &b_rows_of_col_;
+    std::vector<bool>& k_cnt_;
+    std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint;
 
     /* Compute the length of the remaining sequence on that tile */
     int32_t prio(const Key<3> &key) const {
@@ -733,6 +772,7 @@ class SpMM25D {
   std::unique_ptr<LocalBcastB> local_bcast_b_;
   std::unique_ptr<MultiplyAdd> multiplyadd_;
   std::unique_ptr<ReduceC> reduce_c_;
+  std::vector<bool> k_cnt_;
   Keymap2 ij_keymap_;
   Keymap3 ijk_keymap_;
   long parallel_bcasts_;
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index f624b1ed8..199fb5ed6 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -63,7 +63,7 @@ namespace ttg {
 
   template<typename Key,
            typename Ordinal = std::size_t,
-           typename Compare = std::less<Ordinal>,
+           typename Compare = std::less_equal<Ordinal>,
            typename Mapper = ttg::Void>
   struct SequencedKeysConstraint : public ConstraintBase<Key> {
 
@@ -73,7 +73,7 @@ namespace ttg {
     using compare_t = Compare;
     using base_t = ConstraintBase<Key>;
 
-  private:
+  protected:
     struct sequence_elem_t {
       std::map<ttg::TTBase*, std::vector<key_type>> m_keys;
 
@@ -89,43 +89,22 @@ namespace ttg {
       }
     };
 
-    void release_next() {
-      if (m_stopped) {
-        // don't release tasks if we're stopped
-        return;
-      }
-      // trigger the next sequence
-      sequence_elem_t elem;
-      {
-        // extract the next sequence
-        auto g = this->lock_guard();
-        auto it = m_sequence.begin(); // sequence is ordered by ordinal
-        if (it == m_sequence.end()) {
-          return; // nothing to be done
-        }
-        m_current = it->first;
-        elem = std::move(it->second);
-        m_sequence.erase(it);
-      }
-
-      for (auto& seq : elem.m_keys) {
-        // account for the newly active keys
-        m_active.fetch_add(seq.second.size(), std::memory_order_relaxed);
-        this->notify_listener(std::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
-      }
-    }
-
     bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
       if (!m_stopped) {
         if (m_order(ord, m_current)) {
           // key should be executed
-          m_active.fetch_add(1, std::memory_order_relaxed);
+          if (m_auto_release) { // only needed for auto-release
+            m_active.fetch_add(1, std::memory_order_relaxed);
+          }
           // reset current to the lower ordinal
           m_current = ord;
           return true;
-        } else if (m_sequence.empty() && 0 == m_active.load(std::memory_order_relaxed)) {
+        } else if (m_sequence.empty() && m_auto_release && 0 == m_active.load(std::memory_order_relaxed)) {
           // there are no keys (active or blocked) so we execute to avoid a deadlock
           // we don't change the current ordinal because there may be lower ordinals coming in later
+          // NOTE: there is a race condition between the check here and the increment above.
+          //       This is mostly benign as it can lead to out-of-sequence released tasks.
+          //       Avoiding this race would incur significant overheads.
           m_active.fetch_add(1, std::memory_order_relaxed);
           return true;
         }
@@ -146,67 +125,99 @@ namespace ttg {
       return false;
     }
 
+
     void complete_key_impl() {
-      auto active = m_active.fetch_sub(1, std::memory_order_relaxed) - 1;
-      if (0 == active) {
-        release_next();
+      if (m_auto_release) {
+        auto active = m_active.fetch_sub(1, std::memory_order_relaxed) - 1;
+        if (0 == active) {
+          release_next();
+        }
+      }
+    }
+
+    // used in the auto case
+    void release_next() {
+      if (this->m_stopped) {
+        // don't release tasks if we're stopped
+        return;
+      }
+      // trigger the next sequence
+      sequence_elem_t elem;
+      {
+        // extract the next sequence
+        auto g = this->lock_guard();
+        auto it = this->m_sequence.begin(); // sequence is ordered by ordinal
+        if (it == this->m_sequence.end()) {
+          return; // nothing to be done
+        }
+        this->m_current = it->first;
+        elem = std::move(it->second);
+        this->m_sequence.erase(it);
+      }
+
+      for (auto& seq : elem.m_keys) {
+        // account for the newly active keys
+        this->m_active.fetch_add(seq.second.size(), std::memory_order_relaxed);
+        this->notify_listener(std::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
       }
     }
 
 
+    // used in the non-auto case
+    void release_next(ordinal_type ord, bool force_check = false) {
+      if (this->m_stopped) {
+        // don't release tasks if we're stopped
+        return;
+      }
+      if (!force_check && m_order(ord, this->m_current)) {
+        return; // already at the provided ordinal, nothing to be done
+      }
+      // set current ordinal
+      this->m_current = ord;
+      // trigger the next sequence(s) (m_sequence is ordered by ordinal)
+      std::vector<sequence_elem_t> seqs;
+      {
+        auto g = this->lock_guard();
+        for (auto it = this->m_sequence.begin(); it != this->m_sequence.end(); it = this->m_sequence.begin()) {
+          if (!this->m_order(it->first, this->m_current)) break;
+          // extract the next sequence
+          this->m_current = it->first;
+          seqs.push_back(std::move(it->second));
+          this->m_sequence.erase(it);
+        }
+      }
+      for (auto& elem : seqs) {
+        for (auto& e : elem.m_keys) {
+          // account for the newly active keys
+          this->notify_listener(std::span<key_type>(e.second.data(), e.second.size()), e.first);
+        }
+      }
+    }
+
   public:
 
     /**
      * Used for external key mapper.
      */
-    SequencedKeysConstraint()
+    SequencedKeysConstraint(bool auto_release = false)
     : base_t()
+    , m_auto_release(auto_release)
     { }
 
     template<typename Mapper_, typename = std::enable_if_t<std::is_invocable_v<Mapper_, Key>, Mapper_>>
-    SequencedKeysConstraint(Mapper_&& map)
+    SequencedKeysConstraint(Mapper_&& map, bool auto_release)
     : base_t()
     , m_map(std::forward<Mapper_>(map))
+    , m_auto_release(auto_release)
     { }
 
-    SequencedKeysConstraint(SequencedKeysConstraint&& skc)
-    : base_t(std::move(skc))
-    , m_sequence(std::move(skc.m_sequence))
-    , m_active(skc.m_active.load(std::memory_order_relaxed))
-    , m_current(std::move(skc.m_current))
-    , m_map(std::move(skc.m_map))
-    , m_order(std::move(skc.m_order))
-    , m_stopped(skc.m_stopped)
-    { }
+    SequencedKeysConstraint(SequencedKeysConstraint&& skc) = default;
 
-    SequencedKeysConstraint(const SequencedKeysConstraint& skc)
-    : base_t(skc)
-    , m_sequence(skc.m_sequence)
-    , m_active(skc.m_active.load(std::memory_order_relaxed))
-    , m_current(skc.m_current)
-    , m_map(skc.m_map)
-    , m_order(skc.m_order)
-    , m_stopped(skc.m_stopped)
-    { }
+    SequencedKeysConstraint(const SequencedKeysConstraint& skc) = default;
 
-    SequencedKeysConstraint& operator=(SequencedKeysConstraint&& skc) {
-      base_t::operator=(std::move(skc));
-      m_sequence = std::move(skc.m_sequence);
-      m_active = skc.m_active.load(std::memory_order_relaxed);
-      m_current = std::move(skc.m_current);
-      m_map = std::move(skc.m_map);
-      m_order = std::move(skc.m_order);
-      m_stopped = skc.m_stopped;
-    }
-    SequencedKeysConstraint& operator=(const SequencedKeysConstraint& skc) {
-      base_t::operator=(skc);
-      m_sequence = skc.m_sequence;
-      m_active = skc.m_active.load(std::memory_order_relaxed);
-      m_current = skc.m_current;
-      m_map = skc.m_map;
-      m_order = skc.m_order;
-      m_stopped = skc.m_stopped;
-    }
+    SequencedKeysConstraint& operator=(SequencedKeysConstraint&& skc) = default;
+
+    SequencedKeysConstraint& operator=(const SequencedKeysConstraint& skc) = default;
 
     virtual ~SequencedKeysConstraint() = default;
 
@@ -217,49 +228,49 @@ namespace ttg {
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
     check(const key_type& key, ttg::TTBase *tt) {
       ordinal_type ord = m_map(key);
-      return check_key_impl(key, ord, tt);
+      return this->check_key_impl(key, ord, tt);
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
     check(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
-      return check_key_impl(key, ord, tt);
+      return this->check_key_impl(key, ord, tt);
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
     check(ttg::TTBase *tt) {
-      return check_key_impl(ttg::Void{}, m_map(), tt);
+      return this->check_key_impl(ttg::Void{}, m_map(), tt);
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
     check(ordinal_type ord, ttg::TTBase *tt) {
-      return check_key_impl(ttg::Void{}, ord, tt);
+      return this->check_key_impl(ttg::Void{}, ord, tt);
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
     complete(const key_type& key, ttg::TTBase *tt) {
-      complete_key_impl();
+      this->complete_key_impl();
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
     complete(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
-      complete_key_impl();
+      this->complete_key_impl();
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
     complete(Ordinal ord, ttg::TTBase *tt) {
-      complete_key_impl();
+      this->complete_key_impl();
     }
 
     template<typename Key_ = key_type, typename Mapper_ = Mapper>
     std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
     complete(ttg::TTBase *tt) {
-      complete_key_impl();
+      this->complete_key_impl();
     }
 
     /**
@@ -279,23 +290,41 @@ namespace ttg {
     void start() {
       if (m_stopped) {
         m_stopped = false;
+        release_next(m_current, true); // force the check for a next release even if the current ordinal hasn't changed
+      }
+    }
+
+    /**
+     * Release tasks up to the ordinal. The provided ordinal is ignored if `auto_release` is enabled.
+     */
+    void release(ordinal_type ord = 0) {
+      if (m_auto_release) {
+        // last key for this ordinal, release the next
+        // the provided ordinal is ignored
         release_next();
+      } else {
+        release_next(ord);
       }
     }
 
+    bool is_auto() const {
+      return m_auto_release;
+    }
+
 
-  private:
+  protected:
     std::map<ordinal_type, sequence_elem_t, compare_t> m_sequence;
-    std::atomic<std::size_t> m_active;
-    ordinal_type m_current;
+    ordinal_type m_current = std::numeric_limits<ordinal_type>::min();
     [[no_unique_address]]
     Mapper m_map;
     [[no_unique_address]]
     compare_t m_order;
+    std::atomic<std::size_t> m_active;
     bool m_stopped = false;
+    bool m_auto_release = false;
   };
 
-  // deduction guide: take type of first argument to Mapper as the key type
+  // deduction guides: take type of first argument to Mapper as the key type
   // TODO: can we use the TTG callable_args instead?
   template<typename Mapper, typename = std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>>>
   SequencedKeysConstraint(Mapper&&)
@@ -314,11 +343,36 @@ namespace ttg {
   SequencedKeysConstraint(const SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>&)
     -> SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>;
 
+  /**
+   * Make a constraint that can be shared between multiple TT instances.
+   * Overload for incomplete templated constraint types.
+   *
+   * Example:
+   * // SequencedKeysConstraint is incomplete
+   * auto c = ttg::make_shared_constraint<SequencedKeysConstraint>([](Key& k){ return k[0]; });
+   * auto tt_a = ttg::make_tt<Key>(...);
+   * tt_a->add_constraint(c);
+   * auto tt_b = ttg::make_tt<Key>(...);
+   * tt_b->add_constraint(c);
+   *
+   * -> the constraint will handle keys from both tt_a and tt_b. Both TTs must have the same key type.
+   */
   template<template<typename...> typename Constraint, typename... Args>
   auto make_shared_constraint(Args&&... args) {
-    return std::make_shared<decltype(Constraint(std::forward<Args>(args)...))>(Constraint(std::forward<Args>(args)...));
+    return std::make_shared<decltype(Constraint(std::forward<Args>(args)...))>(std::forward<Args>(args)...);
   }
 
+  /**
+   * Make a constraint that can be shared between multiple TT instances.
+   * Overload for complete constraint types.
+   */
+  template<typename Constraint, typename... Args>
+  auto make_shared_constraint(Args&&... args) {
+    return std::make_shared<Constraint>(std::forward<Args>(args)...);
+  }
+
+
+
 } // namespace ttg
 
 #endif // TTG_CONSTRAINT_H
\ No newline at end of file

From 3ff9e6b1f6c66aa3ed35d3c4fba97496a78ce202 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Wed, 6 Mar 2024 15:29:12 -0500
Subject: [PATCH 06/57] Modify constraint to make auto-progress optional

The constructor of the SequencedKeysConstraint takes a Boolean
argument that determines whether tasks are automatically released
or whether we depend on the application to release the next wave of tasks.

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 examples/spmm/spmm.cc | 13 +++++++++----
 ttg/ttg/constraint.h  | 26 ++++++++++++++++----------
 ttg/ttg/parsec/task.h | 10 ++++++----
 ttg/ttg/parsec/ttg.h  |  2 +-
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 5806ce9ef..42e39dce6 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -304,7 +304,8 @@ class SpMM25D {
     bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
-                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint, k_cnt_);
+                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
+                                                 k_cnt_, parallel_bcasts);
 
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
@@ -545,13 +546,15 @@ class SpMM25D {
                 const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
                 const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
                 std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
-                std::vector<bool>& k_cnt)
+                std::vector<bool>& k_cnt,
+                std::size_t parallel_bcasts)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
         , a_cols_of_row_(a_cols_of_row)
         , b_rows_of_col_(b_rows_of_col)
         , k_cnt_(k_cnt)
-        , constraint(std::move(constraint)){
+        , constraint(std::move(constraint))
+        , parallel_bcasts_(parallel_bcasts) {
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
 
       // for each {i,j} determine first k that contributes AND belongs to this node,
@@ -598,9 +601,10 @@ class SpMM25D {
       // release the constraint on the next round of broadcasts
       {
         std::size_t release_k = k;
+        std::size_t bcasts_ahead = parallel_bcasts_;
         while (release_k < k_cnt_.size()) {
           ++release_k;
-          if (k_cnt_[release_k])
+          if (k_cnt_[release_k] && --bcasts_ahead)
             break;
         }
         constraint->release(release_k);
@@ -627,6 +631,7 @@ class SpMM25D {
     const std::vector<std::vector<long>> &b_rows_of_col_;
     std::vector<bool>& k_cnt_;
     std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint;
+    std::size_t parallel_bcasts_;
 
     /* Compute the length of the remaining sequence on that tile */
     int32_t prio(const Key<3> &key) const {
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index 199fb5ed6..311bf3ead 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -78,6 +78,10 @@ namespace ttg {
       std::map<ttg::TTBase*, std::vector<key_type>> m_keys;
 
       sequence_elem_t() = default;
+      sequence_elem_t(sequence_elem_t&&) = default;
+      sequence_elem_t(const sequence_elem_t&) = default;
+      sequence_elem_t& operator=(sequence_elem_t&&) = default;
+      sequence_elem_t& operator=(const sequence_elem_t&) = default;
 
       void add_key(const key_type& key, ttg::TTBase* tt) {
         auto it = m_keys.find(tt);
@@ -95,9 +99,9 @@ namespace ttg {
           // key should be executed
           if (m_auto_release) { // only needed for auto-release
             m_active.fetch_add(1, std::memory_order_relaxed);
+            // revert the current ordinal to the lower ordinal
+            m_current = ord;
           }
-          // reset current to the lower ordinal
-          m_current = ord;
           return true;
         } else if (m_sequence.empty() && m_auto_release && 0 == m_active.load(std::memory_order_relaxed)) {
           // there are no keys (active or blocked) so we execute to avoid a deadlock
@@ -172,18 +176,20 @@ namespace ttg {
       if (!force_check && m_order(ord, this->m_current)) {
         return; // already at the provided ordinal, nothing to be done
       }
-      // set current ordinal
-      this->m_current = ord;
       // trigger the next sequence(s) (m_sequence is ordered by ordinal)
       std::vector<sequence_elem_t> seqs;
       {
         auto g = this->lock_guard();
-        for (auto it = this->m_sequence.begin(); it != this->m_sequence.end(); it = this->m_sequence.begin()) {
-          if (!this->m_order(it->first, this->m_current)) break;
-          // extract the next sequence
-          this->m_current = it->first;
-          seqs.push_back(std::move(it->second));
-          this->m_sequence.erase(it);
+        // set current ordinal
+        this->m_current = ord;
+        {
+          for (auto it = this->m_sequence.begin(); it != this->m_sequence.end();) {
+            if (!this->m_order(it->first, this->m_current)) break;
+            // extract the next sequence
+            this->m_current = it->first;
+            seqs.push_back(std::move(it->second));
+            it = this->m_sequence.erase(it);
+          }
         }
       }
       for (auto& elem : seqs) {
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 656117b94..8e0575772 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -209,8 +209,9 @@ namespace ttg_parsec {
       device_state_t<TT::derived_has_device_op()> dev_state;
       ttg_data_copy_t *copies[num_copies] = { nullptr };  // the data copies tracked by this task
 
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies)
+          , tt(tt_ptr) {
         tt_ht_item.key = pkey();
         this->dev_ptr = this->dev_state.dev_ptr();
         // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
@@ -276,8 +277,9 @@ namespace ttg_parsec {
       ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
                                                              // +1 for the copy needed during send/bcast
 
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies)
+          , tt(tt_ptr) {
         tt_ht_item.key = pkey();
         this->dev_ptr = this->dev_state.dev_ptr();
       }
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index e82cb42a1..01cb8bcab 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -2001,7 +2001,7 @@ namespace ttg_parsec {
       task_t *dummy;
       parsec_execution_stream_s *es = world.impl().execution_stream();
       parsec_thread_mempool_t *mempool = get_task_mempool();
-      dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self);
+      dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self, this);
       dummy->set_dummy(true);
       // TODO: do we need to copy static_stream_goal in dummy?
 

From e789a90bbcfec41235c3916d0fa055de60869d81 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Tue, 12 Mar 2024 15:28:44 -0400
Subject: [PATCH 07/57] SPMM: release next bcast once all prior gemms have
 executed

Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 examples/spmm/spmm.cc | 48 +++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 42e39dce6..2be1d8ec5 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -289,6 +289,7 @@ class SpMM25D {
       , b_rows_of_col_(b_rows_of_col)
       , a_rows_of_col_(a_rows_of_col)
       , b_cols_of_row_(b_cols_of_row)
+      , k_cnt_(a_cols_of_row_.size()+1)
       , ij_keymap_(std::move(ij_keymap))
       , ijk_keymap_(std::move(ijk_keymap))
       , parallel_bcasts_(parallel_bcasts) {
@@ -323,7 +324,8 @@ class SpMM25D {
     std::vector<bool> c_ij_procmask(world.size(), false);
     std::vector<unsigned long> first_k_map(world.size(), std::numeric_limits<unsigned long>::max());
     std::size_t max_k = a_rows_of_col_.size();
-    k_cnt_.resize(max_k+1, false);
+    std::vector<std::size_t> k_cnt;
+    k_cnt.resize(a_cols_of_row_.size(), 0);
     for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
       if (a_cols_of_row_[i].empty()) continue;
       for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) {
@@ -333,12 +335,10 @@ class SpMM25D {
           decltype(i) k;
           bool have_k;
           std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j);
-          if (have_k) {
-            k_cnt_[k] = true;
-          }
           while (have_k) {
             const auto pR = ijk_keymap_(Key<3>{i, j, k});
             assert(pR < c_ij_procmask.size());
+            k_cnt[k]++;
             c_ij_procmask[pR] = true;
             // find the first k that is needed from us by this rank
             first_k_map[pR] = std::min(first_k_map[pR], k);
@@ -353,16 +353,23 @@ class SpMM25D {
       }
     }
 
-    k_cnt_.push_back(true); // we always want to release the last k
+    k_cnt.push_back(1); // we always want to release the last k
+    assert(k_cnt.size() == k_cnt_.size());
+    // copy into atomic counters
+    std::size_t i = 0;
+    for (auto c : k_cnt) {
+      assert(i < k_cnt_.size());
+      k_cnt_[i++].store(c, std::memory_order_relaxed);
+    }
 
-    // find the maximum k for which we need to release the broadcast constraint
-    unsigned long first_k = 0;
-    for (auto k : first_k_map) {
-      if (k != std::numeric_limits<unsigned long>::max()) {
-        first_k = std::max(first_k, k);
-      }
+    // release the first bcast(s)
+    auto pbcasts = parallel_bcasts_;
+    auto release_k = k_cnt.begin();
+    while (pbcasts-- > 0 && (k_cnt.end() != release_k))
+    {
+      release_k = std::find_if(release_k, k_cnt.end(), [](std::size_t c){ return c > 0; });
     }
-    constraint->release(first_k);
+    constraint->release(*release_k);
 
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);
@@ -546,7 +553,7 @@ class SpMM25D {
                 const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
                 const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
                 std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
-                std::vector<bool>& k_cnt,
+                std::vector<std::atomic<std::size_t>>& k_cnt,
                 std::size_t parallel_bcasts)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
@@ -602,12 +609,13 @@ class SpMM25D {
       {
         std::size_t release_k = k;
         std::size_t bcasts_ahead = parallel_bcasts_;
-        while (release_k < k_cnt_.size()) {
-          ++release_k;
-          if (k_cnt_[release_k] && --bcasts_ahead)
-            break;
+        assert(k_cnt_.size() > release_k);
+        if (0 == k_cnt_[release_k].fetch_sub(1, std::memory_order_relaxed)-1) {
+          // this was the last gemm in this k, find the one to release
+          while (++release_k < k_cnt_.size() && (0 == k_cnt_[release_k].load(std::memory_order_relaxed) || --bcasts_ahead > 0))
+          { }
+          constraint->release(release_k);
         }
-        constraint->release(release_k);
       }
 
       // compute the contrib, pass the running total to the next flow, if needed
@@ -629,7 +637,7 @@ class SpMM25D {
    private:
     const std::vector<std::vector<long>> &a_cols_of_row_;
     const std::vector<std::vector<long>> &b_rows_of_col_;
-    std::vector<bool>& k_cnt_;
+    std::vector<std::atomic<std::size_t>>& k_cnt_;
     std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint;
     std::size_t parallel_bcasts_;
 
@@ -777,7 +785,7 @@ class SpMM25D {
   std::unique_ptr<LocalBcastB> local_bcast_b_;
   std::unique_ptr<MultiplyAdd> multiplyadd_;
   std::unique_ptr<ReduceC> reduce_c_;
-  std::vector<bool> k_cnt_;
+  std::vector<std::atomic<std::size_t>> k_cnt_;
   Keymap2 ij_keymap_;
   Keymap3 ijk_keymap_;
   long parallel_bcasts_;

From 99f03b6c8cade3776cc3d84b0f3d97d378d1643f Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 4 Jun 2024 17:36:01 -0400
Subject: [PATCH 08/57] Replace std::span with ttg::span

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg.h            |  1 +
 ttg/ttg/constraint.h | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/ttg/ttg.h b/ttg/ttg.h
index 50a891c1d..8dd86a636 100644
--- a/ttg/ttg.h
+++ b/ttg/ttg.h
@@ -26,6 +26,7 @@
 #include "ttg/util/print.h"
 #include "ttg/world.h"
 
+#include "ttg/constraint.h"
 #include "ttg/edge.h"
 
 #include "ttg/ptr.h"
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index 311bf3ead..54f070f73 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -7,6 +7,8 @@
 #include <mutex>
 #include <map>
 
+#include "ttg/util/span.h"
+
 #ifdef TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS
 #include <ttg/external/boost/callable_traits.hpp>
 #else
@@ -18,7 +20,7 @@ namespace ttg {
   template<typename Key>
   struct ConstraintBase {
     using key_type = Key;
-    using listener_t = std::function<void(const std::span<key_type>&)>;
+    using listener_t = std::function<void(const ttg::span<key_type>&)>;
 
     ConstraintBase()
     { }
@@ -45,7 +47,7 @@ namespace ttg {
       m_listeners.insert_or_assign(tt, std::move(l));
     }
 
-    void notify_listener(const std::span<key_type>& keys, ttg::TTBase* tt) {
+    void notify_listener(const ttg::span<key_type>& keys, ttg::TTBase* tt) {
       auto& release = m_listeners[tt];
       release(keys);
     }
@@ -162,7 +164,7 @@ namespace ttg {
       for (auto& seq : elem.m_keys) {
         // account for the newly active keys
         this->m_active.fetch_add(seq.second.size(), std::memory_order_relaxed);
-        this->notify_listener(std::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
+        this->notify_listener(ttg::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
       }
     }
 
@@ -195,7 +197,7 @@ namespace ttg {
       for (auto& elem : seqs) {
         for (auto& e : elem.m_keys) {
           // account for the newly active keys
-          this->notify_listener(std::span<key_type>(e.second.data(), e.second.size()), e.first);
+          this->notify_listener(ttg::span<key_type>(e.second.data(), e.second.size()), e.first);
         }
       }
     }

From ef8bd3bb11a61a65b82c75cb0535ed838fe4fd86 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 11 Jul 2024 14:33:03 -0400
Subject: [PATCH 09/57] SPMM: Adjust maxTs if only minTs is provided

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 2be1d8ec5..7e0232525 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -1690,6 +1690,9 @@ int main(int argc, char **argv) {
         minTs = parseOption(minTsStr, 32);
         std::string maxTsStr(getCmdOption(argv, argv + argc, "-T"));
         maxTs = parseOption(maxTsStr, 256);
+        if (minTs >= maxTs) {
+          maxTs = minTs;
+        }
         std::string avgStr(getCmdOption(argv, argv + argc, "-a"));
         double avg = parseOption(avgStr, 0.3);
         timing = (check == 0);

From d916ae0f0654df60d5fdfce882f2e578ee391866 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 11 Jul 2024 14:33:57 -0400
Subject: [PATCH 10/57] SPMM: Fix up-front release of k tasks

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 7e0232525..bf1fc0a89 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -365,11 +365,13 @@ class SpMM25D {
     // release the first bcast(s)
     auto pbcasts = parallel_bcasts_;
     auto release_k = k_cnt.begin();
+    release_k_ = release_k; // this will be released
     while (pbcasts-- > 0 && (k_cnt.end() != release_k))
     {
-      release_k = std::find_if(release_k, k_cnt.end(), [](std::size_t c){ return c > 0; });
+      release_k_ = release_k;
+      release_k = std::find_if(release_k++, k_cnt.end(), [](std::size_t c){ return c > 0; });
     }
-    constraint->release(*release_k);
+    constraint->release(*release_k_);
 
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);

From 95df973662fa29238553447a7993d5c3d57df3f6 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 11 Jul 2024 14:50:22 -0400
Subject: [PATCH 11/57] SPMM: second try at initial release of k

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index bf1fc0a89..bd0982c06 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -365,12 +365,15 @@ class SpMM25D {
     // release the first bcast(s)
     auto pbcasts = parallel_bcasts_;
     auto release_k = k_cnt.begin();
-    release_k_ = release_k; // this will be released
-    while (pbcasts-- > 0 && (k_cnt.end() != release_k))
-    {
+    auto release_k_ = release_k; // this will be released
+    do {
+      release_k = std::find_if(release_k, k_cnt.end(), [](std::size_t c){ return c > 0; });
+      if (k_cnt.end() == release_k) {
+        break;
+      }
       release_k_ = release_k;
-      release_k = std::find_if(release_k++, k_cnt.end(), [](std::size_t c){ return c > 0; });
-    }
+      ++release_k;
+    } while (--pbcasts > 0);
     constraint->release(*release_k_);
 
     TTGUNUSED(bcast_a_);

From fbfdf0cee5756779e159130a5473441004cd1343 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 16 Jul 2024 18:05:44 -0400
Subject: [PATCH 12/57] SPMM: Fix initial constraint release of k

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index bd0982c06..7cf656e4e 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -289,10 +289,10 @@ class SpMM25D {
       , b_rows_of_col_(b_rows_of_col)
       , a_rows_of_col_(a_rows_of_col)
       , b_cols_of_row_(b_cols_of_row)
-      , k_cnt_(a_cols_of_row_.size()+1)
+      , k_cnt_(a_rows_of_col_.size()+1)
       , ij_keymap_(std::move(ij_keymap))
       , ijk_keymap_(std::move(ijk_keymap))
-      , parallel_bcasts_(parallel_bcasts) {
+      , parallel_bcasts_(std::max(parallel_bcasts, 1L)) {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
     auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>(USE_AUTO_CONSTRAINT);
@@ -306,7 +306,7 @@ class SpMM25D {
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
                                                  b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
-                                                 k_cnt_, parallel_bcasts);
+                                                 k_cnt_, parallel_bcasts_);
 
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
@@ -325,7 +325,8 @@ class SpMM25D {
     std::vector<unsigned long> first_k_map(world.size(), std::numeric_limits<unsigned long>::max());
     std::size_t max_k = a_rows_of_col_.size();
     std::vector<std::size_t> k_cnt;
-    k_cnt.resize(a_cols_of_row_.size(), 0);
+    k_cnt.resize(a_rows_of_col_.size(), 0);
+    int release_k = 0;
     for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
       if (a_cols_of_row_[i].empty()) continue;
       for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) {
@@ -362,19 +363,12 @@ class SpMM25D {
       k_cnt_[i++].store(c, std::memory_order_relaxed);
     }
 
-    // release the first bcast(s)
-    auto pbcasts = parallel_bcasts_;
-    auto release_k = k_cnt.begin();
-    auto release_k_ = release_k; // this will be released
+    /* release the first parallel_bcasts_ k that are non-zero */
+    auto k_cnt_iter = k_cnt.begin();
     do {
-      release_k = std::find_if(release_k, k_cnt.end(), [](std::size_t c){ return c > 0; });
-      if (k_cnt.end() == release_k) {
-        break;
-      }
-      release_k_ = release_k;
-      ++release_k;
-    } while (--pbcasts > 0);
-    constraint->release(*release_k_);
+      k_cnt_iter = std::find_if(k_cnt_iter, k_cnt.end(), [](auto c){ return c > 0; });
+    } while (++k_cnt_iter != k_cnt.end() && std::distance(k_cnt_iter, k_cnt.end()) < parallel_bcasts_);
+    constraint->release(std::distance(k_cnt.begin(), k_cnt_iter));
 
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);
@@ -612,12 +606,16 @@ class SpMM25D {
                  (have_next_k ? std::to_string(next_k) : "does not exist"));
       // release the constraint on the next round of broadcasts
       {
-        std::size_t release_k = k;
-        std::size_t bcasts_ahead = parallel_bcasts_;
-        assert(k_cnt_.size() > release_k);
-        if (0 == k_cnt_[release_k].fetch_sub(1, std::memory_order_relaxed)-1) {
-          // this was the last gemm in this k, find the one to release
-          while (++release_k < k_cnt_.size() && (0 == k_cnt_[release_k].load(std::memory_order_relaxed) || --bcasts_ahead > 0))
+        assert(k_cnt_.size() > k);
+        long cnt = k_cnt_[k].fetch_sub(1, std::memory_order_relaxed)-1;
+        assert(cnt >= 0);
+        if (0 == cnt) {
+          auto release_k = k;
+          auto bcasts_ahead = parallel_bcasts_;
+          // this was the last gemm in this k, find the next one to release
+          while (++release_k < k_cnt_.size() &&
+                 (0 == k_cnt_[release_k].load(std::memory_order_relaxed)
+                  || --bcasts_ahead > 0))
           { }
           constraint->release(release_k);
         }

From bebd5d8023e9ca8aa7f6aa0ab72cd4f73136a272 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 22 Jul 2024 09:17:51 -0400
Subject: [PATCH 13/57] Add stubs for constraints to MADNESS backend

Constraints will be ignored in the MADNESS backend for now.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/madness/ttg.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index 5d2360cfb..4d4f53b4f 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -1205,6 +1205,23 @@ namespace ttg_madness {
       priomap = std::forward<Priomap>(pm);
     }
 
+    /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(Constraint&& c) {
+      /* currently a noop */
+    }
+
+    template<typename Constraint, typename Mapper>
+    void add_constraint(std::shared_ptr<Constraint> c, Mapper&& map) {
+      /* currently a noop */
+    }
+
+    template<typename Constraint, typename Mapper>
+    void add_constraint(Constraint c, Mapper&& map) {
+      /* currently a noop */
+    }
+
     /// implementation of TTBase::make_executable()
     void make_executable() override {
       TTBase::make_executable();

From c12a88c01285126d9d8b4763e37eb811a8536cb9 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 08:41:56 -0400
Subject: [PATCH 14/57] typo

---
 ttg/ttg/parsec/task.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 8e0575772..010174243 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -196,7 +196,7 @@ namespace ttg_parsec {
     struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
       using key_type = typename TT::key_type;
       static constexpr size_t num_streams = TT::numins;
-      /* device tasks may have to store more copies than it's inputs as their sends are aggregated */
+      /* device tasks may have to store more copies than # of its inputs as their sends are aggregated */
       static constexpr size_t num_copies  = TT::derived_has_device_op() ? static_cast<size_t>(MAX_PARAM_COUNT)
                                                                       : (num_streams+1);
       TT* tt = nullptr;

From 4c6cc1544d7e72107712f0184388eb12d1a0fc48 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 08:42:34 -0400
Subject: [PATCH 15/57] bump TA+MAD tags

---
 cmake/modules/ExternalDependenciesVersions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 1429de0bf..ac3271567 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -3,10 +3,10 @@
 
 set(TTG_TRACKED_VG_CMAKE_KIT_TAG 092efee765e039b02e0a9aaf013c12fc3c4e89cf)  # used to provide "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
-set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3)
+set(TTG_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
-set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73)
+set(TTG_TRACKED_TILEDARRAY_TAG 5204c06cf978892ee04503b476162d1c5cefd9de)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
 set(TTG_OLDEST_BOOST_VERSION 1.66)

From 2a8621c4a7a59ba5fd95123eb30f5ba735eef1b8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 08:44:18 -0400
Subject: [PATCH 16/57] introduced ttg::matrix::Triplet (replacement for
 Eigen::Triplet) to enable moves

---
 examples/spmm/spmm.cc      | 11 +++--
 examples/spmm/spmm_cuda.cc | 12 +++--
 examples/ttg_matrix.h      | 97 ++++++++++++++++++++++++++------------
 3 files changed, 79 insertions(+), 41 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 7cf656e4e..c09e908fb 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -29,6 +29,7 @@
 #endif
 
 #include "ttg.h"
+#include "../ttg_matrix.h"
 
 using namespace ttg;
 
@@ -103,7 +104,7 @@ using blk_t = double;
 template <typename T = blk_t>
 using SpMatrix = Eigen::SparseMatrix<T>;
 template <typename T = blk_t>
-using SpMatrixTriplet = Eigen::Triplet<T>;  // {row,col,value}
+using SpMatrixTriplet = ttg::matrix::Triplet<T>;  // {row,col,value}
 
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 
@@ -990,7 +991,7 @@ static void initSpRmat(const std::function<int(const Key<2> &)> &keymap, const c
   boost::minstd_rand gen(seed);
   boost::rmat_iterator<boost::minstd_rand, boost::directed_graph<>> rmat_it(gen, N, E, a, b, c, d);
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   for (int i = 0; i < N; i++) {
     nnz++;
@@ -1026,7 +1027,7 @@ static void initSpHardCoded(const std::function<int(const Key<2> &)> &keymap, Sp
   C.resize(m, n);
   // We initialize the same matrices on all the ranks, but we will use only the local part
   // following the keymap
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   A_elements.emplace_back(0, 1, 12.3);
   A_elements.emplace_back(0, 2, 10.7);
@@ -1073,7 +1074,7 @@ static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap,
 
   int rank = ttg::default_execution_context().rank();
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> Aref_elements;
 #if defined(BTAS_IS_USABLE)
@@ -1239,7 +1240,7 @@ static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, siz
   std::mt19937 genv(seed + 1);
 
   std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> B_elements;
   std::vector<triplet_t> Aref_elements;
diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
index 9dcf5928e..6597c0921 100644
--- a/examples/spmm/spmm_cuda.cc
+++ b/examples/spmm/spmm_cuda.cc
@@ -54,6 +54,8 @@ using namespace ttg;
 #include <sys/time.h>
 #endif
 
+#include "../ttg_matrix.h"
+
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 
 template <typename _T, class _Range, class _Storage>
@@ -399,7 +401,7 @@ using blk_t = double;
 template <typename T = blk_t>
 using SpMatrix = Eigen::SparseMatrix<T>;
 template <typename T = blk_t>
-using SpMatrixTriplet = Eigen::Triplet<T>;  // {row,col,value}
+using SpMatrixTriplet = ttg::matrix::Triplet<T>;  // {row,col,value}
 
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 
@@ -1197,7 +1199,7 @@ static void initSpRmat(const std::function<int(const Key<2> &)> &keymap, const c
   boost::minstd_rand gen(seed);
   boost::rmat_iterator<boost::minstd_rand, boost::directed_graph<>> rmat_it(gen, N, E, a, b, c, d);
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   for (int i = 0; i < N; i++) {
     nnz++;
@@ -1232,7 +1234,7 @@ static void initSpHardCoded(const std::function<int(const Key<2> &)> &keymap, Sp
   C.resize(m, n);
   // We initialize the same matrices on all the ranks, but we will use only the local part
   // following the keymap
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   A_elements.emplace_back(0, 1, 12.3);
   A_elements.emplace_back(0, 2, 10.7);
@@ -1279,7 +1281,7 @@ static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap,
 
   int rank = ttg::default_execution_context().rank();
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> Aref_elements;
 #if defined(BTAS_IS_USABLE)
@@ -1446,7 +1448,7 @@ static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, siz
   std::mt19937 genv(seed + 1);
 
   std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> B_elements;
   std::vector<triplet_t> Aref_elements;
diff --git a/examples/ttg_matrix.h b/examples/ttg_matrix.h
index 3478e2aa4..ce563cb4a 100644
--- a/examples/ttg_matrix.h
+++ b/examples/ttg_matrix.h
@@ -8,11 +8,46 @@
 #include <vector>
 
 #include "ttg/serialization/std/vector.h"
+#include "ttg/util/multiindex.h"
+
+#include <Eigen/SparseCore>
 
 namespace ttg {
 
   namespace matrix {
 
+    /// element of a sparse matrix = {row index, col index, value}
+
+    /// movable replacement for Eigen::Triplet
+    template<typename Value, typename StorageIndex=typename Eigen::SparseMatrix<Value>::StorageIndex >
+    class Triplet {
+     public:
+      Triplet() = default;
+      Triplet(const Triplet&) = default;
+      Triplet(Triplet&&) = default;
+      Triplet& operator=(const Triplet&) = default;
+      Triplet& operator=(Triplet&&) = default;
+
+      Triplet(StorageIndex r, const StorageIndex c, const Value& v)
+          : m_row(r), m_col(c), m_value(v)
+      {}
+      Triplet(StorageIndex r, const StorageIndex c, Value&& v = Value{})
+          : m_row(r), m_col(c), m_value(std::move(v))
+      {}
+
+      /** \returns the row index of the element */
+      const StorageIndex& row() const { return m_row; }
+
+      /** \returns the column index of the element */
+      const StorageIndex& col() const { return m_col; }
+
+      /** \returns the value of the element */
+      const Value& value() const { return m_value; }
+     protected:
+      StorageIndex m_row = -1, m_col = -1;
+      Value m_value;
+    };
+
     // matrix shape = maps {column,row} index to {row,column} indices
     class Shape : public std::vector<std::vector<long>> {
       using base_t = std::vector<std::vector<long>>;
@@ -192,22 +227,22 @@ namespace ttg {
     }
 
     // compute shape of an existing SpMatrix on rank 0
-    template <typename Blk = blk_t>
+    template <typename Blk>
     class ReadShape : public TT<void, std::tuple<Out<void, Shape>>, ReadShape<Blk>, ttg::typelist<void>> {
      public:
       using baseT = typename ReadShape::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      ReadShape(const char *label, const SpMatrix<Blk> &matrix, Edge<void, void> &in, Edge<void, Shape> &out)
+      ReadShape(const char *label, const Eigen::SparseMatrix<Blk> &matrix, Edge<void, void> &in, Edge<void, Shape> &out)
           : baseT(edges(in), edges(out), std::string("read_spmatrix_shape(") + label + ")", {"ctl"},
                   {std::string("shape[") + label + "]"},
                   /* keymap */ []() { return owner; })
           , matrix_(matrix) {}
 
-      void op(std::tuple<Out<void, Shape>> &out) { ::sendv<0>(Shape(matrix_), out); }
+      void op(std::tuple<Out<void, Shape>> &out) { ttg::sendv<0>(Shape(matrix_), out); }
 
      private:
-      const SpMatrix<Blk> &matrix_;
+      const Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // flow data from an existing SpMatrix on rank 0
@@ -217,38 +252,38 @@ namespace ttg {
     //   but will only be efficient if can do random access (slow with CSC format used by Eigen matrices)
     // - this could be generalized to read efficiently from a distributed data structure
     // Use Read_SpMatrix if need to read all data from a data structure localized on 1 process
-    template <typename Blk = blk_t>
-    class Read : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read<Blk>, ttg::typelist<void>> {
+    template <typename Blk>
+    class Read : public TT<MultiIndex<2>, std::tuple<Out<MultiIndex<2>, Blk>>, Read<Blk>, ttg::typelist<void>> {
      public:
-      using baseT = TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read<Blk>, void>;
+      using baseT = TT<MultiIndex<2>, std::tuple<Out<MultiIndex<2>, Blk>>, Read<Blk>, void>;
       static constexpr const int owner = 0;  // where data resides
 
-      Read(const char *label, const SpMatrix<Blk> &matrix, Edge<Key<2>, void> &in, Edge<Key<2>, Blk> &out)
+      Read(const char *label, const Eigen::SparseMatrix<Blk> &matrix, Edge<MultiIndex<2>, void> &in, Edge<MultiIndex<2>, Blk> &out)
           : baseT(edges(in), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl[ij]"},
                   {std::string(label) + "[ij]"},
                   /* keymap */ [](auto key) { return owner; })
           , matrix_(matrix) {}
 
-      void op(const Key<2> &key, std::tuple<Out<Key<2>, Blk>> &out) {
+      void op(const MultiIndex<2> &key, std::tuple<Out<MultiIndex<2>, Blk>> &out) {
         // random access in CSC format is inefficient, this is only to demonstrate the way to go for hash-based storage
         // for whatever reason coeffRef does not work on a const SpMatrix&
-        ::send<0>(key, static_cast<const Blk &>(const_cast<SpMatrix<Blk> &>(matrix_).coeffRef(key[0], key[1])), out);
+        ttg::send<0>(key, static_cast<const Blk &>(const_cast<Eigen::SparseMatrix<Blk> &>(matrix_).coeffRef(key[0], key[1])), out);
       }
 
      private:
-      const SpMatrix<Blk> &matrix_;
+      const Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // WriteShape commits shape to an existing SpMatrix on rank 0 and sends it on
     // since SpMatrix supports random inserts there is no need to commit the shape into the matrix, other than get the
     // dimensions
-    template <typename Blk = blk_t>
+    template <typename Blk>
     class WriteShape : public TT<void, std::tuple<Out<void, Shape>>, WriteShape<Blk>, ttg::typelist<Shape>> {
      public:
       using baseT = typename WriteShape::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      WriteShape(const char *label, SpMatrix<Blk> &matrix, Edge<void, Shape> &in, Edge<void, Shape> &out)
+      WriteShape(const char *label, Eigen::SparseMatrix<Blk> &matrix, Edge<void, Shape> &in, Edge<void, Shape> &out)
           : baseT(edges(in), edges(out), std::string("write_spmatrix_shape(") + label + ")",
                   {std::string("shape_in[") + label + "]"}, {std::string("shape_out[") + label + "]"},
                   /* keymap */ []() { return owner; })
@@ -256,28 +291,28 @@ namespace ttg {
 
       void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<void, Shape>> &out) {
         const auto &shape = baseT::template get<0>(ins);
-        ::ttg::trace("Resizing ", static_cast<void *>(&matrix_));
+        ttg::trace("Resizing ", static_cast<void *>(&matrix_));
         matrix_.resize(shape.nrows(), shape.ncols());
-        ::sendv<0>(shape, out);
+        ttg::sendv<0>(shape, out);
       }
 
      private:
-      SpMatrix<Blk> &matrix_;
+      Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // flow (move?) data into an existing SpMatrix on rank 0
-    template <typename Blk = blk_t>
-    class Write : public TT<Key<2>, std::tuple<>, Write<Blk>, Blk, ttg::typelist<void>> {
+    template <typename Blk>
+    class Write : public TT<MultiIndex<2>, std::tuple<>, Write<Blk>, ttg::typelist<void>> {
      public:
       using baseT = typename Write::ttT;
 
-      Write(const char *label, SpMatrix<Blk> &matrix, Edge<Key<2>, Blk> &data_in, Edge<Key<2>, void> &ctl_in)
+      Write(const char *label, Eigen::SparseMatrix<Blk> &matrix, Edge<MultiIndex<2>, Blk> &data_in, Edge<MultiIndex<2>, void> &ctl_in)
           : baseT(edges(data_in, ctl_in), edges(), std::string("write_spmatrix(") + label + ")",
                   {std::string(label) + "[ij]", std::string("ctl[ij]")}, {},
                   /* keymap */ [](auto key) { return 0; })
           , matrix_(matrix) {}
 
-      void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
+      void op(const MultiIndex<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
         std::lock_guard<std::mutex> lock(mtx_);
         ttg::trace("rank =", default_execution_context().rank(),
                    "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()),
@@ -309,8 +344,8 @@ namespace ttg {
 
      private:
       std::mutex mtx_;
-      SpMatrix<Blk> &matrix_;
-      std::vector<SpMatrixTriplet<Blk>> values_;
+      Eigen::SparseMatrix<Blk> &matrix_;
+      std::vector<Triplet<Blk>> values_;
       mutable std::shared_ptr<std::shared_future<void>> completion_status_;
     };
 
@@ -325,28 +360,28 @@ namespace ttg {
                   /* keymap */ []() { return owner; }) {}
 
       void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<void, Shape>> &out) {
-        ::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out);
+        ttg::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out);
       }
     };
 
     // pushes all blocks given by the shape
-    class Push : public TT<void, std::tuple<Out<Key<2>, void>>, Push, ttg::typelist<Shape>> {
+    class Push : public TT<void, std::tuple<Out<MultiIndex<2>, void>>, Push, ttg::typelist<Shape>> {
      public:
       using baseT = typename Push::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      Push(const char *label, Edge<void, Shape> &in, Edge<Key<2>, void> &out)
+      Push(const char *label, Edge<void, Shape> &in, Edge<MultiIndex<2>, void> &out)
           : baseT(edges(in), edges(out), std::string("push_spmatrix(") + label + ")",
                   {std::string("shape[") + label + "]"}, {"ctl[ij]"},
                   /* keymap */ []() { return owner; }) {}
 
-      void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<Key<2>, void>> &out) {
+      void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<MultiIndex<2>, void>> &out) {
         const auto &shape = baseT::get<0>(ins);
         if (shape.type() == Shape::Type::col2row) {
           long colidx = 0;
           for (const auto &col : shape) {
             for (const auto rowidx : col) {
-              ::sendk<0>(Key<2>({rowidx, colidx}), out);
+              ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out);
             }
             ++colidx;
           }
@@ -354,7 +389,7 @@ namespace ttg {
           long rowidx = 0;
           for (const auto &row : shape) {
             for (const auto colidx : row) {
-              ::sendk<0>(Key<2>({rowidx, colidx}), out);
+              ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out);
             }
             ++rowidx;
           }
@@ -372,9 +407,9 @@ namespace ttg {
   class Matrix {
    public:
     using shape_t = matrix::Shape;
-    using data_edge_t = Edge<Key<2>, T>;
+    using data_edge_t = Edge<MultiIndex<2>, T>;
     using shape_edge_t = Edge<void, shape_t>;
-    using ctl_edge_t = Edge<Key<2>, void>;
+    using ctl_edge_t = Edge<MultiIndex<2>, void>;
 
     Matrix() = default;
 
@@ -403,7 +438,7 @@ namespace ttg {
     /// @return an std::future<void> object indicating the status; @c destination_matrix is ready if calling has_value()
     /// on the return value of this function is true.
     /// @note up to the user to ensure completion before reading destination_matrix
-    auto operator>>(SpMatrix<T> &destination_matrix) {
+    auto operator>>(Eigen::SparseMatrix<T> &destination_matrix) {
       // shape writer writes shape to destination_matrix
       ttg_register_ptr(world_, std::make_shared<matrix::WriteShape<T>>("Matrix.WriteShape", destination_matrix,
                                                                        shape_edge_, shape_writer_push_edge_));

From 36a309e4a70b46d7c52b20ac08a5134a3362a921 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 09:09:28 -0400
Subject: [PATCH 17/57] fixup spmm_cuda.cc

---
 examples/spmm/spmm_cuda.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
index 6597c0921..c9555fd44 100644
--- a/examples/spmm/spmm_cuda.cc
+++ b/examples/spmm/spmm_cuda.cc
@@ -1346,7 +1346,6 @@ static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap,
   a_colidx_to_rowidx[3].emplace_back(0);  // A[0][3]
 
   A.setFromTriplets(A_elements.begin(), A_elements.end());
-  std::cout << "A_elements.begin()" << A_elements.begin() << "A_elements.end()" << A_elements.end() << "\n";
 
   if (buildRefs && 0 == rank) {
     Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());

From 67606e51bc3747337274053a963d9f65ff476c0d Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 13:58:59 -0400
Subject: [PATCH 18/57] fixup DeviceTensor's move ctor/assignment to avoid use
 of copying btas::Tensor ops

---
 examples/spmm/spmm_cuda.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
index c9555fd44..2df91d147 100644
--- a/examples/spmm/spmm_cuda.cc
+++ b/examples/spmm/spmm_cuda.cc
@@ -191,7 +191,7 @@ struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
 
     DeviceTensor(DeviceTensor&& x) noexcept
     : ttvalue_type(std::move(x))
-    , tensor_type(std::move(x))
+    , tensor_type(static_cast<tensor_type&&>(x))
     /* Grrrr, moving a Tensor does not guarantee to move the pointer */
     , b((this->size() == 0 ||
          this->data() == x.b.host_ptr()) ? std::move(x.b)
@@ -237,7 +237,7 @@ struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
     /// move assignment operator
     DeviceTensor& operator=(DeviceTensor&& x) noexcept {
       ttvalue_type::operator=(std::move(x));
-      tensor_type::operator=(std::move(x));
+      tensor_type::operator=(static_cast<tensor_type&&>(x));
       if (this->size() == 0 || this->data() == x.b.host_ptr()){
         b = std::move(x.b);
       } else  {

From a36a88bce8e58f2449743b9e235ca93a6ac8dc91 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 13:59:24 -0400
Subject: [PATCH 19/57] ttg::matrix::Triplet is read-once by default

---
 examples/ttg_matrix.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/ttg_matrix.h b/examples/ttg_matrix.h
index ce563cb4a..e0d6cb3f6 100644
--- a/examples/ttg_matrix.h
+++ b/examples/ttg_matrix.h
@@ -16,10 +16,12 @@ namespace ttg {
 
   namespace matrix {
 
+    enum class ReadOnceTriplet { yes, no };
+
     /// element of a sparse matrix = {row index, col index, value}
 
-    /// movable replacement for Eigen::Triplet
-    template<typename Value, typename StorageIndex=typename Eigen::SparseMatrix<Value>::StorageIndex >
+    /// move-capable replacement for Eigen::Triplet
+    template<typename Value, ReadOnceTriplet ReadOnce = ReadOnceTriplet::yes, typename StorageIndex=typename Eigen::SparseMatrix<Value>::StorageIndex >
     class Triplet {
      public:
       Triplet() = default;
@@ -42,10 +44,15 @@ namespace ttg {
       const StorageIndex& col() const { return m_col; }
 
       /** \returns the value of the element */
-      const Value& value() const { return m_value; }
+      std::conditional_t<ReadOnce == ReadOnceTriplet::yes, Value&&, const Value&> value() const {
+        if constexpr (ReadOnce == ReadOnceTriplet::yes)
+          return std::move(m_value);
+        else
+          return m_value;
+      }
      protected:
       StorageIndex m_row = -1, m_col = -1;
-      Value m_value;
+      mutable Value m_value;
     };
 
     // matrix shape = maps {column,row} index to {row,column} indices

From c8c2b9575dd62076018292fcc90e1f11bfc18f58 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 15:17:37 -0400
Subject: [PATCH 20/57] DeviceTensor's move ctro/assignment can just move
 buffer (?)

---
 examples/spmm/spmm_cuda.cc | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
index 2df91d147..0986c7ed8 100644
--- a/examples/spmm/spmm_cuda.cc
+++ b/examples/spmm/spmm_cuda.cc
@@ -192,12 +192,7 @@ struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
     DeviceTensor(DeviceTensor&& x) noexcept
     : ttvalue_type(std::move(x))
     , tensor_type(static_cast<tensor_type&&>(x))
-    /* Grrrr, moving a Tensor does not guarantee to move the pointer */
-    , b((this->size() == 0 ||
-         this->data() == x.b.host_ptr()) ? std::move(x.b)
-                                         : ttg::Buffer<_T>(this->size() ? this->data()
-                                                                        : nullptr,
-                                                           this->size()))
+    , b(std::move(x.b))
     {
       assert(this->data() == b.host_ptr());
       //std::cout << "DeviceTensor move ctor" << std::endl;
@@ -238,12 +233,7 @@ struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
     DeviceTensor& operator=(DeviceTensor&& x) noexcept {
       ttvalue_type::operator=(std::move(x));
       tensor_type::operator=(static_cast<tensor_type&&>(x));
-      if (this->size() == 0 || this->data() == x.b.host_ptr()){
-        b = std::move(x.b);
-      } else  {
-        b = ttg::Buffer<_T>(this->size() ? this->data() : nullptr, this->size());
-      }
-      //std::swap(x.b, b);
+      b = std::move(x.b);
       //std::cout << "DeviceTensor move ctor" << std::endl;
       return *this;
     }

From 2dd5905ade7268a9a0c5bd08bc06ce6c0d32f63d Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 24 Jul 2024 15:32:53 -0400
Subject: [PATCH 21/57] ttg_parsec_data_wrapper_t's move assignment always
 cleans up after itself

---
 ttg/ttg/parsec/ttg_data_copy.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h
index a4b4575fa..5c250657d 100644
--- a/ttg/ttg/parsec/ttg_data_copy.h
+++ b/ttg/ttg/parsec/ttg_data_copy.h
@@ -570,15 +570,12 @@ namespace ttg_parsec {
     inline
     ttg_parsec_data_wrapper_t& ttg_parsec_data_wrapper_t::operator=(ttg_parsec_data_wrapper_t&& other) {
       m_data = std::move(other.m_data);
-      /* check whether the owning ttg_data_copy has already moved us */
-      if (other.m_ttg_copy != m_ttg_copy) {
-        /* remove from old ttg copy */
-        other.remove_from_owner();
+      /* remove from old ttg copy */
+      other.remove_from_owner();
 
-        if (nullptr != m_ttg_copy) {
-          /* register with the new ttg_copy */
-          m_ttg_copy->add_device_data(this);
-        }
+      if (nullptr != m_ttg_copy) {
+        /* register with the new ttg_copy */
+        m_ttg_copy->add_device_data(this);
       }
       return *this;
     }

From d0d8855adeebe05bb48fd18eb67dccbae7beb061 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 26 Jul 2024 13:25:57 -0400
Subject: [PATCH 22/57] Merge Device-SPMM into spmm.cc

This consolidates implementations and provides access to
constraints for the device-enabled SPMM.
DeviceTensor and device_gemm() have been outlined into a header file.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/CMakeLists.txt      |    6 +-
 examples/spmm/devicegemm.h   |   81 ++
 examples/spmm/devicetensor.h |  218 ++++
 examples/spmm/spmm.cc        |  135 ++-
 examples/spmm/spmm_cuda.cc   | 1961 ----------------------------------
 5 files changed, 409 insertions(+), 1992 deletions(-)
 create mode 100644 examples/spmm/devicegemm.h
 create mode 100644 examples/spmm/devicetensor.h
 delete mode 100644 examples/spmm/spmm_cuda.cc

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c34e3c07e..b5d1ff622 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,7 @@ if (TARGET tiledarray)
     add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)
 
     if (TARGET CUDA::cublas)
-      add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-cuda spmm/spmm.cc
                          LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas
                          COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
                          RUNTIMES "parsec")
@@ -32,7 +32,7 @@ if (TARGET tiledarray)
                           RUNTIMES "parsec")
       endif(TARGET CUDA::cusolver)
     elseif (TARGET roc::hipblas)
-      add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-hip spmm/spmm.cc
                           LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas
                           COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_HIP=1
                           RUNTIMES "parsec")
@@ -43,7 +43,7 @@ if (TARGET tiledarray)
                            RUNTIMES "parsec")
       endif(TARGET roc::hipsolver)
     elseif (TARGET MKL::MKL_DPCPP)
-      add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-lz spmm/spmm.cc
                           LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m
                           COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_LEVEL_ZERO=1
                           RUNTIMES "parsec")
diff --git a/examples/spmm/devicegemm.h b/examples/spmm/devicegemm.h
new file mode 100644
index 000000000..bdfc99562
--- /dev/null
+++ b/examples/spmm/devicegemm.h
@@ -0,0 +1,81 @@
+
+#if defined(TTG_ENABLE_LEVEL_ZERO)
+#include <oneapi/mkl.hpp>
+#include <sys/time.h>
+#endif
+
+#include "../devblas_helper.h"
+
+
+template <typename Blk>
+inline void device_gemm(Blk &C, const Blk &A, const Blk &B) {
+  using blk_t = Blk;
+  using T = typename blk_t::value_type;
+  static_assert(std::is_same_v<T,double> || std::is_same_v<T,float>);
+  static const T alpha = 1.0;
+  static const T beta  = 1.0;
+  // make sure all memory is on the device
+  // TODO: A and B are read-only so the owner device will be 0. How to fix?
+  //assert(A.b.get_current_device() != 0);
+  //assert(B.b.get_current_device() != 0);
+  auto device = ttg::device::current_device();
+  assert(device.is_device());
+#if defined(TTG_ENABLE_CUDA)
+  if constexpr (std::is_same_v<T,double>) {
+      cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                  C.b.current_device_ptr(), C.extent(0));
+  }
+  else if constexpr (std::is_same_v<T,float>) {
+      cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                  C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_ENABLE_HIP)
+  if constexpr (std::is_same_v<T,double>) {
+    hipblasDgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  } else if constexpr (std::is_same_v<T,float>) {
+    hipblasSgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_ENABLE_LEVEL_ZERO)
+
+#if defined(DEBUG_SYNCHRONOUS)
+  try {
+#endif /* DEBUG_SYNCHRONOUS */
+    cl::sycl::event gemm_event;
+    gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(),
+         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
+         C.extent(0), C.extent(1), A.extent(1),
+         alpha, A.b.current_device_ptr(), A.extent(0),
+                B.b.current_device_ptr(), B.extent(0),
+         beta,  C.b.current_device_ptr(), C.extent(0));
+#if defined(DEBUG_SYNCHRONOUS)
+    gemm_event.wait();
+  } catch (const oneapi::mkl::invalid_argument &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl;
+  } catch (const oneapi::mkl::unsupported_device &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl;
+  } catch (const oneapi::mkl::host_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::device_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::unimplemented &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl;
+  } catch (...) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl;
+  }
+#endif /* DEBUG_SYNCHRONOUS */
+#endif
+}
diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h
new file mode 100644
index 000000000..4d0a7f1cf
--- /dev/null
+++ b/examples/spmm/devicetensor.h
@@ -0,0 +1,218 @@
+#ifndef HAVE_DEVICETENSOR_H
+#define HAVE_DEVICETENSOR_H
+
+#include <ttg.h>
+
+#if __has_include(<btas/features.h>)
+#pragma message("C Preprocessor got here!")
+#include <btas/features.h>
+#ifdef BTAS_IS_USABLE
+#include <btas/btas.h>
+#include <btas/optimize/contract.h>
+#include <btas/util/mohndle.h>
+#include <TiledArray/device/allocators.h>
+#include "../devblas_helper.h"
+#include <madness/world/parsec.h>  // need to initialize MADNESS purely for the purposes of TA allocators
+#else
+#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost"
+#endif
+#endif
+
+#if defined(BTAS_IS_USABLE)
+
+/**
+ * Derives from btas::Tensor and wraps a ttg::Buffer
+ * to enable device support in SPMM. The ttg::Buffer
+ * does not own the host memory but mananages the device
+ * memory.
+ */
+template <typename _T, class _Range, class _Storage>
+struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
+                    , public btas::Tensor<_T, _Range, _Storage> {
+  using tensor_type = typename btas::Tensor<_T, _Range, _Storage>;
+  using ttvalue_type = typename ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>;
+  ttg::Buffer<_T> b; // does not own the host buffer
+
+  using value_type = typename tensor_type::value_type;
+  using size_type = typename tensor_type::size_type;
+  using storage_type = typename tensor_type::storage_type;
+  using range_type = typename tensor_type::range_type;
+
+
+   public:
+    DeviceTensor() = default;
+    ~DeviceTensor() = default;
+
+    /// constructor with index extent
+    template <typename... _args>
+    explicit DeviceTensor(const size_type& first, const _args&... rest)
+    : ttvalue_type()
+    , tensor_type(first, rest...)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range, allocate data, but not initialized
+    template <typename Range>
+    explicit DeviceTensor(const Range& range, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range object, set all elements to \c v
+    template <typename Range>
+    DeviceTensor(const Range& range, value_type v, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range object, copy elements from \c vec
+    template <typename Range, typename U>
+    DeviceTensor(const Range& range, U* vec, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, vec)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range and \c storage
+    template <typename Range, typename Storage>
+    DeviceTensor(const Range& range, const Storage& storage,
+           typename std::enable_if<btas::is_boxrange<Range>::value & not std::is_same<Range, range_type>::value &
+                                   not std::is_same<Storage, storage_type>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, storage)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy-copy-construct from \c range and \c storage
+    DeviceTensor(const range_type& range, const storage_type& storage)
+    : ttvalue_type()
+    , tensor_type(range, storage)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy-move-construct from \c range and \c storage
+    DeviceTensor(const range_type& range, storage_type&& storage)
+    : ttvalue_type()
+    , tensor_type(range, std::forward<storage_type>(storage))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// move-construct from \c range and \c storage
+    DeviceTensor(range_type&& range, storage_type&& storage)
+    : ttvalue_type()
+    , tensor_type(std::forward<range_type>(range), std::forward<storage_type>(storage))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// Construct an evaluated tensor
+
+    /// This constructor will allocate memory for \c range.area() elements. Each element
+    /// will be initialized as:
+    /// \code
+    ///   for(auto&& idx: range)
+    ///     (*this)[idx] = op(*(it++));
+    /// \endcode
+    /// \tparam Range An input Range type.
+    /// \tparam InIter An input iterator type.
+    /// \tparam Op A unary operation type
+    /// \param range the input range type
+    /// \param first An input iterator for the argument
+    /// \param op The unary operation to be applied to the argument data
+    template <typename Range, typename InIter, typename Op>
+    DeviceTensor(const Range& range, InIter it, const Op& op,
+           typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, it, op)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy constructor
+    /// It will accept Tensors and TensorViews
+    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type>
+    DeviceTensor(const _Tensor& x) noexcept
+    : ttvalue_type()
+    , tensor_type(x.clone())
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl;
+    }
+
+    /// copy constructor: devicebuf cannot be copied, so deleted
+    DeviceTensor(const DeviceTensor& x) noexcept
+    : ttvalue_type(x)
+    , tensor_type(x.clone())
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor copy ctor" << std::endl;
+    }
+
+    /// move constructor
+    DeviceTensor(tensor_type&& x) noexcept
+    : ttvalue_type()
+    , tensor_type(std::move(x))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor tensor_type move ctor" << std::endl;
+    }
+
+    DeviceTensor(DeviceTensor&& x) noexcept
+    : ttvalue_type(std::move(x))
+    , tensor_type(static_cast<tensor_type&&>(x))
+    , b(std::move(x.b))
+    {
+      assert(this->data() == b.host_ptr());
+      //std::cout << "DeviceTensor move ctor" << std::endl;
+    }
+
+    /// copy assignment operator
+    template <class _Tensor, class = typename std::enable_if<
+                                 btas::is_boxtensor<_Tensor>::value &&
+                                 not std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
+    DeviceTensor& operator=(const _Tensor& x) noexcept {
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
+      return *this;
+    }
+
+    /// copy assignment operator
+    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type,
+              class = typename std::enable_if<
+                  std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
+    DeviceTensor& operator=(const _Tensor& x) noexcept {
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
+      return *this;
+    }
+
+    /// copy assignment: devicebuf cannot be copied, deleted
+    DeviceTensor& operator=(const DeviceTensor& x) noexcept {
+      ttvalue_type::operator=(x);
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor copy operator" << std::endl;
+      return *this;
+    }
+
+    /// move assignment operator
+    DeviceTensor& operator=(DeviceTensor&& x) noexcept {
+      ttvalue_type::operator=(std::move(x));
+      tensor_type::operator=(static_cast<tensor_type&&>(x));
+      b = std::move(x.b);
+      //std::cout << "DeviceTensor move ctor" << std::endl;
+      return *this;
+    }
+
+    using tensor_type::begin;
+    using tensor_type::cbegin;
+    using tensor_type::end;
+    using tensor_type::cend;
+
+};
+
+#endif // defined(BTAS_IS_USABLE)
+
+#endif // HAVE_DEVICETENSOR_H
\ No newline at end of file
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index c09e908fb..51fb39d5f 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -40,12 +40,34 @@ using namespace ttg;
 
 #include "ttg/util/bug.h"
 
+#if defined(TTG_ENABLE_CUDA)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::CUDA;
+#elif defined(TTG_ENABLE_HIP)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::HIP;
+#elif defined(TTG_ENABLE_LEVEL_ZERO)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::L0;
+#else
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::Host;
+#endif
+
+/* set to true to automatically release constraints
+ * this removes the ability to control the window
+ * size and is equal to a window size of 1 */
 #define USE_AUTO_CONSTRAINT false
 
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 using scalar_t = double;
-using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
 
+#if HAVE_SPMM_DEVICE
+using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
+                           btas::mohndle<btas::varray<scalar_t, TiledArray::device_pinned_allocator<scalar_t>>,
+                                         btas::Handle::shared_ptr>>;
+#else  // HAVE_SPMM_DEVICE
+using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
+#endif // HAVE_SPMM_DEVICE
 //#include <atomic>
 //static std::atomic<uint64_t> reduce_count = 0;
 
@@ -272,7 +294,9 @@ class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg:
 
 /// @tparam KeyMap2 maps {i,j} to processor
 /// @tparam KeyMap3 maps {i,j,k} to processor
-template <typename Keymap2 = std::function<int(const Key<2> &)>, typename Keymap3 = std::function<int(const Key<3> &)>,
+template <ttg::ExecutionSpace Space = space,
+          typename Keymap2 = std::function<int(const Key<2> &)>,
+          typename Keymap3 = std::function<int(const Key<3> &)>,
           typename Blk = blk_t>
 class SpMM25D {
  public:
@@ -305,9 +329,9 @@ class SpMM25D {
     // add constraint with external mapper: key[0] represents `k`
     bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
-    multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
-                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
-                                                 k_cnt_, parallel_bcasts_);
+    multiplyadd_ = std::make_unique<MultiplyAdd<Space>>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
+                                                        b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
+                                                        k_cnt_, parallel_bcasts_);
 
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
@@ -543,8 +567,33 @@ class SpMM25D {
 
   /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the
   /// 3-D process grid only
-  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd,
+  template<ttg::ExecutionSpace Space_>
+  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd<Space_>,
                                 ttg::typelist<const Blk, const Blk, Blk>> {
+    static constexpr const bool is_device_space = (Space_ != ttg::ExecutionSpace::Host);
+    using task_return_type = std::conditional_t<is_device_space, ttg::device::Task, void>;
+    /* communicate to the runtime which device we support (if any) */
+    static constexpr bool have_cuda_op = (Space_ == ttg::ExecutionSpace::CUDA);
+    static constexpr bool have_hip_op  = (Space_ == ttg::ExecutionSpace::HIP);
+    static constexpr bool have_level_zero_op = (Space_ == ttg::ExecutionSpace::L0);
+
+    void release_next_k(long k) {
+      assert(k_cnt_.size() > k);
+      long cnt = k_cnt_[k].fetch_sub(1, std::memory_order_relaxed)-1;
+      assert(cnt >= 0);
+      if (0 == cnt) {
+        auto release_k = k;
+        auto bcasts_ahead = parallel_bcasts_;
+        // this was the last gemm in this k, find the next one to release
+        while (++release_k < k_cnt_.size() &&
+                (0 == k_cnt_[release_k].load(std::memory_order_relaxed)
+                || --bcasts_ahead > 0))
+        { }
+        constraint->release(release_k);
+      }
+    }
+
+
    public:
     using baseT = typename MultiplyAdd::ttT;
 
@@ -563,7 +612,13 @@ class SpMM25D {
         , constraint(std::move(constraint))
         , parallel_bcasts_(parallel_bcasts) {
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
-
+      if constexpr (is_device_space) {
+        auto num_devices = ttg::device::num_devices();
+        this->set_devicemap(
+          [num_devices](const Key<3> &ijk){
+            return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
+          });
+      }
       // for each {i,j} determine first k that contributes AND belongs to this node,
       // initialize input {i,j,first_k} flow to 0
       for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
@@ -591,8 +646,8 @@ class SpMM25D {
       }
     }
 
-    void op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
-            std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
+    task_return_type op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
+                        std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
       const auto i = ijk[0];
       const auto j = ijk[1];
       const auto k = ijk[2];  // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1
@@ -606,36 +661,52 @@ class SpMM25D {
                  i, "][", j, "]  += A[", i, "][", k, "] by B[", k, "][", j, "],  next_k? ",
                  (have_next_k ? std::to_string(next_k) : "does not exist"));
       // release the constraint on the next round of broadcasts
-      {
-        assert(k_cnt_.size() > k);
-        long cnt = k_cnt_[k].fetch_sub(1, std::memory_order_relaxed)-1;
-        assert(cnt >= 0);
-        if (0 == cnt) {
-          auto release_k = k;
-          auto bcasts_ahead = parallel_bcasts_;
-          // this was the last gemm in this k, find the next one to release
-          while (++release_k < k_cnt_.size() &&
-                 (0 == k_cnt_[release_k].load(std::memory_order_relaxed)
-                  || --bcasts_ahead > 0))
-          { }
-          constraint->release(release_k);
-        }
+      release_next_k(k);
+      const blk_t& A = baseT::template get<0>(_ijk);
+      const blk_t& B = baseT::template get<1>(_ijk);
+      blk_t& C = baseT::template get<2>(_ijk);
+
+      if (C.empty()) {
+        C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0);
       }
 
+#ifdef HAVE_SPMM_DEVICE
+      /* pull all buffers onto the device */
+      co_await ttg::device::select(A.b, B.b, C.b);
+
+      /* everything is on the device, call the gemm */
+      device_gemm(C, A, B);
+
+      // pass the running total to the next flow, if needed
+      // otherwise write to the result flow
+      if (have_next_k) {
+        co_await ttg::device::forward(ttg::device::send<1>(
+                                                Key<3>({i, j, next_k}),
+                                                std::move(C),
+                                                result));
+      } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
+                // belongs
+        co_await ttg::device::forward(ttg::device::send<0>(
+                                                Key<2>({i, j}),
+                                                std::move(C),
+                                                result));
+      }
+#else  // HAVE_SPMM_DEVICE
       // compute the contrib, pass the running total to the next flow, if needed
       // otherwise write to the result flow
       if (have_next_k) {
         ::send<1>(
             Key<3>({i, j, next_k}),
-            gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)),
+            gemm(std::move(C), A, B),
             result);
       } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
                 // belongs
         ::send<0>(
             Key<2>({i, j}),
-            gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)),
+            gemm(std::move(C), A, B),
             result);
       }
+#endif // HAVE_SPMM_DEVICE
     }
 
    private:
@@ -787,7 +858,7 @@ class SpMM25D {
   std::unique_ptr<LocalBcastA> local_bcast_a_;
   std::unique_ptr<BcastB> bcast_b_;
   std::unique_ptr<LocalBcastB> local_bcast_b_;
-  std::unique_ptr<MultiplyAdd> multiplyadd_;
+  std::unique_ptr<MultiplyAdd<Space>> multiplyadd_;
   std::unique_ptr<ReduceC> reduce_c_;
   std::vector<std::atomic<std::size_t>> k_cnt_;
   Keymap2 ij_keymap_;
@@ -1418,12 +1489,12 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
   std::string rt("Unkown???");
 #endif
   if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M
+    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " " << ttg::device::num_devices()
+              << " average_NB= " << avg_nb << " M= " << M
               << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type
               << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc
               << " gflops/s= " << gflops / tc << std::endl;
   }
-  //std::cout << "num reductions " << reduce_count.load() << " tiles " << MT*KT << std::endl;
 }
 
 #if !defined(BLOCK_SPARSE_GEMM)
@@ -1722,9 +1793,17 @@ int main(int argc, char **argv) {
         // Start up engine
         execute();
         for (int nrun = 0; nrun < nb_runs; nrun++) {
+#if TTG_USE_PARSEC
+          /* flush all PaRSEC memory */
+          parsec_devices_release_memory();
+#endif // TTG_USE_PARSEC
           timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
                             a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, mTiles,
                             nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts);
+#if TTG_USE_PARSEC
+          /* reset PaRSEC's load tracking */
+          parsec_devices_reset_load(default_execution_context().impl().context());
+#endif // TTG_USE_PARSEC
         }
       } else {
         // flow graph needs to exist on every node
diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
deleted file mode 100644
index acdf6c1e2..000000000
--- a/examples/spmm/spmm_cuda.cc
+++ /dev/null
@@ -1,1961 +0,0 @@
-
-#include <Eigen/SparseCore>
-#include <algorithm>
-#include <array>
-#include <chrono>
-#include <iostream>
-#include <random>
-#include <thread>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <madness/world/world.h>
-
-#if __has_include(<btas/features.h>)
-#pragma message("C Preprocessor got here!")
-#include <btas/features.h>
-#ifdef BTAS_IS_USABLE
-#include <btas/btas.h>
-#include <btas/optimize/contract.h>
-#include <btas/util/mohndle.h>
-#include <TiledArray/device/allocators.h>
-#include "../devblas_helper.h"
-#include <madness/world/parsec.h>  // need to initialize MADNESS purely for the purposes of TA allocators
-#else
-#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost"
-#endif
-#endif
-
-#include <sys/time.h>
-#include <boost/graph/rmat_graph_generator.hpp>
-#if !defined(BLOCK_SPARSE_GEMM)
-#include <boost/graph/directed_graph.hpp>
-#include <boost/random/linear_congruential.hpp>
-#include <unsupported/Eigen/SparseExtra>
-#endif
-
-#include "ttg.h"
-
-#include "../devblas_helper.h"
-
-using namespace ttg;
-
-#include "ttg/util/future.h"
-
-#include "ttg/util/multiindex.h"
-
-#include "ttg/util/bug.h"
-
-#include "ttg/serialization/std/pair.h"
-
-#if defined(TTG_ENABLE_LEVEL_ZERO)
-#include <oneapi/mkl.hpp>
-#include <sys/time.h>
-#endif
-
-#include "../ttg_matrix.h"
-
-#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
-
-template <typename _T, class _Range, class _Storage>
-struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
-                    , public btas::Tensor<_T, _Range, _Storage> {
-  using tensor_type = typename btas::Tensor<_T, _Range, _Storage>;
-  using ttvalue_type = typename ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>;
-  ttg::Buffer<_T> b; // does not own the host buffer
-
-  using value_type = typename tensor_type::value_type;
-  using size_type = typename tensor_type::size_type;
-  using storage_type = typename tensor_type::storage_type;
-  using range_type = typename tensor_type::range_type;
-
-
-   public:
-    DeviceTensor() = default;
-    ~DeviceTensor() = default;
-
-    /// constructor with index extent
-    template <typename... _args>
-    explicit DeviceTensor(const size_type& first, const _args&... rest)
-    : ttvalue_type()
-    , tensor_type(first, rest...)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range, allocate data, but not initialized
-    template <typename Range>
-    explicit DeviceTensor(const Range& range, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range object, set all elements to \c v
-    template <typename Range>
-    DeviceTensor(const Range& range, value_type v, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range object, copy elements from \c vec
-    template <typename Range, typename U>
-    DeviceTensor(const Range& range, U* vec, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, vec)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range and \c storage
-    template <typename Range, typename Storage>
-    DeviceTensor(const Range& range, const Storage& storage,
-           typename std::enable_if<btas::is_boxrange<Range>::value & not std::is_same<Range, range_type>::value &
-                                   not std::is_same<Storage, storage_type>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, storage)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy-copy-construct from \c range and \c storage
-    DeviceTensor(const range_type& range, const storage_type& storage)
-    : ttvalue_type()
-    , tensor_type(range, storage)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy-move-construct from \c range and \c storage
-    DeviceTensor(const range_type& range, storage_type&& storage)
-    : ttvalue_type()
-    , tensor_type(range, std::forward<storage_type>(storage))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// move-construct from \c range and \c storage
-    DeviceTensor(range_type&& range, storage_type&& storage)
-    : ttvalue_type()
-    , tensor_type(std::forward<range_type>(range), std::forward<storage_type>(storage))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// Construct an evaluated tensor
-
-    /// This constructor will allocate memory for \c range.area() elements. Each element
-    /// will be initialized as:
-    /// \code
-    ///   for(auto&& idx: range)
-    ///     (*this)[idx] = op(*(it++));
-    /// \endcode
-    /// \tparam Range An input Range type.
-    /// \tparam InIter An input iterator type.
-    /// \tparam Op A unary operation type
-    /// \param range the input range type
-    /// \param first An input iterator for the argument
-    /// \param op The unary operation to be applied to the argument data
-    template <typename Range, typename InIter, typename Op>
-    DeviceTensor(const Range& range, InIter it, const Op& op,
-           typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, it, op)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy constructor
-    /// It will accept Tensors and TensorViews
-    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type>
-    DeviceTensor(const _Tensor& x) noexcept
-    : ttvalue_type()
-    , tensor_type(x.clone())
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl;
-    }
-
-    /// copy constructor: devicebuf cannot be copied, so deleted
-    DeviceTensor(const DeviceTensor& x) noexcept
-    : ttvalue_type(x)
-    , tensor_type(x.clone())
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor copy ctor" << std::endl;
-    }
-
-    /// move constructor
-    DeviceTensor(tensor_type&& x) noexcept
-    : ttvalue_type()
-    , tensor_type(std::move(x))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor tensor_type move ctor" << std::endl;
-    }
-
-    DeviceTensor(DeviceTensor&& x) noexcept
-    : ttvalue_type(std::move(x))
-    , tensor_type(static_cast<tensor_type&&>(x))
-    , b(std::move(x.b))
-    {
-      assert(this->data() == b.host_ptr());
-      //std::cout << "DeviceTensor move ctor" << std::endl;
-    }
-
-    /// copy assignment operator
-    template <class _Tensor, class = typename std::enable_if<
-                                 btas::is_boxtensor<_Tensor>::value &&
-                                 not std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
-    DeviceTensor& operator=(const _Tensor& x) noexcept {
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
-      return *this;
-    }
-
-    /// copy assignment operator
-    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type,
-              class = typename std::enable_if<
-                  std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
-    DeviceTensor& operator=(const _Tensor& x) noexcept {
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
-      return *this;
-    }
-
-    /// copy assignment: devicebuf cannot be copied, deleted
-    DeviceTensor& operator=(const DeviceTensor& x) noexcept {
-      ttvalue_type::operator=(x);
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor copy operator" << std::endl;
-      return *this;
-    }
-
-    /// move assignment operator
-    DeviceTensor& operator=(DeviceTensor&& x) noexcept {
-      ttvalue_type::operator=(std::move(x));
-      tensor_type::operator=(static_cast<tensor_type&&>(x));
-      b = std::move(x.b);
-      //std::cout << "DeviceTensor move ctor" << std::endl;
-      return *this;
-    }
-
-    using tensor_type::begin;
-    using tensor_type::cbegin;
-    using tensor_type::end;
-    using tensor_type::cend;
-
-};
-
-using scalar_t = double;
-#if defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP)
-using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t, TiledArray::device_pinned_allocator<scalar_t>>,
-                                         btas::Handle::shared_ptr>>;
-#else
-using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
-#endif
-
-
-//inline blk_t operator*(const blk_t &A, const blk_t &B) {
-//  blk_t::tensor_type c;
-//  btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, c, {1, 3});
-//  return blk_t(std::move(c));
-//}
-
-/* TODO: call CUDA gemm here */
-template <typename Blk>
-static void device_gemm(Blk &C, const Blk &A, const Blk &B) {
-  using blk_t = Blk;
-  using T = typename blk_t::value_type;
-  static_assert(std::is_same_v<T,double> || std::is_same_v<T,float>);
-  static const T alpha = 1.0;
-  static const T beta  = 1.0;
-  // make sure all memory is on the device
-  // TODO: A and B are read-only so the owner device will be 0. How to fix?
-  //assert(A.b.get_current_device() != 0);
-  //assert(B.b.get_current_device() != 0);
-  auto device = ttg::device::current_device();
-  assert(device.is_device());
-#if defined(TTG_ENABLE_CUDA)
-  if constexpr (std::is_same_v<T,double>) {
-      cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
-                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
-                  C.b.current_device_ptr(), C.extent(0));
-  }
-  else if constexpr (std::is_same_v<T,float>) {
-      cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
-                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
-                  C.b.current_device_ptr(), C.extent(0));
-  }
-#elif defined(TTG_ENABLE_HIP)
-  if constexpr (std::is_same_v<T,double>) {
-    hipblasDgemm(hipblas_handle(),
-                 HIPBLAS_OP_N, HIPBLAS_OP_N,
-                 C.extent(0), C.extent(1), A.extent(1), &alpha,
-                 A.b.current_device_ptr(), A.extent(0),
-                 B.b.current_device_ptr(), B.extent(0), &beta,
-                 C.b.current_device_ptr(), C.extent(0));
-  } else if constexpr (std::is_same_v<T,float>) {
-    hipblasSgemm(hipblas_handle(),
-                 HIPBLAS_OP_N, HIPBLAS_OP_N,
-                 C.extent(0), C.extent(1), A.extent(1), &alpha,
-                 A.b.current_device_ptr(), A.extent(0),
-                 B.b.current_device_ptr(), B.extent(0), &beta,
-                 C.b.current_device_ptr(), C.extent(0));
-  }
-#elif defined(TTG_ENABLE_LEVEL_ZERO)
-
-#if defined(DEBUG_SYNCHRONOUS)
-  try {
-#endif /* DEBUG_SYNCHRONOUS */
-    cl::sycl::event gemm_event;
-    gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(),
-         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
-         C.extent(0), C.extent(1), A.extent(1),
-         alpha, A.b.current_device_ptr(), A.extent(0),
-                B.b.current_device_ptr(), B.extent(0),
-         beta,  C.b.current_device_ptr(), C.extent(0));
-#if defined(DEBUG_SYNCHRONOUS)
-    gemm_event.wait();
-  } catch (const oneapi::mkl::invalid_argument &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl;
-  } catch (const oneapi::mkl::unsupported_device &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl;
-  } catch (const oneapi::mkl::host_bad_alloc &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl;
-  } catch (const oneapi::mkl::device_bad_alloc &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl;
-  } catch (const oneapi::mkl::unimplemented &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl;
-  } catch (const std::exception& e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl;
-  } catch (...) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl;
-  }
-#endif /* DEBUG_SYNCHRONOUS */
-#endif
-}
-
-#if defined(TTG_USE_PARSEC)
-namespace ttg {
-  template <>
-  struct SplitMetadataDescriptor<blk_t> {
-    // TODO: this is a quick and dirty approach.
-    //   - blk_t could have any number of dimensions, this code only works for 2 dim blocks
-    //   - we use Blk{} to send a control flow in some tasks below, these blocks have only
-    //     1 dimension (of size 0), to code this, we set the second dimension to 0 in our
-    //     quick and dirty linearization, then have a case when we create the object
-    //   - when we create the object with the metadata, we use a constructor that initializes
-    //     the data to 0, which is useless: the data could be left uninitialized
-    static auto get_metadata(const blk_t &b) {
-      std::pair<int, int> dim{0, 0};
-      if (!b.empty()) {
-        assert(b.range().extent().size() == 2);
-        std::get<0>(dim) = (int)b.range().extent(0);
-        std::get<1>(dim) = (int)b.range().extent(1);
-      }
-      return dim;
-    }
-    static auto get_data(blk_t &b) {
-      using T = typename blk_t::value_type;
-      if (!b.empty())
-        return boost::container::small_vector<iovec, 1>(1, iovec{b.size() * sizeof(T), b.data()});
-      else
-        return boost::container::small_vector<iovec, 1>{};
-    }
-    static auto create_from_metadata(const std::pair<int, int> &meta) {
-      if (meta != std::pair{0, 0}) // N.B. allocate only, do not fill with zeroes
-        return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta)));
-      else
-        return blk_t{};
-    }
-  };
-}  // namespace ttg
-#endif /* TTG_USE_PARSEC */
-
-// declare btas::Tensor serializable by Boost
-#include "ttg/serialization/backends/boost.h"
-namespace ttg::detail {
-  // BTAS defines all of its Boost serializers in boost::serialization namespace ... as explained in
-  // ttg/serialization/boost.h such functions are not detectable via SFINAE, so must explicitly define serialization
-  // traits here
-  template <typename Archive>
-  inline static constexpr bool is_boost_serializable_v<Archive, blk_t> = is_boost_archive_v<Archive>;
-  template <typename Archive>
-  inline static constexpr bool is_boost_serializable_v<Archive, const blk_t> = is_boost_archive_v<Archive>;
-}  // namespace ttg::detail
-
-#else
-using blk_t = double;
-#endif
-template <typename T = blk_t>
-using SpMatrix = Eigen::SparseMatrix<T>;
-template <typename T = blk_t>
-using SpMatrixTriplet = ttg::matrix::Triplet<T>;  // {row,col,value}
-
-#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
-
-#if __has_include(<madness/world/archive.h>)
-
-#include <madness/world/archive.h>
-
-#endif  // __has_include(<madness/world/archive.h>)
-
-namespace btas {
-  template <typename T_, class Range_, class Store_>
-  inline btas::Tensor<T_, Range_, Store_> operator*(const btas::Tensor<T_, Range_, Store_> &A,
-                                                    const btas::Tensor<T_, Range_, Store_> &B) {
-    btas::Tensor<T_, Range_, Store_> C;
-    btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, C, {1, 3});
-    return C;
-  }
-
-  template <typename T_, class Range_, class Store_>
-  btas::Tensor<T_, Range_, Store_> gemm(btas::Tensor<T_, Range_, Store_> &&C, const btas::Tensor<T_, Range_, Store_> &A,
-                                        const btas::Tensor<T_, Range_, Store_> &B) {
-    using array = btas::DEFAULT::index<int>;
-    if (C.empty()) {  // first contribution to C = allocate it and gemm with beta=0
-      C = btas::Tensor<T_, Range_, Store_>(btas::Range(A.range().extent(0), B.range().extent(1)));
-      btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 0.0, C, array{1, 3}, false, false);
-    }
-    else {   // subsequent contributions to C = gemm with beta=1
-      btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false);
-    }
-    return std::move(C);
-  }
-}  // namespace btas
-#endif  // BTAS_IS_USABLE
-double gemm(double C, double A, double B) { return C + A * B; }
-
-// template <typename _Scalar, int _Options, typename _StorageIndex>
-// struct colmajor_layout;
-// template <typename _Scalar, typename _StorageIndex>
-// struct colmajor_layout<_Scalar, Eigen::ColMajor, _StorageIndex> : public std::true_type {};
-// template <typename _Scalar, typename _StorageIndex>
-// struct colmajor_layout<_Scalar, Eigen::RowMajor, _StorageIndex> : public std::false_type {};
-
-template <std::size_t Rank>
-using Key = MultiIndex<Rank>;
-
-/// maps {i,j} to rank within first (R=0) layer of the 3-d process grid
-inline int ij2rank(int i, int j, int P, int Q) {
-  std::vector<int> vec;
-  int p = (i % P);
-  int q = (j % Q);
-  int rank = (q * P) + p;
-  return rank;
-}
-
-/// maps {i,j,k} to rank within a 3-d process grid
-inline int ijk2rank(int i, int j, int k, int P, int Q, int R) {
-  std::vector<int> vec;
-  int p = (i % P);
-  int q = (j % Q);
-  int l = (k % R);
-  int rank = (l * P * Q) + (q * P) + p;
-  return rank;
-}
-
-// flow data from an existing SpMatrix on rank 0
-template <typename Blk = blk_t, typename Keymap = std::function<int(const Key<2> &)>>
-class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMatrix<Blk, Keymap>, ttg::typelist<void>> {
- public:
-  using baseT = typename Read_SpMatrix::ttT;
-  Read_SpMatrix(const char *label, const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out,
-                Keymap &ij_keymap)
-      : baseT(edges(ctl), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl"}, {std::string(label) + "ij"},
-              ij_keymap)
-      , matrix_(matrix) {}
-
-  void op(const Key<2> &, std::tuple<Out<Key<2>, Blk>> &out) {
-    auto rank = ttg::default_execution_context().rank();
-    for (int k = 0; k < matrix_.outerSize(); ++k) {
-      for (typename SpMatrix<Blk>::InnerIterator it(matrix_, k); it; ++it) {
-        if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({it.row(), it.col()}))))
-          ::send<0>(Key<2>(std::initializer_list<long>({it.row(), it.col()})), ttg::persistent(it.value()), out);
-      }
-    }
-  }
-
- private:
-  const SpMatrix<Blk> &matrix_;
-};
-
-// flow (move?) data into an existing SpMatrix on rank 0
-template <typename Blk = blk_t>
-class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg::typelist<Blk>> {
- public:
-  using baseT = typename Write_SpMatrix::ttT;
-
-  template <typename Keymap2>
-  Write_SpMatrix(SpMatrix<Blk> &matrix, Edge<Key<2>, Blk> &in, Keymap2 &&ij_keymap, bool write_back = false)
-      : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap)
-      , matrix_(matrix)
-      , write_back(write_back)
-  { }
-
-  void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) {
-
-    if (write_back) {
-      std::lock_guard<std::mutex> lock(mtx_);
-      ttg::trace("rank =", default_execution_context().rank(),
-                 "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()), "spmm.cc Write_SpMatrix wrote {",
-                 key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast<void *>(&matrix_),
-                 " with mutex @", static_cast<void *>(&mtx_), " for object @", static_cast<void *>(this));
-      values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem)));
-    }
-  }
-
-  /// grab completion status as a future<void>
-  /// \note cannot be called once this is executable
-  const std::shared_future<void> &status() const {
-    assert(!this->is_executable());
-    if (!completion_status_) {  // if not done yet, register completion work with the world
-      auto promise = std::make_shared<std::promise<void>>();
-      completion_status_ = std::make_shared<std::shared_future<void>>(promise->get_future());
-      ttg_register_status(this->get_world(), std::move(promise));
-      ttg_register_callback(this->get_world(),
-                            [this]() { this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); });
-    } else {  // if done already, commit the result
-      this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end());
-    }
-    return *completion_status_;
-  }
-
- private:
-  std::mutex mtx_;
-  SpMatrix<Blk> &matrix_;
-  std::vector<SpMatrixTriplet<Blk>> values_;
-  mutable std::shared_ptr<std::shared_future<void>> completion_status_;
-  bool write_back = false;
-};
-
-/// sparse mm via 2.5D SUMMA
-
-/// @tparam KeyMap2 maps {i,j} to processor
-/// @tparam KeyMap3 maps {i,j,k} to processor
-template <typename Keymap2 = std::function<int(const Key<2> &)>, typename Keymap3 = std::function<int(const Key<3> &)>,
-          typename Blk = blk_t>
-class SpMM25D {
- public:
-  /// @param ij_keymap maps {i,j} to process, specifies distribution of tiles of A, B, and C
-  /// @param ijk_keymap maps {i,j,k} to process, controls distribution of tasks performing C[i][j] += A[i][k]*B[k][j]
-  /// @param R the number of "layers" in the 3-D process grid
-  SpMM25D(Edge<Key<2>, Blk> &a, Edge<Key<2>, Blk> &b, Edge<Key<2>, Blk> &c, const SpMatrix<Blk> &a_mat,
-          const SpMatrix<Blk> &b_mat, const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-          const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-          const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-          const std::vector<std::vector<long>> &b_colidx_to_rowidx, const std::vector<int> &mTiles,
-          const std::vector<int> &nTiles, const std::vector<int> &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, long R)
-      : a_rowidx_to_colidx_(a_rowidx_to_colidx)
-      , b_colidx_to_rowidx_(b_colidx_to_rowidx)
-      , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-      , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-      , ij_keymap_(std::move(ij_keymap))
-      , ijk_keymap_(std::move(ijk_keymap)) {
-    bcast_a_ = std::make_unique<BcastA>(a, local_a_ijk_, b_rowidx_to_colidx_, ij_keymap_, ijk_keymap_);
-    local_bcast_a_ = std::make_unique<LocalBcastA>(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, ijk_keymap_);
-    bcast_b_ = std::make_unique<BcastB>(b, local_b_ijk_, a_colidx_to_rowidx_, ij_keymap_, ijk_keymap_);
-    local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, ijk_keymap_);
-    multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_rowidx_to_colidx_,
-                                                 b_colidx_to_rowidx_, mTiles, nTiles, ijk_keymap_);
-    reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
-    reduce_c_->template set_input_reducer<0>([](Blk &c_ij, const Blk &c_ij_p) { c_ij = c_ij + c_ij_p; });
-    // compute how many contributions each C[i][j] should expect ... MultiplyAdd already does this, but need a way to
-    // send message from each process p to the process owning C[i][j] to expect a contribution from it for now replicate
-    // this logic ...
-    // TODO: do this in MultiplyAdd (need to allreduce this info so that everyone has it)
-    // N.B. only need to set stream size on the rank that will accumulate the C[i][j] contribution
-    const auto my_rank = ttg::default_execution_context().rank();
-    for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) {
-      if (a_rowidx_to_colidx_[i].empty()) continue;
-      for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) {
-        if (b_colidx_to_rowidx_[j].empty()) continue;
-
-        if (ij_keymap_(Key<2>{i, j}) == my_rank) {
-          decltype(i) k;
-          bool have_k;
-          std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j);
-          std::vector<bool> c_ij_procmask(R, false);
-          if (have_k) {
-            const auto pR = k % R;  // k values are distributed round-robin among the layers of the 3-D grid
-            assert(pR < c_ij_procmask.size());
-            c_ij_procmask[pR] = true;
-            while (have_k) {
-              std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k);
-              if (have_k) {
-                const auto pR = k % R;
-                assert(pR < c_ij_procmask.size());
-                c_ij_procmask[pR] = true;
-              }
-            }
-          }
-          const auto c_ij_nprocs = std::count_if(c_ij_procmask.begin(), c_ij_procmask.end(), [](bool b) { return b; });
-          if (c_ij_nprocs > 0) reduce_c_->template set_argstream_size<0>(Key<2>{i, j}, c_ij_nprocs);
-        }
-      }
-    }
-
-    TTGUNUSED(bcast_a_);
-    TTGUNUSED(bcast_b_);
-    TTGUNUSED(multiplyadd_);
-    TTGUNUSED(reduce_c_);
-  }
-
-  /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that
-  /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename LocalBcastA::ttT;
-
-    LocalBcastA(Edge<Key<3>, Blk> &a, Edge<Key<3>, Blk> &a_ijk,
-                const std::vector<std::vector<long>> &b_rowidx_to_colidx, const Keymap3 &ijk_keymap)
-        : baseT(edges(a), edges(a_ijk), "SpMM25D::local_bcast_a", {"a_ikp"}, {"a_ijk"},
-                [](const Key<3> &ikp) { return ikp[2]; })
-        , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<3> &ikp, typename baseT::input_refs_tuple_type &&a_ik, std::tuple<Out<Key<3>, Blk>> &a_ijk) {
-      const auto i = ikp[0];
-      const auto k = ikp[1];
-      const auto p = ikp[2];
-
-      auto world = default_execution_context();
-      assert(p == world.rank());
-      ttg::trace("LocalBcastA(", i, ", ", k, ", ", p, ")");
-      if (k >= b_rowidx_to_colidx_.size()) return;
-      // local broadcast a_ik to all {i,j,k} such that b_kj exists
-      std::vector<Key<3>> ijk_keys;
-      for (auto &j : b_rowidx_to_colidx_[k]) {
-        if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) {
-          ttg::trace("Broadcasting A[", i, "][", k, "] on proc ", p, " to j=", j);
-          ijk_keys.emplace_back(Key<3>({i, j, k}));
-        }
-      }
-      ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(a_ik)), a_ijk);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class LocalBcastA
-
-  /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists
-  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename BcastA::ttT;
-
-    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<3>, Blk> &a_ikp, const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
-        : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap)
-        , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, std::tuple<Out<Key<3>, Blk>> &a_ikp) {
-      const auto i = ik[0];
-      const auto k = ik[1];
-      ttg::trace("BcastA(", i, ", ", k, ")");
-      std::vector<Key<3>> ikp_keys;
-
-      if (k >= b_rowidx_to_colidx_.size()) return;
-      auto world = default_execution_context();
-      std::vector<bool> procmap(world.size());
-      for (auto &j : b_rowidx_to_colidx_[k]) {
-        const long p = ijk_keymap_(Key<3>(
-            {i, j, k}));  // N.B. in 2.5D SUMMA different k contributions to C[i][j] are computed on different nodes
-        if (!procmap[p]) {
-          ttg::trace("Broadcasting A[", i, "][", k, "] to proc ", p);
-          ikp_keys.emplace_back(Key<3>({i, k, p}));
-          procmap[p] = true;
-        }
-      }
-      ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), a_ikp);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class BcastA
-
-  /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
-  /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename LocalBcastB::ttT;
-
-    LocalBcastB(Edge<Key<3>, Blk> &b_kjp, Edge<Key<3>, Blk> &b_ijk,
-                const std::vector<std::vector<long>> &a_colidx_to_rowidx, const Keymap3 &ijk_keymap)
-        : baseT(edges(b_kjp), edges(b_ijk), "SpMM25D::local_bcast_b", {"b_kjp"}, {"b_ijk"},
-                [](const Key<3> &kjp) { return kjp[2]; })
-        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<3> &kjp, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_ijk) {
-      const auto k = kjp[0];
-      const auto j = kjp[1];
-      const auto p = kjp[2];
-      auto world = default_execution_context();
-      assert(p == world.rank());
-      ttg::trace("BcastB(", k, ", ", j, ", ", p, ")");
-      if (k >= a_colidx_to_rowidx_.size()) return;
-      // broadcast b_kj to all ijk for which c_ij is on this processor and a_ik exists
-      std::vector<Key<3>> ijk_keys;
-      for (auto &i : a_colidx_to_rowidx_[k]) {
-        if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) {
-          ttg::trace("Broadcasting B[", k, "][", j, "] on proc ", p, " to i=", i);
-          ijk_keys.emplace_back(Key<3>({i, j, k}));
-        }
-      }
-      ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(b_kj)), b_ijk);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class LocalBcastB
-
-  /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
-  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename BcastB::ttT;
-
-    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<3>, Blk> &b_kjp, const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
-        : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap)
-        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_kjp) {
-      const auto k = kj[0];
-      const auto j = kj[1];
-      // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists
-      std::vector<Key<3>> kjp_keys;
-      ttg::trace("BcastB(", k, ", ", j, ")");
-      if (k >= a_colidx_to_rowidx_.size()) return;
-      auto world = default_execution_context();
-      std::vector<bool> procmap(world.size());
-      for (auto &i : a_colidx_to_rowidx_[k]) {
-        long p = ijk_keymap_(Key<3>({i, j, k}));
-        if (!procmap[p]) {
-          ttg::trace("Broadcasting B[", k, "][", j, "] to proc ", p);
-          kjp_keys.emplace_back(Key<3>({k, j, p}));
-          procmap[p] = true;
-        }
-      }
-      ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), b_kjp);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class BcastB
-
-  /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this kayer of the
-  /// 3-D process grid only
-  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd,
-                                ttg::typelist<const Blk, const Blk, Blk>> {
-   public:
-    using baseT = typename MultiplyAdd::ttT;
-
-#if defined(TTG_ENABLE_CUDA)
-    static constexpr bool have_cuda_op = true;
-#warning SPMM using CUDA implementation
-#elif defined(TTG_ENABLE_HIP)
-    static constexpr bool have_hip_op  = true;
-#warning SPMM using HIP implementation
-#elif defined(TTG_ENABLE_LEVEL_ZERO)
-    static constexpr bool have_level_zero_op = true;
-#warning SPMM using LEVEL_ZERO implementation
-#else
-#error No valid device implementation found!
-#endif
-
-    MultiplyAdd(Edge<Key<3>, Blk> &a_ijk, Edge<Key<3>, Blk> &b_ijk, Edge<Key<3>, Blk> &c_ijk, Edge<Key<2>, Blk> &c,
-                const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                const std::vector<std::vector<long>> &b_colidx_to_rowidx, const std::vector<int> &mTiles,
-                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap)
-        : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
-                {"c_ij", "c_ijk"}, ijk_keymap)
-        , a_rowidx_to_colidx_(a_rowidx_to_colidx)
-        , b_colidx_to_rowidx_(b_colidx_to_rowidx) {
-      this->set_priomap([this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
-      auto num_devices = ttg::device::num_devices();
-      this->set_devicemap(
-        [num_devices](const Key<3> &ijk){
-          return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
-        });
-      // for each {i,j} determine first k that contributes AND belongs to this node,
-      // initialize input {i,j,first_k} flow to 0
-      for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) {
-        if (a_rowidx_to_colidx_[i].empty()) continue;
-        for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) {
-          if (b_colidx_to_rowidx_[j].empty()) continue;
-
-          const auto p = ttg::default_execution_context().rank();
-          decltype(i) k;
-          bool have_k;
-          std::tie(k, have_k) = compute_first_k(i, j, p);
-          if (have_k) {
-            ttg::trace("Initializing C[", i, "][", j, "] on process ", p, " to zero");
-#if BLOCK_SPARSE_GEMM
-            Blk zero(btas::Range(mTiles[i], nTiles[j]), 0.0);
-#else
-            Blk zero{0.0};
-#endif
-            this->template in<2>()->send(Key<3>({i, j, k}), zero);
-          } else {
-            if (tracing() && a_rowidx_to_colidx_.size() * b_colidx_to_rowidx_.size() < 400)
-              ttg::print("C[", i, "][", j, "] is empty");
-          }
-        }
-      }
-    }
-
-    ttg::device::Task op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
-            std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
-      const auto i = ijk[0];
-      const auto j = ijk[1];
-      const auto k = ijk[2];  // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1
-      const auto p = ttg::default_execution_context().rank();
-      long next_k;
-      bool have_next_k;
-
-      const blk_t& A = baseT::template get<0>(_ijk);
-      const blk_t& B = baseT::template get<1>(_ijk);
-      blk_t& C = baseT::template get<2>(_ijk);
-
-      if (C.empty()) {
-        C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0);
-      }
-
-      /* pull all buffers onto the device */
-      co_await ttg::device::select(A.b, B.b, C.b);
-
-      /* everything is on the device, call the gemm */
-      device_gemm(C, A, B);
-
-      /* compute next k while the kernel is running */
-      std::tie(next_k, have_next_k) = compute_next_k(i, j, k, p);
-      ttg::trace("Rank ", ttg::default_execution_context().rank(),
-                 " :"
-                 " C[",
-                 i, "][", j, "]  += A[", i, "][", k, "] by B[", k, "][", j, "],  next_k? ",
-                 (have_next_k ? std::to_string(next_k) : "does not exist"));
-
-      /* wait for the kernel to complete */
-      co_await ttg::device::wait();
-
-
-      // compute the contrib, pass the running total to the next flow, if needed
-      // otherwise write to the result flow
-      if (have_next_k) {
-        co_await ttg::device::forward(ttg::device::send<1>(
-                                                Key<3>({i, j, next_k}),
-                                                std::move(C),
-                                                result));
-      } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
-                // belongs
-        co_await ttg::device::forward(ttg::device::send<0>(
-                                                Key<2>({i, j}),
-                                                std::move(C),
-                                                result));
-      }
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_rowidx_to_colidx_;
-    const std::vector<std::vector<long>> &b_colidx_to_rowidx_;
-
-    /* Compute the length of the remaining sequence on that tile */
-    int32_t prio(const Key<3> &key) const {
-      const auto i = key[0];
-      const auto j = key[1];
-      const auto k = key[2];
-      int32_t len = -1;  // will be incremented at least once
-      long next_k = k;
-      bool have_next_k;
-      do {
-        std::tie(next_k, have_next_k) = compute_next_k(i, j, next_k);  // here I know how many 'k' I have with same ij
-        ++len;
-      } while (have_next_k);
-      return len;
-    }
-
-   public:  // to be able to reuse this logic in SpMM25D
-    // given {i,j} return first k such that A[i][k] and B[k][j] exist
-    std::tuple<long, bool> compute_first_k(long i, long j) const {
-      const auto &a_k_range = a_rowidx_to_colidx_.at(i);
-      auto a_iter = a_k_range.begin();
-      auto a_iter_fence = a_k_range.end();
-      if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-      const auto &b_k_range = b_colidx_to_rowidx_.at(j);
-      auto b_iter = b_k_range.begin();
-      auto b_iter_fence = b_k_range.end();
-      if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-
-      {
-        auto a_colidx = *a_iter;  // pointing to next kth element
-        auto b_rowidx = *b_iter;
-        while (a_colidx != b_rowidx) {
-          if (a_colidx < b_rowidx) {
-            ++a_iter;
-            if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-            a_colidx = *a_iter;
-          } else {
-            ++b_iter;
-            if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-            b_rowidx = *b_iter;
-          }
-        }
-        return std::make_tuple(a_colidx, true);  // returned true for kth element exist and also returns next k since
-                                                 // a_colidx points to ++a_iter,  if not reaches to fence
-      }
-      assert(false);
-    }
-
-    // given {i,j,k} such that A[i][k] and B[k][j] exist
-    // return next k such that this condition holds
-    std::tuple<long, bool> compute_next_k(long i, long j, long k) const {
-      const auto &a_k_range = a_rowidx_to_colidx_.at(i);
-      auto a_iter_fence = a_k_range.end();
-      auto a_iter = std::find(a_k_range.begin(), a_iter_fence, k);
-      assert(a_iter != a_iter_fence);
-      const auto &b_k_range = b_colidx_to_rowidx_.at(j);
-      auto b_iter_fence = b_k_range.end();
-      auto b_iter = std::find(b_k_range.begin(), b_iter_fence, k);
-      assert(b_iter != b_iter_fence);
-      while (a_iter != a_iter_fence && b_iter != b_iter_fence) {
-        ++a_iter;
-        ++b_iter;
-        if (a_iter == a_iter_fence || b_iter == b_iter_fence) return std::make_tuple(-1, false);
-        auto a_colidx = *a_iter;
-        auto b_rowidx = *b_iter;
-        while (a_colidx != b_rowidx) {
-          if (a_colidx < b_rowidx) {
-            ++a_iter;
-            if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-            a_colidx = *a_iter;
-          } else {
-            ++b_iter;
-            if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-            b_rowidx = *b_iter;
-          }
-        }
-        return std::make_tuple(a_colidx, true);
-      }
-      ttg::abort();  // unreachable
-      return std::make_tuple(0, false);
-    }
-
-    // given {i,j} return first k such that A[i][k] and B[k][j] exist AND ijk_keymap_(i,j,k) == p
-    std::tuple<long, bool> compute_first_k(long i, long j, long p) const {
-      long first_k = 0;
-      bool have_k = false;
-      std::tie(first_k, have_k) = compute_first_k(i, j);
-      while (have_k) {
-        if (this->get_keymap()(Key<3>{i, j, first_k}) == p)
-          return {first_k, true};
-        else
-          std::tie(first_k, have_k) = compute_next_k(i, j, first_k);
-      }
-      return {0, false};
-    }
-
-    // given {i,j,k} such that A[i][k] and B[k][j] exist
-    // return next k such that this condition holds AND ijk_keymap_(i,j,k) == p
-    std::tuple<long, bool> compute_next_k(long i, long j, long k, long p) const {
-      long next_k = 0;
-      bool have_k = false;
-      std::tie(next_k, have_k) = compute_next_k(i, j, k);
-      while (have_k) {
-        if (this->get_keymap()(Key<3>{i, j, next_k}) == p)
-          return {next_k, true};
-        else
-          std::tie(next_k, have_k) = compute_next_k(i, j, next_k);
-      }
-      return {0, false};
-    }
-
-  };  // MultiplyAdd
-
-  /// reduces contributions to `C[i][j]` produced on different layers of the 3-d process grid
-  class ReduceC : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReduceC, ttg::typelist<Blk>> {
-   public:
-    using baseT = typename ReduceC::ttT;
-
-    ReduceC(Edge<Key<2>, Blk> &c_ij_p, Edge<Key<2>, Blk> &c_ij, const Keymap2 &ij_keymap)
-        : baseT(edges(c_ij_p), edges(c_ij), "SpMM25D::reduce_c", {"c_ij(p)"}, {"c_ij"}, ij_keymap) {}
-
-    void op(const Key<2> &ij, typename baseT::input_refs_tuple_type &&c_ij_p, std::tuple<Out<Key<2>, Blk>> &c_ij) {
-      ttg::trace("ReduceC(", ij[0], ", ", ij[1], ")");
-      ::send<0>(ij, std::move(baseT::template get<0>(c_ij_p)), c_ij);
-    }
-  };  // class ReduceC
-
- private:
-  Edge<Key<3>, Blk> a_ijk_;
-  Edge<Key<3>, Blk> local_a_ijk_;
-  Edge<Key<3>, Blk> b_ijk_;
-  Edge<Key<3>, Blk> local_b_ijk_;
-  Edge<Key<3>, Blk> c_ijk_;
-  Edge<Key<2>, Blk> c_ij_p_;
-  const std::vector<std::vector<long>> &a_rowidx_to_colidx_;
-  const std::vector<std::vector<long>> &b_colidx_to_rowidx_;
-  const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-  const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-  std::unique_ptr<BcastA> bcast_a_;
-  std::unique_ptr<LocalBcastA> local_bcast_a_;
-  std::unique_ptr<BcastB> bcast_b_;
-  std::unique_ptr<LocalBcastB> local_bcast_b_;
-  std::unique_ptr<MultiplyAdd> multiplyadd_;
-  std::unique_ptr<ReduceC> reduce_c_;
-  Keymap2 ij_keymap_;
-  Keymap3 ijk_keymap_;
-};
-
-class Control : public TT<void, std::tuple<Out<Key<2>>>, Control> {
-  using baseT = typename Control::ttT;
-  int P;
-  int Q;
-
- public:
-  explicit Control(Edge<Key<2>> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}), P(0), Q(0) {}
-
-  void op(std::tuple<Out<Key<2>>> &out) const {
-    for (int p = 0; p < P; p++) {
-      for (int q = 0; q < Q; q++) {
-        ttg::trace("Control: start computing on process {", p, ", ", q, "}");
-        ::sendk<0>(Key<2>{p, q}, out);
-      }
-    }
-  }
-
-  void start(const int _p, const int _q) {
-    P = _p;
-    Q = _q;
-    invoke();
-  }
-};
-
-#ifdef BTAS_IS_USABLE
-template <typename T_, class Range_, class Store_>
-std::tuple<T_, T_> norms(const btas::Tensor<T_, Range_, Store_> &t) {
-  T_ norm_2_square = 0.0;
-  T_ norm_inf = 0.0;
-  for (auto k : t) {
-    norm_2_square += k * k;
-    norm_inf = std::max(norm_inf, std::abs(k));
-  }
-  return std::make_tuple(norm_2_square, norm_inf);
-}
-#endif
-
-std::tuple<double, double> norms(double t) { return std::make_tuple(t * t, std::abs(t)); }
-
-template <typename Blk = blk_t>
-std::tuple<double, double> norms(const SpMatrix<Blk> &A) {
-  double norm_2_square = 0.0;
-  double norm_inf = 0.0;
-  for (int i = 0; i < A.outerSize(); ++i) {
-    for (typename SpMatrix<Blk>::InnerIterator it(A, i); it; ++it) {
-      //  cout << 1+it.row() << "\t"; // row index
-      //  cout << 1+it.col() << "\t"; // col index (here it is equal to k)
-      //  cout << it.value() << endl;
-      auto& elem = it.value();
-      double elem_norm_2_square, elem_norm_inf;
-      std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem);
-      norm_2_square += elem_norm_2_square;
-      norm_inf = std::max(norm_inf, elem_norm_inf);
-    }
-  }
-  return std::make_tuple(norm_2_square, norm_inf);
-}
-
-char *getCmdOption(char **begin, char **end, const std::string &option) {
-  static char *empty = (char *)"";
-  char **itr = std::find(begin, end, option);
-  if (itr != end && ++itr != end) return *itr;
-  return empty;
-}
-
-bool cmdOptionExists(char **begin, char **end, const std::string &option) {
-  return std::find(begin, end, option) != end;
-}
-
-int cmdOptionIndex(char **begin, char **end, const std::string &option) {
-  char **itr = std::find(begin, end, option);
-  if (itr != end) return (int)(itr - begin);
-  return -1;
-}
-
-static int parseOption(std::string &option, int default_value) {
-  size_t pos;
-  std::string token;
-  int N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stoi(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-static long parseOption(std::string &option, long default_value) {
-  size_t pos;
-  std::string token;
-  long N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stol(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-static double parseOption(std::string &option, double default_value = 0.25) {
-  size_t pos;
-  std::string token;
-  double N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stod(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-#if !defined(BLOCK_SPARSE_GEMM)
-static void initSpMatrixMarket(const std::function<int(const Key<2> &)> &keymap, const char *filename, SpMatrix<> &A,
-                               SpMatrix<> &B, SpMatrix<> &C, int &M, int &N, int &K) {
-  std::vector<int> sizes;
-  // We load the entire matrix on each rank, but we only use the local part for the GEMM
-  // loadMarket() is the eigan fuction to load matrix from a file
-  if (!loadMarket(A, filename)) {
-    std::cerr << "Failed to load " << filename << ", bailing out..." << std::endl;
-    ttg::ttg_abort();
-  }
-  if (0 == ttg::default_execution_context().rank()) {
-    std::cout << "##MatrixMarket file " << filename << " -- " << A.rows() << " x " << A.cols() << " -- " << A.nonZeros()
-              << " nnz (density: " << (float)A.nonZeros() / (float)A.rows() / (float)A.cols() << ")" << std::endl;
-  }
-  if (A.rows() != A.cols()) {
-    B = A.transpose();
-  } else {
-    B = A;
-  }
-
-  C.resize(A.rows(), B.cols());
-  M = (int)A.rows();
-  N = (int)C.cols();
-  K = (int)A.cols();
-}
-
-static void initSpRmat(const std::function<int(const Key<2> &)> &keymap, const char *opt, SpMatrix<> &A, SpMatrix<> &B,
-                       SpMatrix<> &C, int &M, int &N, int &K, unsigned long seed) {
-  int E;
-  double a = 0.25, b = 0.25, c = 0.25, d = 0.25;
-  size_t nnz = 0;
-
-  if (nullptr == opt) {
-    std::cerr << "Usage: -rmat <#nodes>[:<#edges>[:<a>[:<b>:[<c>[:<d>]]]]]" << std::endl;
-    exit(1);
-  }
-  std::string token;
-  std::string option = std::string(opt);
-  N = parseOption(option, -1);
-  K = N;
-  M = N;
-
-  // We build the entire sparse matrix on each rank, but use only the local part
-  // on a given rank, according to keymap
-  A.resize(N, N);
-
-  E = parseOption(option, (int)(0.01 * N * N));
-  a = parseOption(option, a);
-  b = parseOption(option, b);
-  c = parseOption(option, c);
-  d = parseOption(option, d);
-
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "#R-MAT: " << N << " nodes, " << E << " edges, a/b/c/d = " << a << "/" << b << "/" << c << "/" << d
-              << std::endl;
-  }
-
-  boost::minstd_rand gen(seed);
-  boost::rmat_iterator<boost::minstd_rand, boost::directed_graph<>> rmat_it(gen, N, E, a, b, c, d);
-
-  using triplet_t = ttg::matrix::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  for (int i = 0; i < N; i++) {
-    nnz++;
-    A_elements.emplace_back(i, i, 1.0);
-  }
-  for (int i = 0; i < E; i++) {
-    auto x = *rmat_it++;
-    if (x.first != x.second) {
-      A_elements.emplace_back(x.first, x.second, 1.0);
-      nnz++;
-    }
-  }
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  B = A;
-  C.resize(N, N);
-
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "#R-MAT: " << E << " nonzero elements, density: " << (double)nnz / (double)N / (double)N << std::endl;
-  }
-}
-
-static void initSpHardCoded(const std::function<int(const Key<2> &)> &keymap, SpMatrix<> &A, SpMatrix<> &B,
-                            SpMatrix<> &C, int &m, int &n, int &k) {
-  m = 2;
-  n = 3;
-  k = 4;
-
-  std::cout << "#HardCoded A, B, C" << std::endl;
-  A.resize(m, k);
-  B.resize(k, n);
-  C.resize(m, n);
-  // We initialize the same matrices on all the ranks, but we will use only the local part
-  // following the keymap
-  using triplet_t = ttg::matrix::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  A_elements.emplace_back(0, 1, 12.3);
-  A_elements.emplace_back(0, 2, 10.7);
-  A_elements.emplace_back(0, 3, -2.3);
-  A_elements.emplace_back(1, 0, -0.3);
-  A_elements.emplace_back(1, 2, 1.2);
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  std::vector<triplet_t> B_elements;
-  B_elements.emplace_back(0, 0, 12.3);
-  B_elements.emplace_back(1, 0, 10.7);
-  B_elements.emplace_back(3, 0, -2.3);
-  B_elements.emplace_back(1, 1, -0.3);
-  B_elements.emplace_back(1, 2, 1.2);
-  B_elements.emplace_back(2, 2, 7.2);
-  B_elements.emplace_back(3, 2, 0.2);
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-}
-
-#else
-static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap, SpMatrix<> &A, SpMatrix<> &B,
-                              SpMatrix<> &C, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs,
-                              std::vector<int> &mTiles, std::vector<int> &nTiles, std::vector<int> &kTiles,
-                              std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                              std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                              std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                              std::vector<std::vector<long>> &b_colidx_to_rowidx, int &m, int &n, int &k) {
-  m = 2;
-  n = 3;
-  k = 4;
-
-  std::cout << "#HardCoded A, B, C" << std::endl;
-  A.resize(m, k);
-  B.resize(k, n);
-  C.resize(m, n);
-  if (buildRefs) {
-    Aref.resize(m, k);
-    Bref.resize(k, n);
-  }
-
-  for (int mt = 0; mt < m; mt++) mTiles.push_back(128);
-  for (int nt = 0; nt < n; nt++) nTiles.push_back(196);
-  for (int kt = 0; kt < k; kt++) kTiles.push_back(256);
-
-  int rank = ttg::default_execution_context().rank();
-
-  using triplet_t = ttg::matrix::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  std::vector<triplet_t> Aref_elements;
-#if defined(BTAS_IS_USABLE)
-  if (keymap({0, 1}) == rank) {
-    A_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
-  }
-  if (keymap({0, 2}) == rank) {
-    A_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
-  }
-  if (keymap({0, 3}) == rank) {
-    A_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
-  }
-  if (keymap({1, 0}) == rank) {
-    A_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
-  }
-  if (keymap({1, 2}) == rank) {
-    A_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
-  }
-  if (buildRefs && rank == 0) {
-    Aref_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
-    Aref_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
-    Aref_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
-    Aref_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
-    Aref_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
-  }
-#else
-  if ((buildRefs && rank == 0) || keymap({0, 1}) == rank) {
-    A_elements.emplace_back(0, 1, 12.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({0, 2}) == rank) {
-    A_elements.emplace_back(0, 2, 10.7);
-  }
-  if ((buildRefs && rank == 0) || keymap({0, 3}) == rank) {
-    A_elements.emplace_back(0, 3, -2.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({1, 0}) == rank) {
-    A_elements.emplace_back(1, 0, -0.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({1, 2}) == rank) {
-    A_elements.emplace_back(1, 2, .2);
-  }
-  if (buildRefs && rank == 0) {
-    Aref_elements.emplace_back(0, 1, 12.3);
-    Aref_elements.emplace_back(0, 2, 10.7);
-    Aref_elements.emplace_back(0, 3, -2.3);
-    Aref_elements.emplace_back(1, 0, -0.3);
-    Aref_elements.emplace_back(1, 2, .2);
-  }
-#endif
-  a_rowidx_to_colidx.resize(2);
-  a_rowidx_to_colidx[0].emplace_back(1);  // A[0][1]
-  a_rowidx_to_colidx[0].emplace_back(2);  // A[0][2]
-  a_rowidx_to_colidx[0].emplace_back(3);  // A[0][3]
-  a_rowidx_to_colidx[1].emplace_back(0);  // A[1][0]
-  a_rowidx_to_colidx[1].emplace_back(2);  // A[1][2]
-
-  a_colidx_to_rowidx.resize(4);
-  a_colidx_to_rowidx[0].emplace_back(1);  // A[1][0]
-  a_colidx_to_rowidx[1].emplace_back(0);  // A[0][1]
-  a_colidx_to_rowidx[2].emplace_back(0);  // A[0][2]
-  a_colidx_to_rowidx[2].emplace_back(1);  // A[1][2]
-  a_colidx_to_rowidx[3].emplace_back(0);  // A[0][3]
-
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  if (buildRefs && 0 == rank) {
-    Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());
-  }
-
-  std::vector<triplet_t> B_elements;
-  std::vector<triplet_t> Bref_elements;
-#if defined(BTAS_IS_USABLE)
-  if (keymap({0, 0}) == rank) {
-    B_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
-  }
-  if (keymap({1, 0}) == rank) {
-    B_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
-  }
-  if (keymap({3, 0}) == rank) {
-    B_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
-  }
-  if (keymap({1, 1}) == rank) {
-    B_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
-  }
-  if (keymap({1, 2}) == rank) {
-    B_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
-  }
-  if (keymap({2, 2}) == rank) {
-    B_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
-  }
-  if (keymap({3, 2}) == rank) {
-    B_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
-  }
-  if (buildRefs && rank == 0) {
-    Bref_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
-    Bref_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
-    Bref_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
-    Bref_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
-    Bref_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
-    Bref_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
-    Bref_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
-  }
-#else
-  if (keymap({0, 0}) == rank) {
-    B_elements.emplace_back(0, 0, 12.3);
-  }
-  if (keymap({1, 0}) == rank) {
-    B_elements.emplace_back(1, 0, 10.7);
-  }
-  if (keymap({3, 0}) == rank) {
-    B_elements.emplace_back(3, 0, -2.3);
-  }
-  if (keymap({1, 1}) == rank) {
-    B_elements.emplace_back(1, 1, -0.3);
-  }
-  if (keymap({1, 2}) == rank) {
-    B_elements.emplace_back(1, 2, 1.2);
-  }
-  if (keymap({2, 2}) == rank) {
-    B_elements.emplace_back(2, 2, 7.2);
-  }
-  if (keymap({3, 2}) == rank) {
-    B_elements.emplace_back(3, 2, 0.2);
-  }
-#endif
-  b_rowidx_to_colidx.resize(4);
-  b_rowidx_to_colidx[0].emplace_back(0);  // B[0][0]
-  b_rowidx_to_colidx[1].emplace_back(0);  // B[1][0]
-  b_rowidx_to_colidx[1].emplace_back(1);  // B[1][1]
-  b_rowidx_to_colidx[1].emplace_back(2);  // B[1][2]
-  b_rowidx_to_colidx[2].emplace_back(2);  // B[2][2]
-  b_rowidx_to_colidx[3].emplace_back(0);  // B[3][0]
-  b_rowidx_to_colidx[3].emplace_back(2);  // B[3][2]
-
-  b_colidx_to_rowidx.resize(3);
-  b_colidx_to_rowidx[0].emplace_back(0);  // B[0][0]
-  b_colidx_to_rowidx[0].emplace_back(1);  // B[1][0]
-  b_colidx_to_rowidx[0].emplace_back(3);  // B[3][0]
-  b_colidx_to_rowidx[1].emplace_back(1);  // B[1][1]
-  b_colidx_to_rowidx[2].emplace_back(1);  // B[1][2]
-  b_colidx_to_rowidx[2].emplace_back(2);  // B[2][2]
-  b_colidx_to_rowidx[2].emplace_back(3);  // A[3][2]
-
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-  if (buildRefs && 0 == rank) {
-    Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end());
-  }
-}
-
-#if defined(BTAS_IS_USABLE)
-static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, size_t M, size_t N, size_t K, int minTs,
-                           int maxTs, double avgDensity, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &Aref,
-                           SpMatrix<> &Bref, bool buildRefs, std::vector<int> &mTiles, std::vector<int> &nTiles,
-                           std::vector<int> &kTiles, std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                           std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                           std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                           std::vector<std::vector<long>> &b_colidx_to_rowidx, double &average_tile_size,
-                           double &Adensity, double &Bdensity, unsigned int seed) {
-  int rank = ttg::default_execution_context().rank();
-
-  int ts;
-  std::mt19937 gen(seed);
-  std::mt19937 genv(seed + 1);
-
-  std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
-  using triplet_t = ttg::matrix::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  std::vector<triplet_t> B_elements;
-  std::vector<triplet_t> Aref_elements;
-  std::vector<triplet_t> Bref_elements;
-
-  for (int m = 0; m < M; m += ts) {
-    ts = dist(gen);
-    if (ts > M - m) ts = M - m;
-    mTiles.push_back(ts);
-  }
-  for (int n = 0; n < N; n += ts) {
-    ts = dist(gen);
-    if (ts > N - n) ts = N - n;
-    nTiles.push_back(ts);
-  }
-  for (int k = 0; k < K; k += ts) {
-    ts = dist(gen);
-    if (ts > K - k) ts = K - k;
-    kTiles.push_back(ts);
-  }
-
-  A.resize(mTiles.size(), kTiles.size());
-  B.resize(kTiles.size(), nTiles.size());
-  if (buildRefs) {
-    Aref.resize(mTiles.size(), kTiles.size());
-    Bref.resize(kTiles.size(), nTiles.size());
-  }
-
-  std::uniform_int_distribution<> mDist(0, mTiles.size() - 1);
-  std::uniform_int_distribution<> nDist(0, nTiles.size() - 1);
-  std::uniform_int_distribution<> kDist(0, kTiles.size() - 1);
-  std::uniform_real_distribution<> vDist(-1.0, 1.0);
-
-  size_t filling = 0;
-  size_t avg_nb = 0;
-  int avg_nb_nb = 0;
-
-  struct tuple_hash : public std::unary_function<std::tuple<int, int>, std::size_t> {
-    std::size_t operator()(const std::tuple<int, int> &k) const {
-      return static_cast<size_t>(std::get<0>(k)) | (static_cast<size_t>(std::get<1>(k)) << 32);
-    }
-  };
-
-  std::unordered_set<std::tuple<int, int>, tuple_hash> fills;
-
-  fills.clear();
-  while ((double)filling / (double)(M * K) < avgDensity) {
-    int mt = mDist(gen);
-    int kt = kDist(gen);
-
-    if (fills.find({mt, kt}) != fills.end()) continue;
-    fills.insert({mt, kt});
-
-    if (mt >= a_rowidx_to_colidx.size()) a_rowidx_to_colidx.resize(mt + 1);
-    a_rowidx_to_colidx[mt].emplace_back(kt);
-    if (kt >= a_colidx_to_rowidx.size()) a_colidx_to_rowidx.resize(kt + 1);
-    a_colidx_to_rowidx[kt].emplace_back(mt);
-
-    filling += mTiles[mt] * kTiles[kt];
-    avg_nb += mTiles[mt] * kTiles[kt];
-    avg_nb_nb++;
-    double value = vDist(genv);
-    if (0 == rank && buildRefs) Aref_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value));
-    if (rank != keymap({mt, kt})) continue;
-    A_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value));
-  }
-  for (auto &row : a_rowidx_to_colidx) {
-    std::sort(row.begin(), row.end());
-  }
-  for (auto &col : a_colidx_to_rowidx) {
-    std::sort(col.begin(), col.end());
-  }
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-  Adensity = (double)filling / (double)(M * K);
-  if (0 == rank && buildRefs) Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());
-
-  filling = 0;
-  fills.clear();
-  while ((double)filling / (double)(K * N) < avgDensity) {
-    int nt = nDist(gen);
-    int kt = kDist(gen);
-
-    if (fills.find({kt, nt}) != fills.end()) continue;
-    fills.insert({kt, nt});
-
-    if (kt >= b_rowidx_to_colidx.size()) b_rowidx_to_colidx.resize(kt + 1);
-    b_rowidx_to_colidx[kt].emplace_back(nt);
-    if (nt >= b_colidx_to_rowidx.size()) b_colidx_to_rowidx.resize(nt + 1);
-    b_colidx_to_rowidx[nt].emplace_back(kt);
-
-    filling += kTiles[kt] * nTiles[nt];
-    avg_nb += kTiles[kt] * nTiles[nt];
-    avg_nb_nb++;
-    double value = vDist(genv);
-    if (0 == rank && buildRefs) Bref_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value));
-    if (rank != keymap({kt, nt})) continue;
-    B_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value));
-  }
-  for (auto &row : b_rowidx_to_colidx) {
-    std::sort(row.begin(), row.end());
-  }
-  for (auto &col : b_colidx_to_rowidx) {
-    std::sort(col.begin(), col.end());
-  }
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-  Bdensity = (double)filling / (double)(K * N);
-  if (0 == rank && buildRefs) Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end());
-  fills.clear();
-
-  average_tile_size = (double)avg_nb / avg_nb_nb;
-}
-#endif
-
-#endif
-
-static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<int(const Key<2> &)> &ij_keymap,
-                              const std::function<int(const Key<3> &)> &ijk_keymap, const std::string &tiling_type,
-                              double gflops, double avg_nb, double Adensity, double Bdensity,
-                              const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                              const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                              const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                              const std::vector<std::vector<long>> &b_colidx_to_rowidx, std::vector<int> &mTiles,
-                              std::vector<int> &nTiles, std::vector<int> &kTiles, int M, int N, int K, int minTs,
-                              int maxTs, int P, int Q, int R) {
-  int MT = (int)A.rows();
-  int NT = (int)B.cols();
-  int KT = (int)A.cols();
-  assert(KT == B.rows());
-
-  SpMatrix<> C;
-  C.resize(MT, NT);
-
-  // flow graph needs to exist on every node
-  Edge<Key<2>> ctl("control");
-  Control control(ctl);
-  Edge<Key<2>, blk_t> eA, eB;
-  Edge<Key<2>, blk_t> eC;
-
-  Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
-  Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
-  Write_SpMatrix<> c(C, eC, ij_keymap);
-  auto &c_status = c.status();
-  assert(!has_value(c_status));
-  //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
-  SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx,
-                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
-  TTGUNUSED(a);
-  TTGUNUSED(b);
-  TTGUNUSED(a_times_b);
-
-  auto connected = make_graph_executable(&control);
-  assert(connected);
-  TTGUNUSED(connected);
-
-  struct timeval start {
-    0
-  }, end{0}, diff{0};
-  gettimeofday(&start, nullptr);
-  // ready, go! need only 1 kick, so must be done by 1 thread only
-  if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
-  fence();
-  gettimeofday(&end, nullptr);
-  timersub(&end, &start, &diff);
-  double tc = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
-#if defined(TTG_USE_MADNESS)
-  std::string rt("MAD");
-#elif defined(TTG_USE_PARSEC)
-  std::string rt("PARSEC");
-#else
-  std::string rt("Unkown???");
-#endif
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M
-              << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type
-              << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc
-              << " gflops/s= " << gflops / tc << std::endl;
-  }
-}
-
-#if !defined(BLOCK_SPARSE_GEMM)
-static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &r2c) {
-  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
-    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
-      const long row = it.row();
-      const long col = it.col();
-      if (row >= r2c.size()) r2c.resize(row + 1);
-      r2c[row].push_back(col);
-    }
-  }
-  // Sort each vector of column indices, as we pushed them in an arbitrary order
-  for (auto &row : r2c) {
-    std::sort(row.begin(), row.end());
-  }
-}
-
-static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &c2r) {
-  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
-    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
-      const long row = it.row();
-      const long col = it.col();
-
-      if (col >= c2r.size()) c2r.resize(col + 1);
-      c2r[col].push_back(row);
-    }
-    // Sort each vector of row indices, as we pushed them in an arbitrary order
-    for (auto &col : c2r) {
-      std::sort(col.begin(), col.end());
-    }
-  }
-}
-#endif
-
-static double compute_gflops(const std::vector<std::vector<long>> &a_r2c, const std::vector<std::vector<long>> &b_r2c,
-                             const std::vector<int> &mTiles, const std::vector<int> &nTiles,
-                             const std::vector<int> &kTiles) {
-  unsigned long flops = 0;
-  for (auto i = 0; i < a_r2c.size(); i++) {
-    for (auto kk = 0; kk < a_r2c[i].size(); kk++) {
-      auto k = a_r2c[i][kk];
-      if (k > b_r2c.size()) continue;
-      for (auto jj = 0; jj < b_r2c[k].size(); jj++) {
-        auto j = b_r2c[k][jj];
-        flops += static_cast<long>(mTiles[i]) * nTiles[j] * kTiles[k];
-      }
-    }
-  }
-  return 2.0 * (double)flops / 1e9;
-}
-
-int main(int argc, char **argv) {
-  bool timing;
-  double gflops;
-
-  // warm up silicon by calling gemm a few times
-#ifdef BTAS_IS_USABLE
-  for (int i = 0; i < 20; i++) {
-    using baseT = typename btas::Tensor<double>;
-    btas::Tensor<double, btas::Range, std::vector<double>> At(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Bt(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Ct(30, 30);
-    At.fill(1.0);
-    Bt.fill(2.0);
-    Ct.fill(3.0);
-    btas::gemm(std::move(Ct), Bt, At);
-  }
-#endif  // BTAS_IS_USABLE
-
-//  static volatile int debug_signal = 0;
-//  std::cout << "Waiting on debug signal (int*)" << &debug_signal << std::endl;
-//  while (!debug_signal) {}
-
-
-  int cores = -1;
-  std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c"));
-  cores = parseOption(nbCoreStr, cores);
-
-  if (int dashdash = cmdOptionIndex(argv, argv + argc, "--") > -1) {
-    initialize(argc - dashdash, argv + dashdash, cores);
-  } else {
-    initialize(1, argv, cores);
-  }
-
-#ifdef BTAS_IS_USABLE
-  // initialize MADNESS so that TA allocators can be created
-  madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
-  madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
-#endif  // BTAS_IS_USABLE
-
-  std::string debugStr(getCmdOption(argv, argv + argc, "-d"));
-  auto debug = (unsigned int)parseOption(debugStr, 0);
-
-  if (debug & (1 << 1)) {
-    using ttg::Debugger;
-    auto debugger = std::make_shared<Debugger>();
-    Debugger::set_default_debugger(debugger);
-    debugger->set_exec(argv[0]);
-    debugger->set_prefix(ttg::default_execution_context().rank());
-    // debugger->set_cmd("lldb_xterm");
-    debugger->set_cmd("gdb_xterm");
-  }
-
-  int mpi_size = ttg::default_execution_context().size();
-  int mpi_rank = ttg::default_execution_context().rank();
-  int best_pqc = mpi_size;
-  int P, Q, R;
-  for (int c = 1; c <= (int)cbrt(mpi_size); c++) {
-    for (int p = 1; p <= (int)sqrt(mpi_size / c); p++) {
-      if ((mpi_size % (p * c)) == 0) {
-        int q = mpi_size / (p * c);
-        if (abs(c - p - q) <= best_pqc) {
-          best_pqc = abs(c - p - q);
-          P = p;
-          Q = q;
-          R = c;
-        }
-      }
-    }
-    // ttg::launch_lldb(ttg::default_execution_context().rank(), argv[0]);
-
-    {
-      if (debug & (1 << 0)) {
-        ttg::trace_on();
-        TTBase::set_trace_all(true);
-      }
-
-      SpMatrix<> A, B, C, Aref, Bref;
-      std::string tiling_type;
-      int M = 0, N = 0, K = 0;
-      int minTs = 0, maxTs = 0;
-
-      double avg_nb = nan("undefined");
-      double Adensity = nan("undefined");
-      double Bdensity = nan("undefined");
-
-      std::string PStr(getCmdOption(argv, argv + argc, "-P"));
-      P = parseOption(PStr, P);
-      std::string QStr(getCmdOption(argv, argv + argc, "-Q"));
-      Q = parseOption(QStr, Q);
-      // to make code behave like 2D summa if R not given
-      std::string RStr(getCmdOption(argv, argv + argc, "-R"));
-      R = parseOption(RStr, 1);
-
-      if (P * Q * R != mpi_size) {
-        if (!cmdOptionExists(argv, argv + argc, "-Q") && (mpi_size % (P * R) == 0))
-          Q = mpi_size / (P * R);
-        else if (!cmdOptionExists(argv, argv + argc, "-P") && (mpi_size % (Q * R)) == 0)
-          P = mpi_size / (Q * R);
-        else if (!cmdOptionExists(argv, argv + argc, "-R") && (mpi_size % (Q * P)) == 0)
-          R = mpi_size / (Q * P);
-        else {
-          if (0 == mpi_rank) {
-            std::cerr << P << "x" << Q << "x" << R << " is not a valid process grid -- bailing out" << std::endl;
-            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-          }
-        }
-      }
-
-      auto ij_keymap = [P, Q](const Key<2> &ij) {
-        int i = (int)ij[0];
-        int j = (int)ij[1];
-        int r = ij2rank(i, j, P, Q);
-        return r;
-      };
-
-      auto ijk_keymap = [P, Q, R](const Key<3> &ijk) {
-        int i = (int)ijk[0];
-        int j = (int)ijk[1];
-        int k = (int)ijk[2];
-        int r = ijk2rank(i, j, k, P, Q, R);
-        return r;
-      };
-
-      std::string seedStr(getCmdOption(argv, argv + argc, "-s"));
-      unsigned int seed = parseOption(seedStr, 0);
-      if (seed == 0) {
-        std::random_device rd;
-        seed = rd();
-        if (0 == ttg::default_execution_context().rank()) std::cerr << "#Random seeded with " << seed << std::endl;
-      }
-      ttg_broadcast(ttg::default_execution_context(), seed, 0);
-
-      std::vector<int> mTiles;
-      std::vector<int> nTiles;
-      std::vector<int> kTiles;
-      std::vector<std::vector<long>> a_rowidx_to_colidx;
-      std::vector<std::vector<long>> a_colidx_to_rowidx;
-      std::vector<std::vector<long>> b_rowidx_to_colidx;
-      std::vector<std::vector<long>> b_colidx_to_rowidx;
-
-      std::string checkStr(getCmdOption(argv, argv + argc, "-x"));
-      int check = parseOption(checkStr, !(argc >= 2));
-      timing = (check == 0);
-
-#if !defined(BLOCK_SPARSE_GEMM)
-      if (cmdOptionExists(argv, argv + argc, "-mm")) {
-        char *filename = getCmdOption(argv, argv + argc, "-mm");
-        tiling_type = filename;
-        initSpMatrixMarket(ij_keymap, filename, A, B, C, M, N, K);
-      } else if (cmdOptionExists(argv, argv + argc, "-rmat")) {
-        char *opt = getCmdOption(argv, argv + argc, "-rmat");
-        tiling_type = "RandomSparseMatrix";
-        initSpRmat(ij_keymap, opt, A, B, C, M, N, K, seed);
-      } else {
-        tiling_type = "HardCodedSparseMatrix";
-        initSpHardCoded(ij_keymap, A, B, C, M, N, K);
-      }
-
-      if (check) {
-        // We don't generate the sparse matrices in distributed, so Aref and Bref can
-        // just point to the same matrix, or be a local copy.
-        Aref = A;
-        Bref = B;
-      }
-
-      // We still need to build the metadata from the  matrices.
-      make_rowidx_to_colidx_from_eigen(A, a_rowidx_to_colidx);
-      make_colidx_to_rowidx_from_eigen(A, a_colidx_to_rowidx);
-      make_rowidx_to_colidx_from_eigen(B, b_rowidx_to_colidx);
-      make_colidx_to_rowidx_from_eigen(B, b_colidx_to_rowidx);
-      // This is only needed to compute the flops
-      for (int mt = 0; mt < M; mt++) mTiles.emplace_back(1);
-      for (int nt = 0; nt < N; nt++) nTiles.emplace_back(1);
-      for (int kt = 0; kt < K; kt++) kTiles.emplace_back(1);
-#else
-      if (argc >= 2) {
-        std::string Mstr(getCmdOption(argv, argv + argc, "-M"));
-        M = parseOption(Mstr, 1200);
-        std::string Nstr(getCmdOption(argv, argv + argc, "-N"));
-        N = parseOption(Nstr, M);
-        std::string Kstr(getCmdOption(argv, argv + argc, "-K"));
-        K = parseOption(Kstr, N);
-        std::string minTsStr(getCmdOption(argv, argv + argc, "-t"));
-        minTs = parseOption(minTsStr, 64);
-        std::string maxTsStr(getCmdOption(argv, argv + argc, "-T"));
-        maxTs = parseOption(maxTsStr, minTs);
-        std::string avgStr(getCmdOption(argv, argv + argc, "-a"));
-        double avg = parseOption(avgStr, 0.3);
-        timing = (check == 0);
-        tiling_type = "RandomIrregularTiling";
-        initBlSpRandom(ij_keymap, M, N, K, minTs, maxTs, avg, A, B, Aref, Bref, check, mTiles, nTiles, kTiles,
-                       a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, avg_nb, Adensity,
-                       Bdensity, seed);
-
-        C.resize(mTiles.size(), nTiles.size());
-      } else {
-        tiling_type = "HardCodedBlockSparseMatrix";
-        initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_rowidx_to_colidx,
-                          a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, M, N, K);
-      }
-#endif  // !defined(BLOCK_SPARSE_GEMM)
-
-      gflops = compute_gflops(a_rowidx_to_colidx, b_rowidx_to_colidx, mTiles, nTiles, kTiles);
-
-      std::string nbrunStr(getCmdOption(argv, argv + argc, "-n"));
-      int nb_runs = parseOption(nbrunStr, 1);
-
-      if (timing) {
-        // Start up engine
-        execute();
-        for (int nrun = 0; nrun < nb_runs; nrun++) {
-          parsec_devices_release_memory();
-          timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
-                            a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, mTiles,
-                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R);
-          parsec_devices_reset_load(default_execution_context().impl().context());
-        }
-      } else {
-        // flow graph needs to exist on every node
-        // N.B. to validate C we need it on node 0!
-        auto keymap_write = [](const Key<2> &key) { return 0; };
-        Edge<Key<2>> ctl("control");
-        Control control(ctl);
-        Edge<Key<2>, blk_t> eA, eB, eC;
-        Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
-        Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
-        Write_SpMatrix<> c(C, eC, keymap_write, true);
-        auto &c_status = c.status();
-        assert(!has_value(c_status));
-        //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
-        SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx,
-                            b_colidx_to_rowidx, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
-        TTGUNUSED(a_times_b);
-        // calling the Dot constructor with 'true' argument disables the type
-        if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl;
-
-        // ready to run!
-        auto connected = make_graph_executable(&control);
-        assert(connected);
-        TTGUNUSED(connected);
-
-        // ready, go! need only 1 kick, so must be done by 1 thread only
-        if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
-
-        execute();
-        fence();
-
-        // validate C=A*B against the reference output
-        assert(has_value(c_status));
-        if (ttg::default_execution_context().rank() == 0) {
-          SpMatrix<> Cref = Aref * Bref;
-
-          double norm_2_square, norm_inf;
-          std::tie(norm_2_square, norm_inf) = norms<blk_t>(Cref - C);
-          std::cout << "||Cref - C||_2      = " << std::sqrt(norm_2_square) << std::endl;
-          std::cout << "||Cref - C||_\\infty = " << norm_inf << std::endl;
-          if (norm_inf > 1e-9) {
-            std::cout << "Cref:\n" << Cref << std::endl;
-            std::cout << "C:\n" << C << std::endl;
-            ttg_abort();
-          }
-        }
-
-        // validate Acopy=A against the reference output
-        //      assert(has_value(copy_status));
-        //      if (ttg::default_execution_context().rank() == 0) {
-        //        double norm_2_square, norm_inf;
-        //        std::tie(norm_2_square, norm_inf) = norms<blk_t>(Acopy - A);
-        //        std::cout << "||Acopy - A||_2      = " << std::sqrt(norm_2_square) << std::endl;
-        //        std::cout << "||Acopy - A||_\\infty = " << norm_inf << std::endl;
-        //        if (::ttg::tracing()) {
-        //          std::cout << "Acopy (" << static_cast<void *>(&Acopy) << "):\n" << Acopy << std::endl;
-        //          std::cout << "A (" << static_cast<void *>(&A) << "):\n" << A << std::endl;
-        //        }
-        //        if (norm_inf != 0) {
-        //          ttg_abort();
-        //        }
-        //      }
-      }
-    }
-
-#ifdef BTAS_IS_USABLE
-    madness::finalize();
-#endif  // BTAS_IS_USABLE
-    ttg_finalize();
-    return 0;
-  }
-}

From 7bd3804395ab46c1526309911ca9fc077d699a33 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 26 Jul 2024 15:16:52 -0400
Subject: [PATCH 23/57] SPMM: Add missing includes

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 51fb39d5f..d9f8f4ecb 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -31,8 +31,6 @@
 #include "ttg.h"
 #include "../ttg_matrix.h"
 
-using namespace ttg;
-
 #include "ttg/util/future.h"
 
 #include "ttg/util/multiindex.h"
@@ -40,6 +38,11 @@ using namespace ttg;
 
 #include "ttg/util/bug.h"
 
+#include "devicetensor.h"
+#include "devicegemm.h"
+
+using namespace ttg;
+
 #if defined(TTG_ENABLE_CUDA)
 #define HAVE_SPMM_DEVICE 1
 static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::CUDA;
@@ -572,10 +575,6 @@ class SpMM25D {
                                 ttg::typelist<const Blk, const Blk, Blk>> {
     static constexpr const bool is_device_space = (Space_ != ttg::ExecutionSpace::Host);
     using task_return_type = std::conditional_t<is_device_space, ttg::device::Task, void>;
-    /* communicate to the runtime which device we support (if any) */
-    static constexpr bool have_cuda_op = (Space_ == ttg::ExecutionSpace::CUDA);
-    static constexpr bool have_hip_op  = (Space_ == ttg::ExecutionSpace::HIP);
-    static constexpr bool have_level_zero_op = (Space_ == ttg::ExecutionSpace::L0);
 
     void release_next_k(long k) {
       assert(k_cnt_.size() > k);
@@ -597,6 +596,11 @@ class SpMM25D {
    public:
     using baseT = typename MultiplyAdd::ttT;
 
+    /* communicate to the runtime which device we support (if any) */
+    static constexpr bool have_cuda_op = (Space_ == ttg::ExecutionSpace::CUDA);
+    static constexpr bool have_hip_op  = (Space_ == ttg::ExecutionSpace::HIP);
+    static constexpr bool have_level_zero_op = (Space_ == ttg::ExecutionSpace::L0);
+
     MultiplyAdd(Edge<Key<3>, Blk> &a_ijk, Edge<Key<3>, Blk> &b_ijk, Edge<Key<3>, Blk> &c_ijk, Edge<Key<2>, Blk> &c,
                 const std::vector<std::vector<long>> &a_cols_of_row,
                 const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,

From f31fedb42f2ccc2e06a8fc9a4c60f6eab54b48f6 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 26 Jul 2024 17:00:46 -0400
Subject: [PATCH 24/57] Make sure MADNESS is initialized if we use pinning
 allocators

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index d9f8f4ecb..e6af3737e 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -1586,6 +1586,12 @@ int main(int argc, char **argv) {
     initialize(1, argv, cores);
   }
 
+#ifdef BTAS_IS_USABLE
+  // initialize MADNESS so that TA allocators can be created
+  madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
+  madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
+#endif  // BTAS_IS_USABLE
+
   std::string debugStr(getCmdOption(argv, argv + argc, "-d"));
   auto debug = (unsigned int)parseOption(debugStr, 0);
 
@@ -1878,6 +1884,10 @@ int main(int argc, char **argv) {
       }
     }
 
+#ifdef BTAS_IS_USABLE
+    madness::finalize();
+#endif  // BTAS_IS_USABLE
+
     ttg_finalize();
 
     return 0;

From 1e348273073cdc136fd21fe5fcc1b008f26646dd Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 14 Aug 2024 01:13:43 -0400
Subject: [PATCH 25/57] Fix usage of ttg::persistent()

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 19 +++++++++++--------
 ttg/ttg/parsec/ttg.h  |  2 +-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index e6af3737e..161d88cdc 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -64,13 +64,14 @@ static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::Host;
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 using scalar_t = double;
 
-#if HAVE_SPMM_DEVICE
 using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t, TiledArray::device_pinned_allocator<scalar_t>>,
-                                         btas::Handle::shared_ptr>>;
+                           btas::mohndle<btas::varray<scalar_t
+#if HAVE_SPMM_DEVICE
+                                                      , TiledArray::device_pinned_allocator<scalar_t>
 #else  // HAVE_SPMM_DEVICE
-using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
-#endif // HAVE_SPMM_DEVICE
+                                                      >,
+#endif  // HAVE_SPMM_DEVICE
+                                         btas::Handle::shared_ptr>>;
 //#include <atomic>
 //static std::atomic<uint64_t> reduce_count = 0;
 
@@ -234,7 +235,9 @@ class Read_SpMatrix : public TT<Key<3>,
         const auto i = it.row();
         // IF the receiver uses the same keymap, these sends are local
         if (rank == this->ij_keymap_(Key<2>(std::initializer_list<long>({i, j})))) {
-          ::send<0>(Key<2>(std::initializer_list<long>({i, j})), it.value(), out);
+          ::send<0>(Key<2>(std::initializer_list<long>({i, j})),
+                           ttg::persistent(it.value()),
+                           out);
         }
       }
     }
@@ -258,14 +261,14 @@ class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg:
       , write_back_(write_back)
   { }
 
-  void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
+  void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) {
     if (write_back_) {
       std::lock_guard<std::mutex> lock(mtx_);
       ttg::trace("rank =", default_execution_context().rank(),
                 "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()), "spmm.cc Write_SpMatrix wrote {",
                 key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast<void *>(&matrix_),
                 " with mutex @", static_cast<void *>(&mtx_), " for object @", static_cast<void *>(this));
-      values_.emplace_back(key[0], key[1], baseT::template get<0>(elem));
+      values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem)));
     }
   }
 
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 01cb8bcab..30ac27821 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -4577,7 +4577,7 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
       bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller);
       assert(inserted);
       copy_to_remove = copy; // we want to remove the copy from the task once done sending
-      do_release = false; // we don't release the copy since we didn't allocate it
+      do_release = true; // we don't release the copy since we didn't allocate it
       copy->add_ref(); // add a reference so that TTG does not attempt to delete this object
     }
     return vref.value_ref;

From 8f58c146a07ed5335fa09e42d93ab30731738d94 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 14 Aug 2024 14:21:24 -0400
Subject: [PATCH 26/57] Make sure persistent data has at least one reader

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/parsec/ttg.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 30ac27821..536880c2a 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -4579,6 +4579,11 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
       copy_to_remove = copy; // we want to remove the copy from the task once done sending
       do_release = true; // we don't release the copy since we didn't allocate it
       copy->add_ref(); // add a reference so that TTG does not attempt to delete this object
+      copy->add_ref(); // add another reference so that TTG never attempts to free this copy
+      if (copy->num_readers() == 0) {
+        /* add at least one reader (the current task) */
+        copy->increment_readers<false>();
+      }
     }
     return vref.value_ref;
   }

From 1948ee7c3f7af23f6cea1664ea7338e2a1105b3d Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 14 Aug 2024 14:22:13 -0400
Subject: [PATCH 27/57] Maximize data reuse by not moving C in and out of
 gemm()

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 161d88cdc..392e1dca6 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -150,7 +150,7 @@ namespace btas {
   }
 
   template <typename T_, class Range_, class Store_>
-  btas::Tensor<T_, Range_, Store_> gemm(btas::Tensor<T_, Range_, Store_> &&C, const btas::Tensor<T_, Range_, Store_> &A,
+  void gemm(btas::Tensor<T_, Range_, Store_> &C, const btas::Tensor<T_, Range_, Store_> &A,
                                         const btas::Tensor<T_, Range_, Store_> &B) {
     using array = btas::DEFAULT::index<int>;
     if (C.empty()) {  // first contribution to C = allocate it and gemm with beta=0
@@ -160,7 +160,7 @@ namespace btas {
     else {   // subsequent contributions to C = gemm with beta=1
       btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false);
     }
-    return std::move(C);
+    //return std::move(C);
   }
 }  // namespace btas
 #endif  // BTAS_IS_USABLE
@@ -699,18 +699,19 @@ class SpMM25D {
                                                 result));
       }
 #else  // HAVE_SPMM_DEVICE
+      gemm(C, A, B);
       // compute the contrib, pass the running total to the next flow, if needed
       // otherwise write to the result flow
       if (have_next_k) {
         ::send<1>(
             Key<3>({i, j, next_k}),
-            gemm(std::move(C), A, B),
+            std::move(C),
             result);
       } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
                 // belongs
         ::send<0>(
             Key<2>({i, j}),
-            gemm(std::move(C), A, B),
+            std::move(C),
             result);
       }
 #endif // HAVE_SPMM_DEVICE
@@ -1575,7 +1576,7 @@ int main(int argc, char **argv) {
     At.fill(1.0);
     Bt.fill(2.0);
     Ct.fill(3.0);
-    btas::gemm(std::move(Ct), Bt, At);
+    btas::gemm(Ct, Bt, At);
   }
 #endif  // BTAS_IS_USABLE
 

From c0cabfa062888b822f2b6a98524d031258f35116 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 14 Aug 2024 14:23:05 -0400
Subject: [PATCH 28/57] SPMM bcasts: use input_refs_tuple_type and and const

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 392e1dca6..de66734ab 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -409,7 +409,7 @@ class SpMM25D {
 
   /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that
   /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<Blk>> {
+  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<const Blk>> {
    public:
     using baseT = typename LocalBcastA::ttT;
 
@@ -446,7 +446,7 @@ class SpMM25D {
   };  // class LocalBcastA
 
   /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists
-  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<Blk>> {
+  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<const Blk>> {
    public:
     using baseT = typename BcastA::ttT;
 
@@ -462,7 +462,7 @@ class SpMM25D {
       });
     }
 
-    void op(const Key<2> &ik, typename baseT::input_values_tuple_type &&a_ik,
+    void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik,
             std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto i = ik[0]; // row
       const auto k = ik[1]; // col
@@ -492,7 +492,7 @@ class SpMM25D {
 
   /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
   /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<Blk>> {
+  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<const Blk>> {
    public:
     using baseT = typename LocalBcastB::ttT;
 
@@ -528,7 +528,7 @@ class SpMM25D {
   };  // class LocalBcastB
 
   /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
-  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<Blk>> {
+  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<const Blk>> {
    public:
     using baseT = typename BcastB::ttT;
 
@@ -544,7 +544,7 @@ class SpMM25D {
       });
     }
 
-    void op(const Key<2> &kj, typename baseT::input_values_tuple_type &&b_kj,
+    void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj,
             std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto k = kj[0]; // row
       const auto j = kj[1]; // col

From a15f724720cbfee963a9d2a7ac8e2530d2586cf5 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 14 Aug 2024 14:24:51 -0400
Subject: [PATCH 29/57] Fix conditional use of pinned allocator on devices

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index de66734ab..6c66988e5 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -68,9 +68,8 @@ using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
                            btas::mohndle<btas::varray<scalar_t
 #if HAVE_SPMM_DEVICE
                                                       , TiledArray::device_pinned_allocator<scalar_t>
-#else  // HAVE_SPMM_DEVICE
-                                                      >,
 #endif  // HAVE_SPMM_DEVICE
+                                                      >,
                                          btas::Handle::shared_ptr>>;
 //#include <atomic>
 //static std::atomic<uint64_t> reduce_count = 0;

From 8982c2543e0c9c2b5e1ceda9d06d6a26ea689b0f Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 16 Aug 2024 12:21:36 -0400
Subject: [PATCH 30/57] SPMM: Add option to disable device mapping hint

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 6c66988e5..902f4e22a 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -314,7 +314,7 @@ class SpMM25D {
           const std::vector<std::vector<long>> &b_cols_of_row,
           const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
           const std::vector<int> &nTiles, const std::vector<int> &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap,
-          long R, long parallel_bcasts = 1)
+          long R, long parallel_bcasts = 1, bool enable_device_map = true)
       : a_cols_of_row_(a_cols_of_row)
       , b_rows_of_col_(b_rows_of_col)
       , a_rows_of_col_(a_rows_of_col)
@@ -322,7 +322,8 @@ class SpMM25D {
       , k_cnt_(a_rows_of_col_.size()+1)
       , ij_keymap_(std::move(ij_keymap))
       , ijk_keymap_(std::move(ijk_keymap))
-      , parallel_bcasts_(std::max(parallel_bcasts, 1L)) {
+      , parallel_bcasts_(std::max(parallel_bcasts, 1L))
+      , enable_device_map_(enable_device_map) {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
     auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>(USE_AUTO_CONSTRAINT);
@@ -336,7 +337,7 @@ class SpMM25D {
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd<Space>>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
                                                         b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
-                                                        k_cnt_, parallel_bcasts_);
+                                                        k_cnt_, parallel_bcasts_, enable_device_map_);
 
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
@@ -609,7 +610,7 @@ class SpMM25D {
                 const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
                 std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
                 std::vector<std::atomic<std::size_t>>& k_cnt,
-                std::size_t parallel_bcasts)
+                std::size_t parallel_bcasts, bool enable_device_map)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
         , a_cols_of_row_(a_cols_of_row)
@@ -619,11 +620,13 @@ class SpMM25D {
         , parallel_bcasts_(parallel_bcasts) {
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
       if constexpr (is_device_space) {
-        auto num_devices = ttg::device::num_devices();
-        this->set_devicemap(
-          [num_devices](const Key<3> &ijk){
-            return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
-          });
+        if (enable_device_map) {
+          auto num_devices = ttg::device::num_devices();
+          this->set_devicemap(
+            [num_devices](const Key<3> &ijk){
+              return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
+            });
+        }
       }
       // for each {i,j} determine first k that contributes AND belongs to this node,
       // initialize input {i,j,first_k} flow to 0
@@ -871,6 +874,7 @@ class SpMM25D {
   Keymap2 ij_keymap_;
   Keymap3 ijk_keymap_;
   long parallel_bcasts_;
+  bool enable_device_map_;
 };
 
 class Control : public TT<void, std::tuple<Out<Key<3>>>, Control> {
@@ -1442,7 +1446,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
                               const std::vector<std::vector<long>> &b_cols_of_row,
                               const std::vector<std::vector<long>> &b_rows_of_col, std::vector<int> &mTiles,
                               std::vector<int> &nTiles, std::vector<int> &kTiles, int M, int N, int K, int minTs,
-                              int maxTs, int P, int Q, int R, int parallel_bcasts) {
+                              int maxTs, int P, int Q, int R, int parallel_bcasts, bool enable_device_map) {
   int MT = (int)A.rows();
   int NT = (int)B.cols();
   int KT = (int)A.cols();
@@ -1469,7 +1473,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
   assert(!has_value(c_status));
   //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
   SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col,
-                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts);
+                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts, enable_device_map);
   TTGUNUSED(a);
   TTGUNUSED(b);
   TTGUNUSED(a_times_b);
@@ -1646,6 +1650,9 @@ int main(int argc, char **argv) {
         parallel_bcasts = std::stol(pStr);
       }
 
+      /* whether we set a device mapping */
+      bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map");
+
       std::string PStr(getCmdOption(argv, argv + argc, "-P"));
       P = parseOption(PStr, P);
       std::string QStr(getCmdOption(argv, argv + argc, "-Q"));
@@ -1812,7 +1819,7 @@ int main(int argc, char **argv) {
 #endif // TTG_USE_PARSEC
           timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
                             a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, mTiles,
-                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts);
+                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts, enable_device_map);
 #if TTG_USE_PARSEC
           /* reset PaRSEC's load tracking */
           parsec_devices_reset_load(default_execution_context().impl().context());

From 0ddaef6f4cc7efa1555cb08774e6089e505bbcca Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 16 Aug 2024 12:26:13 -0400
Subject: [PATCH 31/57] POTRF: add option to disable explicit device mapping
 hint

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/potrf/potrf.h           | 19 ++++++++++++++-----
 examples/potrf/testing_dpotrf.cc |  5 ++++-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index d3a92cb2f..689a932e2 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -669,7 +669,8 @@ namespace potrf {
 
   template <typename MatrixT>
   auto make_potrf_ttg(MatrixT& A, ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& input,
-                      ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write) {
+                      ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write,
+                      bool enable_device_map = true) {
     using T = typename MatrixT::element_type;
     auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); };
 
@@ -705,28 +706,36 @@ namespace potrf {
     tt_potrf->set_keymap(keymap1);
     tt_potrf->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_potrf->set_devicemap(devmap1);
+    if (enable_device_map) {
+      tt_potrf->set_devicemap(devmap1);
+    }
 #endif // 0
 
     auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
     tt_trsm->set_keymap(keymap2a);
     tt_trsm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_trsm->set_devicemap(devmap2a);
+    if (enable_device_map) {
+      tt_trsm->set_devicemap(devmap2a);
+    }
 #endif // 0
 
     auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
     tt_syrk->set_keymap(keymap2b);
     tt_syrk->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_syrk->set_devicemap(devmap2b);
+    if (enable_device_map) {
+      tt_syrk->set_devicemap(devmap2b);
+    }
 #endif // 0
 
     auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
     tt_gemm->set_keymap(keymap3);
     tt_gemm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_gemm->set_devicemap(devmap3);
+    if (enable_device_map) {
+      tt_gemm->set_devicemap(devmap3);
+    }
 #endif // 0
 
     /* Priorities taken from DPLASMA */
diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc
index 2a5f6dacb..781cc417c 100644
--- a/examples/potrf/testing_dpotrf.cc
+++ b/examples/potrf/testing_dpotrf.cc
@@ -61,6 +61,9 @@ int main(int argc, char **argv)
   bool check = !cmdOptionExists(argv+1, argv+argc, "-x");
   bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w");
 
+  /* whether we set a device mapping */
+  bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map");
+
   // TODO: need to filter out our arguments to make parsec happy
   ttg::initialize(1, argv, nthreads);
 
@@ -130,7 +133,7 @@ int main(int argc, char **argv)
       init_tt->set_keymap([&]() {return world.rank();});
 
       auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint);
-      auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint);
+      auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint, enable_device_map);
       auto result_ttg = make_result_ttg(A, result, cow_hint);
 
       auto connected = make_graph_executable(init_tt.get());

From 67d8f70d7deecd49cb6cb922af5f29dc13eefc6a Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 Sep 2024 14:30:51 -0400
Subject: [PATCH 32/57] bump TA tag to pull in
 https://github.com/ValeevGroup/tiledarray/pull/472

---
 cmake/modules/ExternalDependenciesVersions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index ac3271567..53e09a2f3 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -6,7 +6,7 @@ set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
-set(TTG_TRACKED_TILEDARRAY_TAG 5204c06cf978892ee04503b476162d1c5cefd9de)
+set(TTG_TRACKED_TILEDARRAY_TAG 8af44bdff51697de114219fea88db28ca10666e9)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
 set(TTG_OLDEST_BOOST_VERSION 1.66)

From c9069fff6c953b45c0bc5fd21696d051b8907e35 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 Sep 2024 14:31:41 -0400
Subject: [PATCH 33/57] bump VG cmake toolkit to update linalgpp versions

this addresses rocmblas issues on Frontier among other things
---
 cmake/modules/ExternalDependenciesVersions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 53e09a2f3..f2d48b233 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -1,7 +1,7 @@
 # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS)
 # to be able to auto-update them
 
-set(TTG_TRACKED_VG_CMAKE_KIT_TAG 092efee765e039b02e0a9aaf013c12fc3c4e89cf)  # used to provide "real" FindOrFetchBoost
+set(TTG_TRACKED_VG_CMAKE_KIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)

From 48bd8c9e58254fec66e853ba84f154832e0fdd52 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 Sep 2024 14:32:15 -0400
Subject: [PATCH 34/57] if building example bump up the acceptable Boost
 version to please spmm/TA

---
 cmake/modules/ExternalDependenciesVersions.cmake | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index f2d48b233..dec4c56f3 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -9,4 +9,9 @@ set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
 set(TTG_TRACKED_TILEDARRAY_TAG 8af44bdff51697de114219fea88db28ca10666e9)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
-set(TTG_OLDEST_BOOST_VERSION 1.66)
+# BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)
+if (TTG_EXAMPLES)
+  set(TTG_OLDEST_BOOST_VERSION 1.81)
+else()
+  set(TTG_OLDEST_BOOST_VERSION 1.66)
+endif()

From 6be1a3a5e00eec5496091a29f503f1f6fecfca52 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 10 Sep 2024 15:01:13 -0400
Subject: [PATCH 35/57] SPMM: localize index space for device mapper

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 902f4e22a..64a850a7a 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -314,7 +314,7 @@ class SpMM25D {
           const std::vector<std::vector<long>> &b_cols_of_row,
           const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
           const std::vector<int> &nTiles, const std::vector<int> &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap,
-          long R, long parallel_bcasts = 1, bool enable_device_map = true)
+          long P, long Q, long R, long parallel_bcasts = 1, bool enable_device_map = true)
       : a_cols_of_row_(a_cols_of_row)
       , b_rows_of_col_(b_rows_of_col)
       , a_rows_of_col_(a_rows_of_col)
@@ -337,7 +337,7 @@ class SpMM25D {
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
     multiplyadd_ = std::make_unique<MultiplyAdd<Space>>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
                                                         b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
-                                                        k_cnt_, parallel_bcasts_, enable_device_map_);
+                                                        k_cnt_, P, Q, parallel_bcasts_, enable_device_map_);
 
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
@@ -610,6 +610,7 @@ class SpMM25D {
                 const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
                 std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
                 std::vector<std::atomic<std::size_t>>& k_cnt,
+                long P, long Q,
                 std::size_t parallel_bcasts, bool enable_device_map)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
@@ -621,10 +622,14 @@ class SpMM25D {
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
       if constexpr (is_device_space) {
         if (enable_device_map) {
-          auto num_devices = ttg::device::num_devices();
+          int num_devices = ttg::device::num_devices();
+          int gp = std::sqrt(num_devices);
+          int gq = num_devices / gp;
           this->set_devicemap(
-            [num_devices](const Key<3> &ijk){
-              return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
+            [P,Q,gp,gq,num_devices](const Key<3> &ijk){
+              // TODO: include the number of rows/columns in this formula
+              auto device = (((ijk[0]/P)%gp)*gq) + (ijk[1]/Q)%gq;
+              return device;
             });
         }
       }
@@ -1473,7 +1478,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
   assert(!has_value(c_status));
   //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
   SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col,
-                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts, enable_device_map);
+                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R, parallel_bcasts, enable_device_map);
   TTGUNUSED(a);
   TTGUNUSED(b);
   TTGUNUSED(a_times_b);
@@ -1844,7 +1849,7 @@ int main(int argc, char **argv) {
         assert(!has_value(c_status));
         //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
         SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row,
-                            b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
+                            b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R);
         TTGUNUSED(a_times_b);
         // calling the Dot constructor with 'true' argument disables the type
         if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl;

From 7362231715c83f17472cdbc62172f5e4f70395c3 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 10 Sep 2024 15:01:41 -0400
Subject: [PATCH 36/57] SPMM: remove warmup since it's too expensive and
 useless on the device

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 64a850a7a..26c6f4aa9 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -1574,20 +1574,6 @@ int main(int argc, char **argv) {
   bool timing;
   double gflops;
 
-  // warm up silicon by calling gemm a few times
-#ifdef BTAS_IS_USABLE
-  for (int i = 0; i < 20; i++) {
-    using baseT = typename btas::Tensor<double>;
-    btas::Tensor<double, btas::Range, std::vector<double>> At(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Bt(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Ct(30, 30);
-    At.fill(1.0);
-    Bt.fill(2.0);
-    Ct.fill(3.0);
-    btas::gemm(Ct, Bt, At);
-  }
-#endif  // BTAS_IS_USABLE
-
   int cores = -1;
   std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c"));
   cores = parseOption(nbCoreStr, cores);

From 84c9c13f930039525a16f2a8c83f61285deb6215 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 30 Sep 2024 21:29:49 -0400
Subject: [PATCH 37/57] bump BTAS, MADNESS, and TA tag; most important pull is
 https://github.com/ValeevGroup/tiledarray/pull/472

---
 cmake/modules/ExternalDependenciesVersions.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index dec4c56f3..6767e0693 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -3,10 +3,10 @@
 
 set(TTG_TRACKED_VG_CMAKE_KIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
-set(TTG_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75)
+set(TTG_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
-set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
-set(TTG_TRACKED_TILEDARRAY_TAG 8af44bdff51697de114219fea88db28ca10666e9)
+set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
+set(TTG_TRACKED_TILEDARRAY_TAG ec51edbcafc363c418f5ce235423463d2beb2785)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
 # BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)

From 318ea20f000d5e71cda64809d95622137bfee351 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 1 Oct 2024 07:37:01 -0400
Subject: [PATCH 38/57] TiledArray/device/allocators.h ->
 TiledArray/external/device.h

---
 examples/spmm/devicetensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h
index 4d0a7f1cf..d5a0f6d9f 100644
--- a/examples/spmm/devicetensor.h
+++ b/examples/spmm/devicetensor.h
@@ -10,7 +10,7 @@
 #include <btas/btas.h>
 #include <btas/optimize/contract.h>
 #include <btas/util/mohndle.h>
-#include <TiledArray/device/allocators.h>
+#include <TiledArray/external/device.h>
 #include "../devblas_helper.h"
 #include <madness/world/parsec.h>  // need to initialize MADNESS purely for the purposes of TA allocators
 #else

From 8f94781f1cd455af8a93715f20f0a133cb9bfaba Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 24 Oct 2024 11:55:47 -0400
Subject: [PATCH 39/57] Limit execution of constraints test to PaRSEC backend

Constraints are a no-op in the madness backend so don't try to
execute with them.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 tests/unit/constraints.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc
index 0255b51e3..c358aa0e5 100644
--- a/tests/unit/constraints.cc
+++ b/tests/unit/constraints.cc
@@ -41,11 +41,18 @@ TEST_CASE("constraints", "") {
     }, ttg::edges(), ttg::edges(e));
     bcast->set_keymap([&](){ return world.rank(); });
 
+    /**
+     * Constraints are currently only implemented in the PaRSEC backend.
+     * Codes using constraints will still compile but they will not
+     * affect the execution order in other backends.
+     */
+#ifdef TTG_USE_PARSEC
     make_graph_executable(bcast);
     ttg::execute(ttg::default_execution_context());
     bcast->invoke();
 
     ttg::ttg_fence(ttg::default_execution_context());
+#endif // TTG_USE_PARSEC
 
   }
 }  // TEST_CASE("streams")

From 45af6ec4c65f9c7a137de8ed648992aaeeb1a256 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 25 Oct 2024 09:48:51 -0400
Subject: [PATCH 40/57] bump TA tag + associated MAD tag

most importantly pulls in https://github.com/ValeevGroup/tiledarray/pull/483 which solves linking errors due to missing fmt instantiations
---
 cmake/modules/ExternalDependenciesVersions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 6767e0693..5bbf8954d 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -3,10 +3,10 @@
 
 set(TTG_TRACKED_VG_CMAKE_KIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
-set(TTG_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91)
+set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
-set(TTG_TRACKED_TILEDARRAY_TAG ec51edbcafc363c418f5ce235423463d2beb2785)
+set(TTG_TRACKED_TILEDARRAY_TAG 3b4baf7afed4ac7357290f3d90f5d4e3edbc09e0)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
 # BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)

From 319cd6b57a8b80b1974da6be9a5022e068bedf52 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 17:49:29 -0400
Subject: [PATCH 41/57] Fix ttg::persistent

We should consider this a hint and fall back to non-persistent
values if the type is not derived from TTValue or we are running
with madness.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/madness/ttvalue.h | 5 +++++
 ttg/ttg/parsec/ttvalue.h  | 8 +++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/ttg/ttg/madness/ttvalue.h b/ttg/ttg/madness/ttvalue.h
index ad53ee5f8..a171c3c71 100644
--- a/ttg/ttg/madness/ttvalue.h
+++ b/ttg/ttg/madness/ttvalue.h
@@ -9,6 +9,11 @@ namespace ttg_madness {
     /* empty */
   };
 
+  template<typename ValueT>
+  inline auto persistent(ValueT&& value) {
+    return std::forward<ValueT>(value);
+  }
+
 } // namespace ttg_madness
 
   #endif // TTG_MADNESS_TTVALUE_H
diff --git a/ttg/ttg/parsec/ttvalue.h b/ttg/ttg/parsec/ttvalue.h
index b93f1687f..b5b6aa982 100644
--- a/ttg/ttg/parsec/ttvalue.h
+++ b/ttg/ttg/parsec/ttvalue.h
@@ -91,9 +91,11 @@ namespace ttg_parsec {
 
   template<typename ValueT>
   inline auto persistent(ValueT&& value) {
-    static_assert(std::is_base_of_v<TTValue<std::decay_t<ValueT>>, std::decay_t<ValueT>>,
-                  "ttg::persistent can only be used on types derived from ttg::TTValue");
-    return detail::persistent_value_ref<ValueT>{value};
+    if constexpr (std::is_base_of_v<TTValue<std::decay_t<ValueT>>, std::decay_t<ValueT>>) {
+      return detail::persistent_value_ref<ValueT>{value};
+    } else {
+      return std::forward<ValueT>(value);
+    }
   }
 
 } // namespace ttg_parsec

From 3d4632c8746aa7a23bd1ad73ade2dc7975bb2c11 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 17:50:07 -0400
Subject: [PATCH 42/57] SPMM: Restrict empty() calls to block-sparse execution

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 26c6f4aa9..bc376e185 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -680,9 +680,11 @@ class SpMM25D {
       const blk_t& B = baseT::template get<1>(_ijk);
       blk_t& C = baseT::template get<2>(_ijk);
 
+#if defined(BLOCK_SPARSE_GEMM)
       if (C.empty()) {
         C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0);
       }
+#endif // BLOCK_SPARSE_GEMM
 
 #ifdef HAVE_SPMM_DEVICE
       /* pull all buffers onto the device */

From ce80a3871fd560eb994a73c7206b5d456bb6be89 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 17:55:22 -0400
Subject: [PATCH 43/57] Fix change in TiledArray include path

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/matrixtile.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/matrixtile.h b/examples/matrixtile.h
index a759b1c3c..ee623ee4a 100644
--- a/examples/matrixtile.h
+++ b/examples/matrixtile.h
@@ -10,7 +10,7 @@
 #include <ttg/serialization/splitmd_data_descriptor.h>
 
 
-#include <TiledArray/device/allocators.h>
+#include <TiledArray/external/device.h>
 #if defined(TILEDARRAY_HAS_DEVICE)
 #define ALLOCATOR TiledArray::device_pinned_allocator<T>
 

From d405b7edcb88043349c31b27f4e3b7b640cb7c70 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 18:04:28 -0400
Subject: [PATCH 44/57] Only use TiledArray allocator if available

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/matrixtile.h  | 8 +++++---
 examples/potrf/potrf.h | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/matrixtile.h b/examples/matrixtile.h
index ee623ee4a..63c612a01 100644
--- a/examples/matrixtile.h
+++ b/examples/matrixtile.h
@@ -12,7 +12,8 @@
 
 #include <TiledArray/external/device.h>
 #if defined(TILEDARRAY_HAS_DEVICE)
-#define ALLOCATOR TiledArray::device_pinned_allocator<T>
+template<typename T>
+using Allocator = TiledArray::device_pinned_allocator<T>;
 
 inline void allocator_init(int argc, char **argv) {
   // initialize MADNESS so that TA allocators can be created
@@ -26,7 +27,8 @@ inline void allocator_fini() {
   madness::finalize();
 }
 #else  // TILEDARRAY_HAS_DEVICE
-#define ALLOCATOR std::allocator<T>
+template<typename T>
+using Allocator = std::allocator<T>;
 
 inline void allocator_init(int argc, char **argv) { }
 
@@ -34,7 +36,7 @@ inline void allocator_fini() { }
 
 #endif // TILEDARRAY_HAS_DEVICE
 
-template <typename T, class Allocator = ALLOCATOR>
+template <typename T, class Allocator = Allocator<T>>
 class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
  public:
   using metadata_t = typename std::tuple<std::size_t, std::size_t, std::size_t>;
diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index 689a932e2..4e5e3c194 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -95,7 +95,7 @@ namespace potrf {
                   ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output_result) {
     using T = typename MatrixT::element_type;
 #if defined(ENABLE_DEVICE_KERNEL)
-    auto iallocator = std::make_shared<TiledArray::device_pinned_allocator<int>>();
+    auto iallocator = std::make_shared<Allocator<int>>();
     //std::cout << "Creating CUDA POTRF task " << std::endl;
     auto f_dev = [=, iallocator = std::move(iallocator)]
                     (const Key1& key, MatrixTile<T>&& tile_kk,

From c3df91c91ec12af8699092d3c4253ee07070ca3a Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 21:10:23 -0400
Subject: [PATCH 45/57] Fix Allocator type for MatrixTile

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/matrixtile.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/matrixtile.h b/examples/matrixtile.h
index 63c612a01..8e519db86 100644
--- a/examples/matrixtile.h
+++ b/examples/matrixtile.h
@@ -36,13 +36,13 @@ inline void allocator_fini() { }
 
 #endif // TILEDARRAY_HAS_DEVICE
 
-template <typename T, class Allocator = Allocator<T>>
-class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
+template <typename T, class AllocatorT = Allocator<T>>
+class MatrixTile : public ttg::TTValue<MatrixTile<T, AllocatorT>> {
  public:
   using metadata_t = typename std::tuple<std::size_t, std::size_t, std::size_t>;
 
-  using buffer_t  = typename ttg::Buffer<T, Allocator>;
-  using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;
+  using buffer_t  = typename ttg::Buffer<T, AllocatorT>;
+  using ttvalue_type = ttg::TTValue<MatrixTile<T, AllocatorT>>;
 
  private:
   buffer_t _buffer;
@@ -89,15 +89,15 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
   , _lda(lda)
   { }
 
-  MatrixTile(MatrixTile<T, Allocator>&& other) = default;
+  MatrixTile(MatrixTile<T, AllocatorT>&& other) = default;
 
-  MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;
+  MatrixTile& operator=(MatrixTile<T, AllocatorT>&& other) = default;
 
   /* Deep copy ctor und op are not needed for PO since tiles will never be read
    * and written concurrently. Hence shallow copies are enough, will all
    * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
    * can handle data sharing without excessive copying */
-  MatrixTile(const MatrixTile<T, Allocator>& other)
+  MatrixTile(const MatrixTile<T, AllocatorT>& other)
   : ttvalue_type()
   , _buffer(other._lda*other._cols)
   , _rows(other._rows)
@@ -110,7 +110,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
     std::copy_n(other.data(), _lda * _cols, this->data());
   }
 
-  MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
+  MatrixTile& operator=(const MatrixTile<T, AllocatorT>& other) {
     this->_rows = other._rows;
     this->_cols = other._cols;
     this->_lda = other._lda;
@@ -168,7 +168,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
   }
 #endif // DEBUG_TILES_VALUES
 
-  friend std::ostream& operator<<(std::ostream& o, MatrixTile<T> const& tt) {
+  friend std::ostream& operator<<(std::ostream& o, MatrixTile<T, AllocatorT> const& tt) {
     auto ptr = tt.data();
     o << std::endl << " ";
     o << "MatrixTile<" << typeid(T).name() << ">{ rows=" << tt.rows() << " cols=" << tt.cols() << " ld=" << tt.lda();

From 5d06eec5ce8ad8444f02faa9806f1987635c101a Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 28 Oct 2024 21:11:23 -0400
Subject: [PATCH 46/57] SPMM: Only use DeviceTensor for device execution

We cannot fully serialize the DeviceTensor and we don't need
the DeviceTensor in host execution.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/spmm/spmm.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index bc376e185..8118d9e4c 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -64,15 +64,15 @@ static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::Host;
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 using scalar_t = double;
 
-using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t
 #if HAVE_SPMM_DEVICE
-                                                      , TiledArray::device_pinned_allocator<scalar_t>
-#endif  // HAVE_SPMM_DEVICE
-                                                      >,
+using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
+                           btas::mohndle<btas::varray<scalar_t,
+                                                      TiledArray::device_pinned_allocator<scalar_t>>,
                                          btas::Handle::shared_ptr>>;
-//#include <atomic>
-//static std::atomic<uint64_t> reduce_count = 0;
+#else   // HAVE_SPMM_DEVICE
+using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
+#endif  // HAVE_SPMM_DEVICE
+
 
 #if defined(TTG_USE_PARSEC)
 namespace ttg {
@@ -1586,7 +1586,7 @@ int main(int argc, char **argv) {
     initialize(1, argv, cores);
   }
 
-#ifdef BTAS_IS_USABLE
+#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED)
   // initialize MADNESS so that TA allocators can be created
   madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
   madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);

From 635cdbceef5f0978542584c979bc83744f196834 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 29 Oct 2024 09:44:28 -0400
Subject: [PATCH 47/57] Bump TA tag

Fixes issues in TA device header.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 cmake/modules/ExternalDependenciesVersions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 5bbf8954d..784f9edac 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -6,7 +6,7 @@ set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
-set(TTG_TRACKED_TILEDARRAY_TAG 3b4baf7afed4ac7357290f3d90f5d4e3edbc09e0)
+set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
 # BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)

From 8fd0f17fbc1f00d77c14b57fea88dbb9c74de0a2 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 29 Oct 2024 16:36:11 -0400
Subject: [PATCH 48/57] Protect user-level madness init/finalize against the
 madness backend

We should only call that when we run on the PaRSEC backend.
Otherwise we'll pull the rug from under the madness backend.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 examples/matrixtile.h | 4 +++-
 examples/spmm/spmm.cc | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/matrixtile.h b/examples/matrixtile.h
index 8e519db86..203c58bf2 100644
--- a/examples/matrixtile.h
+++ b/examples/matrixtile.h
@@ -19,12 +19,14 @@ inline void allocator_init(int argc, char **argv) {
   // initialize MADNESS so that TA allocators can be created
 #if defined(TTG_PARSEC_IMPORTED)
   madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
-#endif // TTG_PARSEC_IMPORTED
   madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
+#endif // TTG_PARSEC_IMPORTED
 }
 
 inline void allocator_fini() {
+#if defined(TTG_PARSEC_IMPORTED)
   madness::finalize();
+#endif // TTG_PARSEC_IMPORTED
 }
 #else  // TILEDARRAY_HAS_DEVICE
 template<typename T>
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 8118d9e4c..d2b7021af 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -1887,7 +1887,7 @@ int main(int argc, char **argv) {
       }
     }
 
-#ifdef BTAS_IS_USABLE
+#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED)
     madness::finalize();
 #endif  // BTAS_IS_USABLE
 

From 4f0f89a5cbf5255fa271b9db2a0d2fb8b168580f Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 29 Oct 2024 21:02:10 -0400
Subject: [PATCH 49/57] Fix constraints test

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 tests/unit/constraints.cc | 63 ++++++++++++++++++++++++++++++++++++---
 ttg/ttg/constraint.h      | 31 +++++++++++++++----
 2 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc
index c358aa0e5..75838f7c7 100644
--- a/tests/unit/constraints.cc
+++ b/tests/unit/constraints.cc
@@ -11,18 +11,73 @@ using Key = ttg::MultiIndex<2>;
 
 TEST_CASE("constraints", "") {
 
-  SECTION("sequenced") {
+  SECTION("manual") {
     ttg::Edge<Key, int> e;
     auto world = ttg::default_execution_context();
-    std::atomic<int> last_ord = world.rank();
+    std::atomic<int> last_ord = 0;
+    std::atomic<int> cnt = 10;
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>([](const Key& k){ return k[1]; });
     auto tt = ttg::make_tt([&](const Key& key, const int& value){
       int check_ord = last_ord;
+      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << std::endl;
+      CHECK(((key[1] == check_ord) || (key[1] == check_ord+1)));
+      last_ord = key[1];
+      if (--cnt == 0) {
+        cnt = 10;
+        constraint->release(check_ord+1);
+      }
+    }, ttg::edges(e), ttg::edges());
+    // every process executes all tasks
+    tt->set_keymap([&](const Key&){ return world.rank(); });
+    tt->add_constraint(constraint);
+    constraint->stop();
+    constraint->release(1);
+
+    auto bcast = ttg::make_tt([&](){
+      std::vector<Key> keys;
+      // loop iteration order intentionally reversed
+      for (int i = 10; i > 0; --i) {
+        for (int j = 10; j > 0; --j) {
+          keys.push_back(Key{i, j});
+        }
+      }
+      ttg::broadcast<0>(std::move(keys), 0);
+
+      // explicit start here to ensure absolute order
+      constraint->start();
+    }, ttg::edges(), ttg::edges(e));
+    bcast->set_keymap([&](){ return world.rank(); });
+
+    /**
+     * Constraints are currently only implemented in the PaRSEC backend.
+     * Codes using constraints will still compile but they will not
+     * affect the execution order in other backends.
+     */
+#ifdef TTG_USE_PARSEC
+    make_graph_executable(bcast);
+    ttg::execute(ttg::default_execution_context());
+    bcast->invoke();
+
+    ttg::ttg_fence(ttg::default_execution_context());
+#endif // TTG_USE_PARSEC
+
+  }
+
+
+  SECTION("automatic") {
+    ttg::Edge<Key, int> e;
+    auto world = ttg::default_execution_context();
+    std::atomic<int> last_ord = 0;
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>(
+                                                    [](const Key& k){ return k[1]; }, true);
+    auto tt = ttg::make_tt([&](const Key& key, const int& value){
+      int check_ord = last_ord;
+      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << std::endl;
       CHECK(((key[1] == check_ord) || (key[1] == check_ord+1)));
       last_ord = key[1];
     }, ttg::edges(e), ttg::edges());
     // every process executes all tasks
     tt->set_keymap([&](const Key&){ return world.rank(); });
-    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>([](const Key& k){ return k[1]; });
     tt->add_constraint(constraint);
     constraint->stop();
 
@@ -30,7 +85,7 @@ TEST_CASE("constraints", "") {
       std::vector<Key> keys;
       // loop iteration order intentionally reversed
       for (int i = 10; i > 0; --i) {
-        for (int j = 10; j > world.rank(); --j) {
+        for (int j = 10; j > 0; --j) {
           keys.push_back(Key{i, j});
         }
       }
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index 54f070f73..dc1926204 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -172,7 +172,8 @@ namespace ttg {
     // used in the non-auto case
     void release_next(ordinal_type ord, bool force_check = false) {
       if (this->m_stopped) {
-        // don't release tasks if we're stopped
+        // don't release tasks if we're stopped but remember that this was released
+        this->m_current = ord;
         return;
       }
       if (!force_check && m_order(ord, this->m_current)) {
@@ -212,8 +213,9 @@ namespace ttg {
     , m_auto_release(auto_release)
     { }
 
-    template<typename Mapper_, typename = std::enable_if_t<std::is_invocable_v<Mapper_, Key>, Mapper_>>
-    SequencedKeysConstraint(Mapper_&& map, bool auto_release)
+    template<typename Mapper_>
+    requires(std::is_invocable_v<Mapper_, Key>)
+    SequencedKeysConstraint(Mapper_&& map, bool auto_release = false)
     : base_t()
     , m_map(std::forward<Mapper_>(map))
     , m_auto_release(auto_release)
@@ -298,7 +300,17 @@ namespace ttg {
     void start() {
       if (m_stopped) {
         m_stopped = false;
-        release_next(m_current, true); // force the check for a next release even if the current ordinal hasn't changed
+        if (m_auto_release) {
+          release_next();
+        } else {
+          auto ord = m_current;
+          // release the first set of available keys if none were set explicitly
+          if (ord == std::numeric_limits<ordinal_type>::min() &&
+              this->m_sequence.begin() != this->m_sequence.end()) {
+            ord = this->m_sequence.begin()->first;
+          }
+          release_next(ord, true); // force the check for a next release even if the current ordinal hasn't changed
+        }
       }
     }
 
@@ -339,7 +351,16 @@ namespace ttg {
     -> SequencedKeysConstraint<
           std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
           std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
-          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::less_equal<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
+          >;
+
+  template<typename Mapper, typename = std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>>>
+  SequencedKeysConstraint(Mapper&&, bool)
+    -> SequencedKeysConstraint<
+          std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
+          std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
+          std::less_equal<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
           std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
           >;
 

From 595f6509208f14dde3a697a21591a36cd2335d32 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 13 Nov 2024 22:44:11 -0500
Subject: [PATCH 50/57] amended 1948ee7c3f7af23f6cea1664ea7338e2a1105b3d to
 make scalar spmm work

---
 examples/spmm/spmm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index d2b7021af..f29ae9088 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -163,7 +163,7 @@ namespace btas {
   }
 }  // namespace btas
 #endif  // BTAS_IS_USABLE
-double gemm(double C, double A, double B) { return C + A * B; }
+inline void gemm(double& C, const double A, const double B) { C += A * B; }
 
 // template <typename _Scalar, int _Options, typename _StorageIndex>
 // struct colmajor_layout;

From a64c8d34d536492a62d84f65c9c7a6a343612d12 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 14 Nov 2024 09:25:49 -0500
Subject: [PATCH 51/57] Fix manual release constraint test

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 tests/unit/constraints.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc
index 75838f7c7..70003b945 100644
--- a/tests/unit/constraints.cc
+++ b/tests/unit/constraints.cc
@@ -14,17 +14,17 @@ TEST_CASE("constraints", "") {
   SECTION("manual") {
     ttg::Edge<Key, int> e;
     auto world = ttg::default_execution_context();
-    std::atomic<int> last_ord = 0;
+    std::atomic<int> check_ord = 1;
     std::atomic<int> cnt = 10;
     auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>([](const Key& k){ return k[1]; });
     auto tt = ttg::make_tt([&](const Key& key, const int& value){
-      int check_ord = last_ord;
-      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << std::endl;
-      CHECK(((key[1] == check_ord) || (key[1] == check_ord+1)));
-      last_ord = key[1];
+      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << " cnt " << cnt << std::endl;
+      CHECK((key[1] == check_ord));
       if (--cnt == 0) {
         cnt = 10;
-        constraint->release(check_ord+1);
+        check_ord++;
+        std::cout << "key " << key[0] << " releasing next ord " << check_ord << std::endl;
+        constraint->release(check_ord);
       }
     }, ttg::edges(e), ttg::edges());
     // every process executes all tasks

From 393b4ee9cc33a36c926531967fff35d829ab3e7c Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 14 Nov 2024 13:27:52 -0500
Subject: [PATCH 52/57] Sequence: Comparator must provide a strict ordering

We have to provide our own check for equal based on the provided comparator.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/constraint.h | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index dc1926204..1c353702f 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -65,7 +65,7 @@ namespace ttg {
 
   template<typename Key,
            typename Ordinal = std::size_t,
-           typename Compare = std::less_equal<Ordinal>,
+           typename Compare = std::less<Ordinal>,
            typename Mapper = ttg::Void>
   struct SequencedKeysConstraint : public ConstraintBase<Key> {
 
@@ -95,9 +95,17 @@ namespace ttg {
       }
     };
 
+    bool comp_equal(const Ordinal& a, const Ordinal& b) const {
+      return (!m_order(a, b) && !m_order(b, a));
+    }
+
+    bool eligible(const Ordinal& ord) const {
+      return m_order(ord, m_current) || comp_equal(ord, m_current);
+    }
+
     bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
       if (!m_stopped) {
-        if (m_order(ord, m_current)) {
+        if (eligible(ord)) {
           // key should be executed
           if (m_auto_release) { // only needed for auto-release
             m_active.fetch_add(1, std::memory_order_relaxed);
@@ -117,7 +125,7 @@ namespace ttg {
       }
       // key should be deferred
       auto g = this->lock_guard();
-      if (!m_stopped && m_order(ord, m_current)) {
+      if (!m_stopped && eligible(ord)) {
         // someone released this ordinal while we took the lock
         return true;
       }
@@ -176,7 +184,7 @@ namespace ttg {
         this->m_current = ord;
         return;
       }
-      if (!force_check && m_order(ord, this->m_current)) {
+      if (!force_check && eligible(ord)) {
         return; // already at the provided ordinal, nothing to be done
       }
       // trigger the next sequence(s) (m_sequence is ordered by ordinal)
@@ -185,14 +193,12 @@ namespace ttg {
         auto g = this->lock_guard();
         // set current ordinal
         this->m_current = ord;
-        {
-          for (auto it = this->m_sequence.begin(); it != this->m_sequence.end();) {
-            if (!this->m_order(it->first, this->m_current)) break;
-            // extract the next sequence
-            this->m_current = it->first;
-            seqs.push_back(std::move(it->second));
-            it = this->m_sequence.erase(it);
-          }
+        for (auto it = this->m_sequence.begin(); it != this->m_sequence.end();) {
+          if (!eligible(it->first)) break;
+          // extract the next sequence
+          this->m_current = it->first;
+          seqs.push_back(std::move(it->second));
+          it = this->m_sequence.erase(it);
         }
       }
       for (auto& elem : seqs) {
@@ -351,7 +357,7 @@ namespace ttg {
     -> SequencedKeysConstraint<
           std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
           std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
-          std::less_equal<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
           std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
           >;
 
@@ -360,7 +366,7 @@ namespace ttg {
     -> SequencedKeysConstraint<
           std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
           std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
-          std::less_equal<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
           std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
           >;
 

From 6a6fa70d9911ee4543c2f4829463fc36fba392bf Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 14 Nov 2024 13:49:06 -0500
Subject: [PATCH 53/57] Static assert that comparator provides a strict
 ordering

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ttg/ttg/constraint.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
index 1c353702f..ab3671601 100644
--- a/ttg/ttg/constraint.h
+++ b/ttg/ttg/constraint.h
@@ -75,6 +75,8 @@ namespace ttg {
     using compare_t = Compare;
     using base_t = ConstraintBase<Key>;
 
+    static_assert((!Compare{}(Ordinal{}, Ordinal{})), "Comparator must provide strict ordering.");
+
   protected:
     struct sequence_elem_t {
       std::map<ttg::TTBase*, std::vector<key_type>> m_keys;

From df2e124d29d3bafb68fa45d267208def157a5ace Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 14 Nov 2024 15:48:11 -0500
Subject: [PATCH 54/57] [ci] debug install-tree tests

---
 .github/workflows/cmake.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 8a1d151e6..9a0c87c5f 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -113,11 +113,11 @@ jobs:
       working-directory: ${{github.workspace}}/build
       shell: bash
       run: |
-        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_helloworld/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_helloworld/CMakeFiles/CMakeError.log)
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && test_install_devsamp_helloworld/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_devsamp_helloworld
         $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-parsec
         $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-mad
-        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log)
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_devsamp_fibonacci
         $MPIEXEC -n 2 test_install_devsamp_fibonacci/fibonacci-parsec
         cmake -E make_directory test_install_userexamples
@@ -130,7 +130,7 @@ jobs:
         add_ttg_executable(iterative $GITHUB_WORKSPACE/doc/dox/user/examples/iterative.cc NOT_EXCLUDE_FROM_ALL)
         add_ttg_executable(distributed $GITHUB_WORKSPACE/doc/dox/user/examples/distributed.cc NOT_EXCLUDE_FROM_ALL)
         EOF
-        cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_userexamples/CMakeFiles/CMakeOutput.log && cat test_install_userexamples/CMakeFiles/CMakeError.log)
+        cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_userexamples/build
 
     - name: Build+Deploy Dox

From 448edcf087b63ebba90cd90db652ddc143f0170e Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 14 Nov 2024 16:55:22 -0500
Subject: [PATCH 55/57] [cmake] bump VRG cmake kit version to avoid using flaky
 FindBoost module

---
 cmake/modules/ExternalDependenciesVersions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 784f9edac..49f66fe0a 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -1,7 +1,7 @@
 # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS)
 # to be able to auto-update them
 
-set(TTG_TRACKED_VG_CMAKE_KIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
+set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)

From 62bc42b2871e779058900675ef00bf36a687b307 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 14 Nov 2024 22:27:56 -0500
Subject: [PATCH 56/57] [cmake] default `TTG_FETCH_BOOST` to `ON`

---
 CMakeLists.txt |  2 +-
 INSTALL.md     | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5406877c8..63b0d2bfa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ option(TTG_EXAMPLES "Whether to build examples" OFF)
 option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)
 
 option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON)
-option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
+option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" ON)
 option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
 option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF)
 # See https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949
diff --git a/INSTALL.md b/INSTALL.md
index b8974f927..b9497ac0a 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -63,14 +63,14 @@ TTG includes several examples that may require additional prerequisites. These a
 
 ## useful cmake cache variables:
 
-| Variable                             |Default             | Description                                                                                                                                                                                                                                  |
-|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `TTG_ENABLE_CUDA`                    | `OFF`              | whether to enable CUDA device support                                                                                                                                                                                                        |
-| `TTG_ENABLE_HIP`                     | `OFF`              | whether to enable HIP/ROCm device support                                                                                                                                                                                                    |
-| `TTG_ENABLE_LEVEL_ZERO`              | `OFF`              | whether to enable Intel oneAPI Level Zero device support                                                                                                                                                                                     |
-| `BUILD_TESTING`                      | `ON`               | whether target `check-ttg` and its relatives will actually build and run unit tests                                                                                                                                                          |
-| `TTG_EXAMPLES`                       | `OFF`              | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source                                        |
-| `TTG_ENABLE_TRACE`                   | `OFF`              | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op                                                                               |
-| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF`       | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
-| `TTG_FETCH_BOOST`                    | `OFF`              | whether to download and build Boost automatically, if missing                                                                                                                                                                                |
-| `TTG_IGNORE_BUNDLED_EXTERNALS`       | `OFF`              | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits)                                                                                                                                              |
+| Variable                             | Default | Description                                                                                                                                                                                                                                  |
+|--------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `TTG_ENABLE_CUDA`                    | `OFF`   | whether to enable CUDA device support                                                                                                                                                                                                        |
+| `TTG_ENABLE_HIP`                     | `OFF`   | whether to enable HIP/ROCm device support                                                                                                                                                                                                    |
+| `TTG_ENABLE_LEVEL_ZERO`              | `OFF`   | whether to enable Intel oneAPI Level Zero device support                                                                                                                                                                                     |
+| `BUILD_TESTING`                      | `ON`    | whether target `check-ttg` and its relatives will actually build and run unit tests                                                                                                                                                          |
+| `TTG_EXAMPLES`                       | `OFF`   | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source                                        |
+| `TTG_ENABLE_TRACE`                   | `OFF`   | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op                                                                               |
+| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF`   | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
+| `TTG_FETCH_BOOST`                    | `ON`    | whether to download and build Boost automatically, if missing                                                                                                                                                                                |
+| `TTG_IGNORE_BUNDLED_EXTERNALS`       | `OFF`   | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits)                                                                                                                                              |

From 7048ab5f74a89cc9c40ddb11d7c7d9420fb42754 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 14 Nov 2024 23:13:20 -0500
Subject: [PATCH 57/57] [ci skip] if dump CMakeConfigureLog.yaml instead of
 legacy CMake{Output,Error}.log

---
 .github/workflows/cmake.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 9a0c87c5f..506474240 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -88,7 +88,7 @@ jobs:
       # and build directories, but this is only available with CMake 3.13 and higher.  
       # The CMake binaries on the Github Actions machines are (as of this writing) 3.12
       run: |
-        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log)
+        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml)
 
     - name: Build
       working-directory: ${{github.workspace}}/build