From 1ffaf6aa00b96a39bd74456f1effda222ae1ed20 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 2 Feb 2024 09:36:24 -0500
Subject: [PATCH] implemented Read{A,B} that read A and B more intelligently

---
 examples/spmm/spmm.cc | 147 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 140 insertions(+), 7 deletions(-)
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 94edf672b..f3ae517b6 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -163,7 +163,13 @@ inline int ijk2rank(int i, int j, int k, int P, int Q, int R) {
   return rank;
 }
 
-// flow data from an existing SpMatrix on rank 0
+/// Pushes out data from an existing SpMatrix whose data is distributed on a 2-d grid.
+
+/// Data is pushed in the order of the appearance of the data in the container, without any tailoring to
+/// the order in which the data is consumed; thus this is likely to generate tasks in a suboptimal order.
+/// \note Reading should in general occur in the same order as the data will be consumed.
+/// If all consuming tasks can execute concurrently this should be OK, albeit the runtime will likely throttle
+/// sends, thus task dependencies further "down" the DAG may result in some reading orders being better than others
 template <typename Blk = blk_t, typename Keymap = std::function<int(const Key<2> &)>>
 class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMatrix<Blk, Keymap>, ttg::typelist<void>> {
  public:
@@ -174,18 +180,142 @@ class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMat
               ij_keymap)
       , matrix_(matrix) {}
 
-  void op(const Key<2> &, std::tuple<Out<Key<2>, Blk>> &out) {
+  // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
+  // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
+  // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local
+  void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
+    auto rank = ttg::default_execution_context().rank();
+    // this code assumes col-major layout
+    static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");
+    for (int j = 0; j < matrix_.outerSize(); ++j) {
+      for (typename SpMatrix<Blk>::InnerIterator it(matrix_, j); it; ++it) {
+        assert(j == it.col());
+        const auto i = it.row();
+        // IF the receiver uses the same keymap, these sends are local
+        if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({i, j})))) {
+          ::send<0>(Key<2>(std::initializer_list<long>({i, j})), it.value(), out);
+        }
+      }
+    }
+  }
+
+ private:
+  const SpMatrix<Blk> &matrix_;
+};
+
+enum class ReadSchedule {
+  SingleK,   // appropriate for 2D (see ReadA)
+  MultipleK  // appropriate for 3D (see ReadA)
+};
+// change this to control the schedule of sends (2-D vs 3-D)
+constexpr auto DefaultReadSchedule = ReadSchedule::SingleK;
+
+/// flow data from A distributed on a 2-d grid of processes in the order they are likely to be consumed
+
+/// The order of sends needs to tailored as follows:
+/// - for 2-D SUMMA (R=1): read A[i][k] for all i and k=0 first, then k=1, etc. Clearly, Read_SpMatrix is going to
+/// generate reads in the wrong (transposed) order (send all k for i=0, then for i=1, etc.).
+/// - for 2.5/3-D SUMMA (R>1): same order of sends as for 2-D SUMMA will be suboptimal since all k=0 will only
+/// generate work on the r=0 process plane. Instead we *may* want to one tile needed on each plane, then one more for
+/// each plane. Hence the need for ReadSchedule.
+template <ReadSchedule KSchedule = DefaultReadSchedule, typename Blk = blk_t, typename Keymap2 = std::function<int(const Key<2> &)>>
+class ReadA : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReadA<KSchedule, Blk, Keymap2>, ttg::typelist<void>> {
+ public:
+  using baseT = typename ReadA::ttT;
+  ReadA(const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out, const Keymap2 &ij_keymap, long R)
+      : baseT(edges(ctl), edges(out), std::string("SpMM25D::read_a"), {"ctl"}, {"a_ik"}, ij_keymap)
+      , matrix_(matrix)
+      , R_(R) {}
+
+  // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
+  // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
+  // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local
+  void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
+    auto rank = ttg::default_execution_context().rank();
+    const int I = matrix_.rows();
+    const int K = matrix_.cols();
+
+    // this assumes col-major layout of SpMatrix
+    static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");
+
+    // MultipleK schedule is not yet correctly implemented
+    static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented");
+
+    // loop over blocks of k at a time, block size controlled by KSchedule
+    const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_;
+    for (std::pair<int, int> k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K;
+         k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) {
+
+      // N.B. : due to the CSC layout of A iterating over (blocks of) columns is efficient
+      for (int k = k_blk.first; k < k_blk.second; ++k) {
+        for (typename SpMatrix<Blk>::InnerIterator it(matrix_, k); it; ++it) {
+          assert(k == it.col());
+          const auto i = it.row();
+          // IF the receiver uses the same keymap, these sends are local
+          if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({i, k})))) {
+            ::send<0>(Key<2>(std::initializer_list<long>({i, k})), it.value(), out);
+          }
+        }
+      }
+
+    }
+
+  }
+
+ private:
+  const SpMatrix<Blk> &matrix_;
+  long R_;
+};
+
+// flow data from an existing SpMatrix on rank 0
+template <ReadSchedule KSchedule = DefaultReadSchedule, typename Blk = blk_t, typename Keymap2 = std::function<int(const Key<2> &)>>
+class ReadB : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReadB<KSchedule, Blk, Keymap2>, ttg::typelist<void>> {
+ public:
+  using baseT = typename ReadB::ttT;
+  ReadB(const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out, const Keymap2 &ij_keymap, long R)
+      : baseT(edges(ctl), edges(out), std::string("read_b"), {"ctl"}, {"b_kj"}, ij_keymap), matrix_(matrix), R_(R) {}
+
+  // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
+  // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
+  // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local
+  void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
     auto rank = ttg::default_execution_context().rank();
-    for (int k = 0; k < matrix_.outerSize(); ++k) {
-      for (typename SpMatrix<Blk>::InnerIterator it(matrix_, k); it; ++it) {
-        if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({it.row(), it.col()}))))
-          ::send<0>(Key<2>(std::initializer_list<long>({it.row(), it.col()})), it.value(), out);
+    const int J = matrix_.cols();
+    const int K = matrix_.rows();
+
+    // this assumes col-major layout of SpMatrix
+    static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");
+
+    // MultipleK schedule is not yet correctly implemented
+    static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented");
+
+    // loop over blocks of k at a time, block size controlled by KSchedule
+    const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_;
+    for (std::pair<int, int> k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K;
+         k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) {
+
+      // WARNING : due to the CSC layout of B iterating over (blocks of) columns is inefficient
+      for (int j = 0; j < matrix_.outerSize(); ++j) {
+        for (typename SpMatrix<Blk>::InnerIterator it(matrix_, j); it; ++it) {
+          assert(j == it.col());
+          const auto k = it.row();
+          // if k past the k block, we are done with this i
+          if (k >= k_blk.second) break;
+          // continue iterating until k has not reached this k block
+          if (k < k_blk.first) continue;
+          // IF the receiver uses the same keymap, these sends are local
+          if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({k, j})))) {
+            ::send<0>(Key<2>(std::initializer_list<long>({k, j})), it.value(), out);
+          }
+        }
+
       }
     }
   }
 
  private:
   const SpMatrix<Blk> &matrix_;
+  long R_;
 };
 
 // flow (move?) data into an existing SpMatrix on rank 0
@@ -1153,7 +1283,7 @@ static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, siz
   size_t avg_nb = 0;
   int avg_nb_nb = 0;
 
-  struct tuple_hash : public std::unary_function<std::tuple<int, int>, std::size_t> {
+  struct tuple_hash {
     std::size_t operator()(const std::tuple<int, int> &k) const {
       return static_cast<size_t>(std::get<0>(k)) | (static_cast<size_t>(std::get<1>(k)) << 32);
     }
@@ -1256,6 +1386,9 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
 
   Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
   Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
+  // uncomment this to use more intelligent schedule of reads
+//  ReadA<> a(A, ctl, eA, ij_keymap, R);
+//  ReadB<> b(B, ctl, eB, ij_keymap, R);
   Write_SpMatrix<> c(C, eC, ij_keymap);
   auto &c_status = c.status();
   assert(!has_value(c_status));