From 143b827595493eb8c94b806113914ab28e2fc076 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 13 Sep 2021 16:05:25 -0400 Subject: [PATCH 1/3] SPMM: Fix benchmark if checking is enabled If checking is enabled, we want to gather the results at rank 0 so that we can do the validation there Signed-off-by: Joseph Schuchart --- examples/spmm/spmm.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 84e4dbb56..af3a94f54 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -1209,6 +1209,7 @@ int main(int argc, char **argv) { } } else { // flow graph needs to exist on every node + auto keymap_write = [](const Key<2> &key) { return 0; }; Edge> ctl("control"); Control control(ctl); Edge, blk_t> eA, eB, eC; @@ -1220,7 +1221,7 @@ int main(int argc, char **argv) { }; Read_SpMatrix<> a("A", A, ctl, eA, keymap); Read_SpMatrix<> b("B", B, ctl, eB, keymap); - Write_SpMatrix<> c(C, eC, keymap); + Write_SpMatrix<> c(C, eC, keymap_write); auto &c_status = c.status(); assert(!has_value(c_status)); // SpMM a_times_b(world, eA, eB, eC, A, B); From 977d1b8e0d03e52f917ff7acdcdbcc0fe84de557 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 13 Sep 2021 16:06:02 -0400 Subject: [PATCH 2/3] SPMM: Only print the abstract graph on process 0 Signed-off-by: Joseph Schuchart --- examples/spmm/spmm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index af3a94f54..4e669988d 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -1228,7 +1228,7 @@ int main(int argc, char **argv) { SpMM<> a_times_b(eA, eB, eC, A, B, Afilling, Bfilling, keymap); TTGUNUSED(a_times_b); - std::cout << Dot{}(&a, &b) << std::endl; + if (get_default_world().rank() == 0) std::cout << Dot{}(&a, &b) << std::endl; // ready to run! 
auto connected = make_graph_executable(&control); From e95d219a00034081623089fa0c1efcbab31438bf Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 13 Sep 2021 16:16:18 -0400 Subject: [PATCH 3/3] SPMM: implement hierarchical broadcast of A and B The hierarchical broadcast is used to avoid sending all the keys and instead leave it to the recipient to distribute the tile to all the relevant keys. This is done because the keys in the PaRSEC backend are sent inline so that large numbers of keys may grow messages past the eager limit. This is a bandage, not a fix. Instead, the PaRSEC backend should learn to handle large key collections. Eventually, we need to have edges containing key generators... Signed-off-by: Joseph Schuchart --- examples/spmm/spmm.cc | 132 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 24 deletions(-) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 4e669988d..015019e71 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -256,15 +256,16 @@ class Write_SpMatrix : public Op, std::tuple<>, Write_SpMatrix, Blk> }; // sparse mm -template +template &)>, typename Blk = blk_t> class SpMM { public: - template SpMM(Edge, Blk> &a, Edge, Blk> &b, Edge, Blk> &c, const SpMatrix &a_mat, const SpMatrix &b_mat, std::map, bool> &Afilling, - std::map, bool> &Bfilling, Keymap &&keymap) + std::map, bool> &Bfilling, const Keymap& keymap) : a_ijk_() + , local_a_ijk_() , b_ijk_() + , local_b_ijk_() , c_ijk_() , a_rowidx_to_colidx_(make_rowidx_to_colidx(Afilling)) , b_colidx_to_rowidx_(make_colidx_to_rowidx(Bfilling)) @@ -277,8 +278,10 @@ class SpMM { ttg_broadcast(ttg_default_execution_context(), a_colidx_to_rowidx_, root); ttg_broadcast(ttg_default_execution_context(), b_colidx_to_rowidx_, root); - bcast_a_ = std::make_unique(a, a_ijk_, b_rowidx_to_colidx_, keymap); - bcast_b_ = std::make_unique(b, b_ijk_, a_colidx_to_rowidx_, keymap); + bcast_a_ = std::make_unique(a, local_a_ijk_, b_rowidx_to_colidx_, 
keymap); + local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, keymap); + bcast_b_ = std::make_unique(b, local_b_ijk_, a_colidx_to_rowidx_, keymap); + local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, keymap); multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c, a_rowidx_to_colidx_, b_colidx_to_rowidx_, keymap); @@ -287,56 +290,133 @@ class SpMM { TTGUNUSED(multiplyadd_); } - /// broadcast A[i][k] to all {i,j,k} such that B[j][k] exists + + /// Locally broadcast A[i][k] to all {i,j,k} such that B[j][k] exists + class LocalBcastA : public Op, std::tuple, Blk>>, LocalBcastA, Blk> { + public: + using baseT = Op, std::tuple, Blk>>, LocalBcastA, Blk>; + + LocalBcastA(Edge, Blk> &a, Edge, Blk> &a_ijk, const std::vector> &b_rowidx_to_colidx, Keymap keymap) + : baseT(edges(a), edges(a_ijk), "SpMM::local_bcast_a", {"a_ik"}, {"a_ijk"}, [](const Key<3>& key){ return key[2]; }) + , b_rowidx_to_colidx_(b_rowidx_to_colidx), keymap_(keymap) {} + + void op(const Key<3> &key, typename baseT::input_values_tuple_type &&a_ik, std::tuple, Blk>> &a_ijk) { + const auto i = key[0]; + const auto k = key[1]; + auto world = get_default_world(); + assert(key[2] == world.rank()); + if (tracing()) ttg::print("LocalBcastA(", i, ", ", k, ")"); + if (k >= b_rowidx_to_colidx_.size()) return; + // broadcast a_ik to all existing {i,j,k} + std::vector> ijk_keys; + for (auto &j : b_rowidx_to_colidx_[k]) { + if (tracing()) ttg::print("Broadcasting A[", i, "][", k, "] to j=", j); + if (keymap_(Key<2>({i, j})) == world.rank()) { + ijk_keys.emplace_back(Key<3>({i, j, k})); + } + } + ::broadcast<0>(ijk_keys, baseT::template get<0>(a_ik), a_ijk); + } + + private: + const std::vector> &b_rowidx_to_colidx_; + Keymap keymap_; + }; // class LocalBcastA + + + /// broadcast A[i][k] once to each proc that keymap({i,j}) selects for an existing B[j][k] class BcastA : public Op, std::tuple, Blk>>, BcastA, Blk> { public: using baseT = Op, std::tuple, Blk>>, BcastA, Blk>; - template - BcastA(Edge, Blk> 
&a, Edge, Blk> &a_ijk, const std::vector> &b_rowidx_to_colidx, Keymap&& keymap) - : baseT(edges(a), edges(a_ijk), "SpMM::bcast_a", {"a_ik"}, {"a_ijk"}, keymap) + BcastA(Edge, Blk> &a, Edge, Blk> &a_ikp, const std::vector> &b_rowidx_to_colidx, Keymap keymap) + : baseT(edges(a), edges(a_ikp), "SpMM::bcast_a", {"a_ik"}, {"a_ikp"}, keymap) , b_rowidx_to_colidx_(b_rowidx_to_colidx) {} - void op(const Key<2> &key, typename baseT::input_values_tuple_type &&a_ik, std::tuple, Blk>> &a_ijk) { + void op(const Key<2> &key, typename baseT::input_values_tuple_type &&a_ik, std::tuple, Blk>> &a_ikp) { const auto i = key[0]; const auto k = key[1]; if (tracing()) ttg::print("BcastA(", i, ", ", k, ")"); // broadcast a_ik to all existing {i,j,k} - std::vector> ijk_keys; + std::vector> ikp_keys; if (k >= b_rowidx_to_colidx_.size()) return; + auto world = get_default_world(); + std::vector procmap(world.size()); + auto keymap = baseT::get_keymap(); for (auto &j : b_rowidx_to_colidx_[k]) { - if (tracing()) ttg::print("Broadcasting A[", i, "][", k, "] to j=", j); - ijk_keys.emplace_back(Key<3>({i, j, k})); + long proc = keymap(Key<2>({i, j})); + if (!procmap[proc]) { + if (tracing()) ttg::print("Broadcasting A[", i, "][", k, "] to proc ", proc); + ikp_keys.emplace_back(Key<3>({i, k, proc})); + procmap[proc] = true; + } } - ::broadcast<0>(ijk_keys, baseT::template get<0>(a_ik), a_ijk); + ::broadcast<0>(ikp_keys, baseT::template get<0>(a_ik), a_ikp); } private: const std::vector> &b_rowidx_to_colidx_; }; // class BcastA + /// Locally broadcast B[k][j] to all {i,j,k} such that A[i][k] exists + class LocalBcastB : public Op, std::tuple, Blk>>, LocalBcastB, Blk> { + public: + using baseT = Op, std::tuple, Blk>>, LocalBcastB, Blk>; + + LocalBcastB(Edge, Blk> &b, Edge, Blk> &b_ijk, const std::vector> &a_colidx_to_rowidx, Keymap keymap) + : baseT(edges(b), edges(b_ijk), "SpMM::local_bcast_b", {"b_kj"}, {"b_ijk"}, [](const Key<3> &key){ return key[2]; }) + , a_colidx_to_rowidx_(a_colidx_to_rowidx), 
keymap_(keymap) {} + + void op(const Key<3> &key, typename baseT::input_values_tuple_type &&b_kj, std::tuple, Blk>> &b_ijk) { + const auto k = key[0]; + const auto j = key[1]; + auto world = get_default_world(); + assert(key[2] == world.rank()); + if (tracing()) ttg::print("LocalBcastB(", k, ", ", j, ")"); + if (k >= a_colidx_to_rowidx_.size()) return; + // broadcast b_kj to *jk + std::vector> ijk_keys; + for (auto &i : a_colidx_to_rowidx_[k]) { + if (tracing()) ttg::print("Broadcasting B[", k, "][", j, "] to i=", i); + if (keymap_(Key<2>({i, j})) == world.rank()) { + ijk_keys.emplace_back(Key<3>({i, j, k})); + } + } + ::broadcast<0>(ijk_keys, baseT::template get<0>(b_kj), b_ijk); + } + + private: + const std::vector> &a_colidx_to_rowidx_; + Keymap keymap_; + }; // class LocalBcastB + /// broadcast B[k][j] to all {i,j,k} such that A[i][k] exists class BcastB : public Op, std::tuple, Blk>>, BcastB, Blk> { public: using baseT = Op, std::tuple, Blk>>, BcastB, Blk>; - template - BcastB(Edge, Blk> &b, Edge, Blk> &b_ijk, const std::vector> &a_colidx_to_rowidx, Keymap&& keymap) - : baseT(edges(b), edges(b_ijk), "SpMM::bcast_b", {"b_kj"}, {"b_ijk"}, keymap) + BcastB(Edge, Blk> &b, Edge, Blk> &b_kjp, const std::vector> &a_colidx_to_rowidx, Keymap keymap) + : baseT(edges(b), edges(b_kjp), "SpMM::bcast_b", {"b_kj"}, {"b_kjp"}, keymap) , a_colidx_to_rowidx_(a_colidx_to_rowidx) {} - void op(const Key<2> &key, typename baseT::input_values_tuple_type &&b_kj, std::tuple, Blk>> &b_ijk) { + void op(const Key<2> &key, typename baseT::input_values_tuple_type &&b_kj, std::tuple, Blk>> &b_kjp) { const auto k = key[0]; const auto j = key[1]; // broadcast b_kj to *jk - std::vector> ijk_keys; + std::vector> kjp_keys; if (tracing()) ttg::print("BcastB(", k, ", ", j, ")"); if (k >= a_colidx_to_rowidx_.size()) return; + auto world = get_default_world(); + std::vector procmap(world.size()); for (auto &i : a_colidx_to_rowidx_[k]) { - if (tracing()) ttg::print("Broadcasting B[", k, "][", j, "] to i=", 
i); - ijk_keys.emplace_back(Key<3>({i, j, k})); + long proc = baseT::get_keymap()(Key<2>({i, j})); + if (!procmap[proc]) { + if (tracing()) ttg::print("Broadcasting B[", k, "][", j, "] to proc ", proc); + kjp_keys.emplace_back(Key<3>({k, j, proc})); + procmap[proc] = true; + } } - ::broadcast<0>(ijk_keys, baseT::template get<0>(b_kj), b_ijk); + ::broadcast<0>(kjp_keys, baseT::template get<0>(b_kj), b_kjp); } private: @@ -349,10 +429,10 @@ class SpMM { public: using baseT = Op, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, const Blk, const Blk, Blk>; - template MultiplyAdd(Edge, Blk> &a_ijk, Edge, Blk> &b_ijk, Edge, Blk> &c_ijk, Edge, Blk> &c, const std::vector> &a_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, Keymap &&keymap) + const std::vector> &b_colidx_to_rowidx, + Keymap keymap) : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"}, {"c_ij", "c_ijk"}, [keymap](const Key<3> &key) { @@ -500,14 +580,18 @@ class SpMM { private: Edge, Blk> a_ijk_; + Edge, Blk> local_a_ijk_; Edge, Blk> b_ijk_; + Edge, Blk> local_b_ijk_; Edge, Blk> c_ijk_; std::vector> a_rowidx_to_colidx_; std::vector> b_colidx_to_rowidx_; std::vector> a_colidx_to_rowidx_; std::vector> b_rowidx_to_colidx_; std::unique_ptr bcast_a_; + std::unique_ptr local_bcast_a_; std::unique_ptr bcast_b_; + std::unique_ptr local_bcast_b_; std::unique_ptr multiplyadd_; // result[i][j] gives the j-th nonzero row for column i in matrix mat