diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 94edf672b..99509bd1c 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -163,7 +163,13 @@ inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { return rank; } -// flow data from an existing SpMatrix on rank 0 +/// Pushes out data from an existing SpMatrix whose data is distributed on a 2-d grid. + +/// Data is pushed in the order of the appearance of the data in the container, without any tailoring to +/// the order in which the data is consumed; thus this is likely to generate tasks in a suboptimal order. +/// \note Reading should in general occur in the same order as the data will be consumed. +/// If all consuming tasks can execute concurrently this should be OK, albeit the runtime will likely throttle +/// sends, thus task dependencies further "down" the DAG may result in some reading orders being better than others template &)>> class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMatrix, ttg::typelist> { public: @@ -174,18 +180,135 @@ class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMat ij_keymap) , matrix_(matrix) {} - void op(const Key<2> &, std::tuple, Blk>> &out) { + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { + auto rank = ttg::default_execution_context().rank(); + // this code assumes col-major layout + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + for (int j = 0; j < matrix_.outerSize(); ++j) { + for (typename SpMatrix::InnerIterator it(matrix_, j); it; ++it) { + assert(j == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, j})))) { + ::send<0>(Key<2>(std::initializer_list({i, j})), it.value(), out); + } + } + } + } + + private: + const SpMatrix &matrix_; +}; + +enum class ReadSchedule { + SingleK, // appropriate for 2D (see ReadA) + MultipleK // appropriate for 3D (see ReadA) +}; +// change this to control the schedule of sends (2-D vs 3-D) +constexpr auto DefaultReadSchedule = ReadSchedule::MultipleK; + +/// flow data from A distributed on a 2-d grid of processes in the order they are likely to be consumed + +/// The order of sends needs to tailored as follows: +/// - for 2-D SUMMA (R=1): read A[i][k] for all i and k=0 first, then k=1, etc. Clearly, Read_SpMatrix is going to +/// generate reads in the wrong (transposed) order (send all k for i=0, then for i=1, etc.). +/// - for 2.5/3-D SUMMA (R>1): same order of sends as for 2-D SUMMA will be suboptimal since all k=0 will only +/// generate work on the r=0 process plane. Instead we *may* want to one tile needed on each plane, then one more for +/// each plane. Hence the need for ReadSchedule. +template &)>> +class ReadA : public TT, std::tuple, Blk>>, ReadA, ttg::typelist> { + public: + using baseT = typename ReadA::ttT; + ReadA(const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, const Keymap2 &ij_keymap, long R) + : baseT(edges(ctl), edges(out), std::string("SpMM25D::read_a"), {"ctl"}, {"a_ik"}, ij_keymap) + , matrix_(matrix) + , R_(R) {} + + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { + auto rank = ttg::default_execution_context().rank(); + const int I = matrix_.rows(); + const int K = matrix_.cols(); + + // this assumes col-major layout of SpMatrix + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + + // loop over blocks of k at a time, block size controlled by KSchedule + const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_; + for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; + k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + + // N.B. : due to the CSC layout of A iterating over (blocks of) columns is efficient + for (int k = k_blk.first; k < k_blk.second; ++k) { + for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { + assert(k == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, k})))) { + ::send<0>(Key<2>(std::initializer_list({i, k})), it.value(), out); + } + } + } + + } + + } + + private: + const SpMatrix &matrix_; + long R_; +}; + +// flow data from an existing SpMatrix on rank 0 +template &)>> +class ReadB : public TT, std::tuple, Blk>>, ReadB, ttg::typelist> { + public: + using baseT = typename ReadB::ttT; + ReadB(const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, const Keymap2 &ij_keymap, long R) + : baseT(edges(ctl), edges(out), std::string("read_b"), {"ctl"}, {"b_kj"}, ij_keymap), matrix_(matrix), R_(R) {} + + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { auto rank = ttg::default_execution_context().rank(); - for (int k = 0; k < matrix_.outerSize(); ++k) { - for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { - if (rank == this->get_keymap()(Key<2>(std::initializer_list({it.row(), it.col()})))) - ::send<0>(Key<2>(std::initializer_list({it.row(), it.col()})), it.value(), out); + const int J = matrix_.cols(); + const int K = matrix_.rows(); + + // loop over blocks of k at a time, block size controlled by KSchedule + const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_; + for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; + k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + // this assumes col-major layout of SpMatrix + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + + // WARNING : due to the CSC layout of B iterating over (blocks of) columns is inefficient + for (int j = 0; j < matrix_.outerSize(); ++j) { + for (typename SpMatrix::InnerIterator it(matrix_, j); it; ++it) { + assert(j == it.col()); + const auto k = it.row(); + // if k past the k block, we are done with this i + if (k >= k_blk.second) break; + // continue iterating until k has not reached this k block + if (k < k_blk.first) continue; + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({k, j})))) { + ::send<0>(Key<2>(std::initializer_list({k, j})), it.value(), out); + } + } + } } } private: const SpMatrix &matrix_; + long R_; }; // flow (move?) data into an existing SpMatrix on rank 0 @@ -1153,7 +1276,7 @@ static void initBlSpRandom(const std::function &)> &keymap, siz size_t avg_nb = 0; int avg_nb_nb = 0; - struct tuple_hash : public std::unary_function, std::size_t> { + struct tuple_hash { std::size_t operator()(const std::tuple &k) const { return static_cast(std::get<0>(k)) | (static_cast(std::get<1>(k)) << 32); } @@ -1256,6 +1379,9 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< Read_SpMatrix a("A", A, ctl, eA, ij_keymap); Read_SpMatrix b("B", B, ctl, eB, ij_keymap); + // uncomment this to use more intelligent schedule of reads +// ReadA<> a(A, ctl, eA, ij_keymap, R); +// ReadB<> b(B, ctl, eB, ij_keymap, R); Write_SpMatrix<> c(C, eC, ij_keymap); auto &c_status = c.status(); assert(!has_value(c_status));