Skip to content

Commit

Permalink
implemented Read{A,B} that read A and B more intelligently
Browse files Browse the repository at this point in the history
  • Loading branch information
evaleev committed Feb 2, 2024
1 parent b70444c commit 753ccb9
Showing 1 changed file with 133 additions and 7 deletions.
140 changes: 133 additions & 7 deletions examples/spmm/spmm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,13 @@ inline int ijk2rank(int i, int j, int k, int P, int Q, int R) {
return rank;
}

/// Pushes out data from an existing SpMatrix whose data is distributed on a 2-d grid of processes.

/// Data is pushed in the order it appears in the container, without any tailoring to
/// the order in which the data will be consumed; thus this is likely to generate tasks in a suboptimal order.
/// \note Reading should in general occur in the same order as the data will be consumed.
/// If all consuming tasks can execute concurrently this should be OK; however, since the runtime will likely
/// throttle sends, task dependencies further "down" the DAG may make some reading orders better than others.
template <typename Blk = blk_t, typename Keymap = std::function<int(const Key<2> &)>>
class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMatrix<Blk, Keymap>, ttg::typelist<void>> {
public:
Expand All @@ -174,18 +180,135 @@ class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMat
ij_keymap)
, matrix_(matrix) {}

void op(const Key<2> &, std::tuple<Out<Key<2>, Blk>> &out) {
// key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
// but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
// the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local
void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
auto rank = ttg::default_execution_context().rank();
// this code assumes col-major layout
static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");
for (int j = 0; j < matrix_.outerSize(); ++j) {
for (typename SpMatrix<Blk>::InnerIterator it(matrix_, j); it; ++it) {
assert(j == it.col());
const auto i = it.row();
// IF the receiver uses the same keymap, these sends are local
if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({i, j})))) {
::send<0>(Key<2>(std::initializer_list<long>({i, j})), it.value(), out);
}
}
}
}

private:
const SpMatrix<Blk> &matrix_;
};

/// Controls how ReadA/ReadB order their sends across the k (contraction) index.
enum class ReadSchedule {
  SingleK,  // send all tiles for one k, then the next k, etc.; appropriate for 2-D SUMMA (R=1; see ReadA)
  MultipleK  // send tiles for a block of R consecutive k's at a time; appropriate for 2.5/3-D SUMMA (R>1; see ReadA)
};
// change this to control the schedule of sends (2-D vs 3-D); used as the default template argument of ReadA/ReadB
constexpr auto DefaultReadSchedule = ReadSchedule::MultipleK;

/// flow data from A distributed on a 2-d grid of processes in the order the tiles are likely to be consumed

/// The order of sends needs to be tailored as follows:
/// - for 2-D SUMMA (R=1): read A[i][k] for all i with k=0 first, then k=1, etc. Clearly, Read_SpMatrix is going to
/// generate reads in the wrong (transposed) order (send all k for i=0, then for i=1, etc.).
/// - for 2.5/3-D SUMMA (R>1): the same order of sends as for 2-D SUMMA will be suboptimal since all k=0 tiles will
/// only generate work on the r=0 process plane. Instead we *may* want to send one tile needed on each plane, then one
/// more for each plane, and so on. Hence the need for ReadSchedule.
template <ReadSchedule KSchedule = DefaultReadSchedule, typename Blk = blk_t, typename Keymap2 = std::function<int(const Key<2> &)>>
class ReadA : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReadA<KSchedule, Blk, Keymap2>, ttg::typelist<void>> {
 public:
  using baseT = typename ReadA::ttT;

  /// \param matrix the local piece of A, stored as a col-major (CSC) sparse matrix of blocks
  /// \param ctl control edge whose messages trigger the reads
  /// \param out edge over which tiles A[i][k] are pushed
  /// \param ij_keymap maps {i,k} to the owning process; consumers should use the same keymap so the sends are local
  /// \param R the number of process planes of the 3-d process grid (block size of k for ReadSchedule::MultipleK)
  ReadA(const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out, const Keymap2 &ij_keymap, long R)
      : baseT(edges(ctl), edges(out), std::string("SpMM25D::read_a"), {"ctl"}, {"a_ik"}, ij_keymap)
      , matrix_(matrix)
      , R_(R) {}

  // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
  // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
  // the consumers better use the same keymap (ij_keymap) as this TT for the data flow from this to be local
  void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
    const auto rank = ttg::default_execution_context().rank();
    const int K = matrix_.cols();

    // this assumes col-major layout of SpMatrix
    static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");

    // loop over a block of k's at a time: block size 1 for SingleK (plain 2-D schedule), R_ for MultipleK
    // (one k per process plane, so every plane receives work early)
    const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_;
    // k_blk = [first, second) half-open range of k values in the current block
    for (std::pair<int, int> k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K;
         k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) {
      // N.B. : due to the CSC layout of A iterating over (blocks of) columns is efficient
      for (int k = k_blk.first; k < k_blk.second; ++k) {
        for (typename SpMatrix<Blk>::InnerIterator it(matrix_, k); it; ++it) {
          assert(k == it.col());
          const auto i = it.row();
          // IF the receiver uses the same keymap, these sends are local
          if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({i, k})))) {
            ::send<0>(Key<2>(std::initializer_list<long>({i, k})), it.value(), out);
          }
        }
      }
    }
  }

 private:
  const SpMatrix<Blk> &matrix_;
  long R_;
};

/// flow data from B distributed on a 2-d grid of processes in the order the tiles are likely to be consumed

/// Analogous to ReadA, but note that B is also stored col-major (CSC), so restricting the sends to a block of
/// k values (= rows of B) requires scanning every column for each block; hence the blocked (MultipleK) schedule
/// is more expensive to produce for B than for A.
template <ReadSchedule KSchedule = DefaultReadSchedule, typename Blk = blk_t, typename Keymap2 = std::function<int(const Key<2> &)>>
class ReadB : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReadB<KSchedule, Blk, Keymap2>, ttg::typelist<void>> {
 public:
  using baseT = typename ReadB::ttT;

  /// \param matrix the local piece of B, stored as a col-major (CSC) sparse matrix of blocks
  /// \param ctl control edge whose messages trigger the reads
  /// \param out edge over which tiles B[k][j] are pushed
  /// \param ij_keymap maps {k,j} to the owning process; consumers should use the same keymap so the sends are local
  /// \param R the number of process planes of the 3-d process grid (block size of k for ReadSchedule::MultipleK)
  ReadB(const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out, const Keymap2 &ij_keymap, long R)
      : baseT(edges(ctl), edges(out), std::string("read_b"), {"ctl"}, {"b_kj"}, ij_keymap), matrix_(matrix), R_(R) {}

  // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ...
  // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ...
  // the consumers better use the same keymap (ij_keymap) as this TT for the data flow from this to be local
  void op(const Key<2> & /* pq */, std::tuple<Out<Key<2>, Blk>> &out) {
    const auto rank = ttg::default_execution_context().rank();
    const int K = matrix_.rows();

    // this assumes col-major layout of SpMatrix
    static_assert(SpMatrix<Blk>::IsRowMajor == false, "SpMatrix must be col-major");

    // loop over a block of k's at a time, block size controlled by KSchedule (1 for SingleK, R_ for MultipleK)
    // k_blk = [first, second) half-open range of k values in the current block
    const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_;
    for (std::pair<int, int> k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K;
         k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) {
      // WARNING : due to the CSC layout of B selecting a block of k's (rows) requires traversing every column
      for (int j = 0; j < matrix_.outerSize(); ++j) {
        for (typename SpMatrix<Blk>::InnerIterator it(matrix_, j); it; ++it) {
          assert(j == it.col());
          const auto k = it.row();
          // rows within a column are sorted, so once k is past this k block we are done with column j
          if (k >= k_blk.second) break;
          // skip rows that precede this k block
          if (k < k_blk.first) continue;
          // IF the receiver uses the same keymap, these sends are local
          if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({k, j})))) {
            ::send<0>(Key<2>(std::initializer_list<long>({k, j})), it.value(), out);
          }
        }
      }
    }
  }

 private:
  const SpMatrix<Blk> &matrix_;
  long R_;
};

// flow (move?) data into an existing SpMatrix on rank 0
Expand Down Expand Up @@ -1153,7 +1276,7 @@ static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, siz
size_t avg_nb = 0;
int avg_nb_nb = 0;

struct tuple_hash : public std::unary_function<std::tuple<int, int>, std::size_t> {
struct tuple_hash {
std::size_t operator()(const std::tuple<int, int> &k) const {
return static_cast<size_t>(std::get<0>(k)) | (static_cast<size_t>(std::get<1>(k)) << 32);
}
Expand Down Expand Up @@ -1256,6 +1379,9 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<

Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
// uncomment this to use more intelligent schedule of reads
// ReadA<> a(A, ctl, eA, ij_keymap, R);
// ReadB<> b(B, ctl, eB, ij_keymap, R);
Write_SpMatrix<> c(C, eC, ij_keymap);
auto &c_status = c.status();
assert(!has_value(c_status));
Expand Down

0 comments on commit 753ccb9

Please sign in to comment.