Skip to content

Commit

Permalink
more performance counters
Browse files Browse the repository at this point in the history
  • Loading branch information
JiakunYan committed Nov 27, 2024
1 parent 00049a0 commit cdf2297
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 5 deletions.
2 changes: 2 additions & 0 deletions lci/backend/ibv/server_ibv.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
if (!LCIU_try_acquire_spinlock(&endpoint_p->cq_lock)) return 0;
#endif
LCII_PCOUNTER_START(net_poll_cq_succeed_timer);
int ne = ibv_poll_cq(endpoint_p->cq, LCI_CQ_MAX_POLL, wc);
LCII_PCOUNTER_END(net_poll_cq_succeed_timer);
LCI_DBG_Assert(ne >= 0, "ibv_poll_cq returns error %d\n", ne);
#ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
LCIU_release_spinlock(&endpoint_p->cq_lock);
Expand Down
44 changes: 44 additions & 0 deletions lci/backend/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,9 @@ static inline int LCIS_poll_cq(LCIS_endpoint_t endpoint_pp,
{
LCII_PCOUNTER_ADD(net_poll_cq_attempts, 1);
LCISI_CS_ENTER(endpoint_pp, 0);
LCII_PCOUNTER_NOW(start);
int ret = LCISD_poll_cq(endpoint_pp, entry);
LCII_PCOUNTER_SINCE(net_poll_cq_timer, start);
LCISI_CS_EXIT(endpoint_pp);
return ret;
}
Expand All @@ -182,8 +184,14 @@ static inline LCI_error_t LCIS_post_sends(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_sends(endpoint_pp, rank, buf, size, meta);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_sends_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -206,9 +214,15 @@ static inline LCI_error_t LCIS_post_send(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_send(endpoint_pp, rank, buf, size, mr, meta, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -232,9 +246,15 @@ static inline LCI_error_t LCIS_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_puts(endpoint_pp, rank, buf, size, base, offset, rkey);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_sends_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -259,9 +279,15 @@ static inline LCI_error_t LCIS_post_put(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_put(endpoint_pp, rank, buf, size, mr, base, offset, rkey, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -287,9 +313,15 @@ static inline LCI_error_t LCIS_post_putImms(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_putImms(endpoint_pp, rank, buf, size, base,
offset, rkey, meta);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -316,9 +348,15 @@ static inline LCI_error_t LCIS_post_putImm(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_putImm(endpoint_pp, rank, buf, size, mr, base,
offset, rkey, meta, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -337,8 +375,14 @@ static inline LCI_error_t LCIS_post_recv(LCIS_endpoint_t endpoint_pp, void* buf,
LCI_DBG_Log(LCI_LOG_TRACE, "server",
"LCIS_post_recv: buf %p size %u mr %p user_context %p\n", buf,
size, mr.mr_p, ctx);

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_recv(endpoint_pp, buf, size, mr, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_recv_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_recv_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_recv_posted, 1);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_recv_failed_lock, 1);
Expand Down
11 changes: 11 additions & 0 deletions lci/profile/performance_counter.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx;
_macro(unexpected_msg)

#define LCII_PCOUNTER_TIMER_FOR_EACH(_macro) \
_macro(net_post_send_timer) \
_macro(net_post_send_succeed_timer) \
_macro(net_post_recv_timer) \
_macro(net_post_recv_succeed_timer) \
_macro(net_poll_cq_timer) \
_macro(net_poll_cq_succeed_timer) \
_macro(sync_stay_timer) \
_macro(cq_stay_timer) \
_macro(useful_progress_timer) \
Expand All @@ -55,7 +61,9 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx;
_macro(cq_push_timer) \
_macro(cq_pop_timer) \
_macro(serve_rts_timer) \
_macro(handle_rts_timer) \
_macro(rts_mem_timer) \
_macro(rts_archive_timer) \
_macro(rts_send_timer) \
_macro(serve_rtr_timer) \
_macro(rtr_mem_reg_timer) \
Expand Down Expand Up @@ -87,13 +95,16 @@ LCII_PCOUNTER_TIMER_FOR_EACH(LCII_PCOUNTER_HANDLE_DECL)
LCT_pcounter_startt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
#define LCII_PCOUNTER_ENDT(name, time) \
LCT_pcounter_endt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
// Add the time elapsed since `time` (i.e. LCT_now() - time) to the
// performance counter `name`. `time` is parenthesized so that an expression
// argument (e.g. `a + b`) is subtracted as a whole.
#define LCII_PCOUNTER_SINCE(name, time)                            \
LCT_pcounter_add(LCII_pcounter_ctx, LCII_pcounter_handle_##name, \
LCT_now() - (time));
#else
// No-op variants: in this branch every performance-counter macro expands to
// nothing, so instrumented call sites compile away entirely.
// NOTE(review): presumably selected when performance counters are disabled at
// build time -- the controlling #if condition is not visible here; confirm.
#define LCII_PCOUNTER_ADD(name, val)
#define LCII_PCOUNTER_START(name)
#define LCII_PCOUNTER_END(name)
#define LCII_PCOUNTER_NOW(time)
#define LCII_PCOUNTER_STARTT(name, time)
#define LCII_PCOUNTER_ENDT(name, time)
#define LCII_PCOUNTER_SINCE(name, time)
#endif

void LCII_pcounters_init();
Expand Down
4 changes: 4 additions & 0 deletions lci/runtime/rendezvous.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
int src_rank, uint16_t tag,
LCII_context_t* rdv_ctx, bool is_in_progress)
{
LCII_PCOUNTER_START(handle_rts_timer);
// Extract information from the received RTS packet
LCII_rdv_type_t rdv_type = packet->data.rts.rdv_type;
LCI_DBG_Log(LCI_LOG_TRACE, "rdv", "handle rts: rdv_type %d\n", rdv_type);
Expand Down Expand Up @@ -270,8 +271,10 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
// We cannot use writeimm for more than 1 rdma messages.
// IOVEC does not support writeimm for now
uint64_t ctx_key;
LCII_PCOUNTER_START(rts_archive_timer);
int result =
LCM_archive_put(ep->ctx_archive_p, (uintptr_t)rdv_ctx, &ctx_key);
LCII_PCOUNTER_END(rts_archive_timer);
// TODO: be able to pass back pressure to user
LCI_Assert(result == LCM_SUCCESS, "Archive is full!\n");
packet->data.rtr.recv_ctx_key = ctx_key;
Expand Down Expand Up @@ -312,6 +315,7 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
ep->device->heap_segment->mr,
LCII_MAKE_PROTO(ep->gid, LCI_MSG_RTR, 0), rtr_ctx);
LCII_PCOUNTER_END(rts_send_timer);
LCII_PCOUNTER_END(handle_rts_timer);
}

static inline void LCII_handle_rtr(LCI_endpoint_t ep, LCII_packet_t* packet)
Expand Down
10 changes: 5 additions & 5 deletions lct/pcounter/pcounter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,19 @@ struct tls_ctx_t {
switch (handle.type) {
case LCT_PCOUNTER_COUNTER:
if (handle.idx >= counters.size()) {
counters.resize(handle.idx + 1);
counters.resize(handle.idx * 2 + 1);
}
counters[handle.idx].add(val);
break;
case LCT_PCOUNTER_TREND:
if (handle.idx >= trends.size()) {
trends.resize(handle.idx + 1);
trends.resize(handle.idx * 2 + 1);
}
trends[handle.idx].add(val);
break;
case LCT_PCOUNTER_TIMER:
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].add(val);
break;
Expand All @@ -141,7 +141,7 @@ struct tls_ctx_t {
", type: " + std::to_string(handle.type) + ", " +
std::to_string(handle.idx));
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].start(time);
lock.unlock();
Expand All @@ -155,7 +155,7 @@ struct tls_ctx_t {
std::to_string(handle.type) + " " +
std::to_string(handle.idx));
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].end(time);
lock.unlock();
Expand Down

0 comments on commit cdf2297

Please sign in to comment.