diff --git a/lci/backend/ibv/server_ibv.h b/lci/backend/ibv/server_ibv.h index 0d8b92e0..db955eeb 100644 --- a/lci/backend/ibv/server_ibv.h +++ b/lci/backend/ibv/server_ibv.h @@ -139,6 +139,7 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp, #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS LCIU_release_spinlock(&endpoint_p->cq_lock); #endif + if (ne > 0) LCII_PCOUNTER_ADD(net_poll_cq_num, ne); for (int i = 0; i < ne; i++) { LCI_DBG_Assert( wc[i].status == IBV_WC_SUCCESS, "Failed status %s (%d) for wr_id %d\n", diff --git a/lci/backend/ofi/server_ofi.h b/lci/backend/ofi/server_ofi.h index c8c712af..96643a66 100644 --- a/lci/backend/ofi/server_ofi.h +++ b/lci/backend/ofi/server_ofi.h @@ -133,6 +133,7 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp, LCISI_OFI_CS_EXIT(endpoint_p, LCI_BACKEND_TRY_LOCK_POLL) ret = ne; if (ne > 0) { + LCII_PCOUNTER_ADD(net_poll_cq_num, ne); // Got an entry here for (int i = 0; i < ne; i++) { if (fi_entry[i].flags & FI_RECV) { diff --git a/lci/profile/performance_counter.h b/lci/profile/performance_counter.h index 7683c507..eca53982 100644 --- a/lci/profile/performance_counter.h +++ b/lci/profile/performance_counter.h @@ -30,6 +30,7 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx; _macro(net_send_failed_lock) \ _macro(net_send_failed_nomem) \ _macro(net_recv_failed_nopacket) \ + _macro(net_poll_cq_num) \ _macro(progress_call) \ _macro(packet_get) \ _macro(packet_put) \ @@ -42,6 +43,8 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx; _macro(unexpected_msg) #define LCII_PCOUNTER_TIMER_FOR_EACH(_macro) \ + _macro(sync_stay_timer) \ + _macro(cq_stay_timer) \ _macro(useful_progress_timer) \ _macro(refill_rq_timer) \ _macro(update_posted_recv) \ @@ -50,11 +53,11 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx; _macro(cq_push_timer) \ _macro(cq_pop_timer) \ _macro(serve_rts_timer) \ - _macro(rts_mem_timer) \ + _macro(rts_mem_timer) \ _macro(rts_send_timer) \ _macro(serve_rtr_timer) \ _macro(rtr_mem_reg_timer) \ - _macro(rtr_put_timer) \ + _macro(rtr_put_timer) \ _macro(serve_rdma_timer) \ _macro(packet_stealing_timer) \ _macro(mem_reg_timer) \ diff --git a/lci/runtime/completion/cq.c b/lci/runtime/completion/cq.c index 25e03382..d90cee2a 100644 --- a/lci/runtime/completion/cq.c +++ b/lci/runtime/completion/cq.c @@ -68,6 +68,7 @@ LCI_error_t LCI_queue_pop(LCI_comp_t cq, LCI_request_t* request) LCII_context_t* ctx = LCT_queue_pop(cq); #endif if (ctx == NULL) return LCI_ERR_RETRY; + LCII_PCOUNTER_ADD(cq_stay_timer, LCT_now() - ctx->time); *request = LCII_ctx2req(ctx); LCII_PCOUNTER_ADD(comp_consume, 1); return LCI_OK; @@ -83,6 +84,7 @@ LCI_error_t LCI_queue_wait(LCI_comp_t cq, LCI_request_t* request) ctx = LCT_queue_pop(cq); #endif } + LCII_PCOUNTER_ADD(cq_stay_timer, LCT_now() - ctx->time); *request = LCII_ctx2req(ctx); LCII_PCOUNTER_ADD(comp_consume, 1); return LCI_OK; @@ -101,6 +103,7 @@ LCI_error_t LCI_queue_pop_multiple(LCI_comp_t cq, size_t request_count, ctx = LCT_queue_pop(cq); #endif if (ctx != NULL) { + LCII_PCOUNTER_ADD(cq_stay_timer, LCT_now() - ctx->time); requests[count] = LCII_ctx2req(ctx); ++count; } else { @@ -124,6 +127,7 @@ LCI_error_t LCI_queue_wait_multiple(LCI_comp_t cq, size_t request_count, ctx = LCT_queue_pop(cq); #endif if (ctx != NULL) { + LCII_PCOUNTER_ADD(cq_stay_timer, LCT_now() - ctx->time); requests[count] = LCII_ctx2req(ctx); ++count; } else { diff --git a/lci/runtime/completion/cq.h b/lci/runtime/completion/cq.h index e7e434e1..a92ea2cc 100644 --- a/lci/runtime/completion/cq.h +++ b/lci/runtime/completion/cq.h @@ -11,6 +11,9 @@ static inline void LCII_queue_push(LCI_comp_t cq, LCII_context_t* ctx) #ifdef LCI_USE_INLINE_CQ LCM_aqueue_push(cq, ctx); #else +#ifdef LCI_USE_PERFORMANCE_COUNTER + ctx->time = LCT_now(); +#endif LCT_queue_push(cq, ctx); #endif LCII_PCOUNTER_END(cq_push_timer); diff --git a/lci/runtime/completion/sync_flag.c b/lci/runtime/completion/sync_flag.c index 27fcfa11..3c5a47c8 100644 --- a/lci/runtime/completion/sync_flag.c +++ b/lci/runtime/completion/sync_flag.c @@ -45,6 +45,9 @@ LCI_error_t LCII_sync_signal(LCI_comp_t completion, LCII_context_t* ctx) { LCII_sync_t* sync = completion; LCI_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n"); +#ifdef LCI_USE_PERFORMANCE_COUNTER + ctx->time = LCT_now(); +#endif uint_fast64_t tail = 0; uint_fast64_t pos = 0; if (sync->threshold > 1) { @@ -98,14 +101,13 @@ LCI_error_t LCI_sync_test(LCI_comp_t completion, LCI_request_t request[]) &sync->tail, &expected, top2, LCIU_memory_order_release, LCIU_memory_order_relaxed); if (succeed) { - if (request) - for (int i = 0; i < sync->threshold; ++i) { + for (int i = 0; i < sync->threshold; ++i) { + LCII_PCOUNTER_ADD(sync_stay_timer, LCT_now() - sync->ctx[i]->time); + if (request) request[i] = LCII_ctx2req(sync->ctx[i]); - } - else - for (int i = 0; i < sync->threshold; ++i) { + else LCIU_free(sync->ctx[i]); - } + } LCII_PCOUNTER_ADD(comp_consume, sync->threshold); return LCI_OK; } else { diff --git a/lci/runtime/lcii.h b/lci/runtime/lcii.h index 4d92e804..f817ec90 100644 --- a/lci/runtime/lcii.h +++ b/lci/runtime/lcii.h @@ -153,6 +153,9 @@ typedef struct __attribute__((aligned(LCI_CACHE_LINE))) { LCI_tag_t tag; // 4 bytes // used by LCI internally LCI_comp_t completion; // 8 bytes +#ifdef LCI_USE_PERFORMANCE_COUNTER + LCT_time_t time; // 8 bytes +#endif } LCII_context_t; /** * comp_type: user-defined comp_type diff --git a/lct/api/lct.h b/lct/api/lct.h index b1b4e089..43a59f7d 100644 --- a/lct/api/lct.h +++ b/lct/api/lct.h @@ -28,7 +28,7 @@ LCT_API int LCT_get_rank(); extern char LCT_hostname[HOST_NAME_MAX + 1]; // time -typedef uint64_t LCT_time_t; +typedef int64_t LCT_time_t; LCT_API LCT_time_t LCT_now(); LCT_API double LCT_time_to_ns(LCT_time_t time); LCT_API double LCT_time_to_us(LCT_time_t time); diff --git a/lct/pcounter/pcounter.cpp b/lct/pcounter/pcounter.cpp index 445dd21e..3001a5b7 100644 --- a/lct/pcounter/pcounter.cpp +++ b/lct/pcounter/pcounter.cpp @@ -59,20 +59,25 @@ struct timer_t { } void end(LCT_time_t time) { - entry.add(static_cast(time - start_time)); + entry.add(time - start_time); start_time = 0; --start_count; } - void add(LCT_time_t time) { entry.add(static_cast(time)); } + void add(LCT_time_t time) { entry.add(time); } [[nodiscard]] entry_t get() const { entry_t ret = entry; if (consecutive_start) { // min and max is not valid - ret.min = -1; - ret.max = -1; + ret.min = 1; + ret.max = 0; } - ret.total = static_cast(LCT_time_to_ns(ret.total)); + // else { + // ret.min = static_cast(LCT_time_to_ns(ret.min)); + // ret.max = static_cast(LCT_time_to_ns(ret.max)); + // } + // ret.total = static_cast(LCT_time_to_ns(ret.total)); + // Here we exploit the fact that LCT_time_to_ns is an identity function. return ret; } bool consecutive_start; diff --git a/lct/util/time.cpp b/lct/util/time.cpp index 8906ab39..99df3028 100644 --- a/lct/util/time.cpp +++ b/lct/util/time.cpp @@ -10,7 +10,7 @@ LCT_time_t LCT_now() fprintf(stderr, "Cannot get time!\n"); abort(); } - return t1.tv_sec * long(1e9) + t1.tv_nsec; + return (LCT_time_t)t1.tv_sec * (LCT_time_t)1e9 + (LCT_time_t)t1.tv_nsec; } double LCT_time_to_ns(LCT_time_t time) { return (double)time; }