From e24e9ce5a5eacf78c0e482aa6785f67ccfca9246 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 24 May 2024 00:34:30 +0000 Subject: [PATCH] prov/efa: Implement FI_MORE for send and rdma-write For send, only respect FI_MORE for eager pkt type because 1. For some non-REQ pkts like CTSDATA, the current implementation relies on efa_rdm_ope_post_send always ringing the doorbell: the ep progress call keeps calling this function until ope->window is 0, but ope->window is only decremented after the CTSDATA pkts are actually posted to rdma-core. 2. For non-eager REQ packets, we already send multiple pkts that contain data and saturate the firmware, so there is no benefit in queueing pkts in this case. Signed-off-by: Shi Jin --- prov/efa/src/efa_base_ep.h | 1 + prov/efa/src/rdm/efa_rdm_ep_progress.c | 2 +- prov/efa/src/rdm/efa_rdm_ep_utils.c | 2 +- prov/efa/src/rdm/efa_rdm_ope.c | 17 ++++++++++++++- prov/efa/src/rdm/efa_rdm_pke.c | 29 ++++++++++++++++++++------ prov/efa/src/rdm/efa_rdm_pke.h | 2 +- 6 files changed, 43 insertions(+), 10 deletions(-) diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 5de64cf13e4..a6a126b2900 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -46,6 +46,7 @@ struct efa_base_ep { bool util_ep_initialized; bool efa_qp_enabled; + bool is_wr_started; struct ibv_send_wr xmit_more_wr_head; struct ibv_send_wr *xmit_more_wr_tail; diff --git a/prov/efa/src/rdm/efa_rdm_ep_progress.c b/prov/efa/src/rdm/efa_rdm_ep_progress.c index ec0323418c8..f1b2a18453c 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_progress.c +++ b/prov/efa/src/rdm/efa_rdm_ep_progress.c @@ -286,7 +286,7 @@ ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep, assert(((struct efa_rdm_rma_context_pkt *)pkt_entry->wiredata)->context_type == EFA_RDM_RDMA_WRITE_CONTEXT); ret = efa_rdm_pke_write(pkt_entry); } else { - ret = efa_rdm_pke_sendv(&pkt_entry, 1); + ret = efa_rdm_pke_sendv(&pkt_entry, 1, 0); } if (ret) {
diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index bc0f2067a13..9917e76dcae 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -617,7 +617,7 @@ ssize_t efa_rdm_ep_post_handshake(struct efa_rdm_ep *ep, struct efa_rdm_peer *pe efa_rdm_pke_init_handshake(pkt_entry, addr); - ret = efa_rdm_pke_sendv(&pkt_entry, 1); + ret = efa_rdm_pke_sendv(&pkt_entry, 1, 0); if (OFI_UNLIKELY(ret)) { efa_rdm_pke_release_tx(pkt_entry); } diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index e4765b52473..e0a7e327215 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1691,6 +1691,7 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) size_t segment_offset; int pkt_entry_cnt, pkt_entry_cnt_allocated = 0, pkt_entry_data_size_vec[EFA_RDM_EP_MAX_WR_PER_IBV_POST_SEND]; int i; + uint64_t flags = 0; err = efa_rdm_ope_prepare_to_post_send(ope, pkt_type, &pkt_entry_cnt, pkt_entry_data_size_vec); if (err) @@ -1726,7 +1727,21 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) assert(pkt_entry_cnt == pkt_entry_cnt_allocated); - err = efa_rdm_pke_sendv(pkt_entry_vec, pkt_entry_cnt); + /** + * We currently respect FI_MORE only for eager pkt type because + * 1. For some non-REQ pkts like CTSDATA, the current implementation + * relies on the logic that efa_rdm_ope_post_send always rings the doorbell, + * because the ep progress call will keep calling this function until + * ope->window is 0, but ope->window will only be decremented after + * the CTSDATA pkts are actually posted to rdma-core. + * 2. For non-eager REQ packets, we already send multiple pkts that contain + * data and saturate the firmware, so there is no benefit in queueing + * pkts in this case.
+ */ + if (ope->fi_flags & FI_MORE && efa_rdm_pkt_type_is_eager(pkt_type)) + flags |= FI_MORE; + + err = efa_rdm_pke_sendv(pkt_entry_vec, pkt_entry_cnt, flags); if (err) goto handle_err; diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 1abce84fe59..429f6741bb1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -350,11 +350,13 @@ void efa_rdm_pke_append(struct efa_rdm_pke *dst, * * @param[in] pkt_entry_vec an array of packet entries to be sent * @param[in] pkt_entry_cnt number of packet entries to be sent + * @param[in] flags flags; currently only 0 or FI_MORE is accepted. When FI_MORE + * is passed, the doorbell is not rung (ibv_wr_complete is not called). * @return 0 on success * On error, a negative value corresponding to fabric errno */ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, - int pkt_entry_cnt) + int pkt_entry_cnt, uint64_t flags) { struct efa_qp *qp; struct efa_conn *conn; @@ -363,7 +365,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, struct efa_rdm_peer *peer; struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */ struct ibv_data_buf inline_data_list[2]; - int ret, pkt_idx, iov_cnt; + int ret = 0, pkt_idx, iov_cnt; assert(pkt_entry_cnt); ep = pkt_entry_vec[0]->ep; @@ -379,7 +381,10 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, assert(conn && conn->ep_addr); qp = ep->base_ep.qp; - ibv_wr_start(qp->ibv_qp_ex); + if (!ep->base_ep.is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + ep->base_ep.is_wr_started = true; + } for (pkt_idx = 0; pkt_idx < pkt_entry_cnt; ++pkt_idx) { pkt_entry = pkt_entry_vec[pkt_idx]; assert(pkt_entry->pkt_size); @@ -429,7 +434,11 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, #endif } - ret = ibv_wr_complete(qp->ibv_qp_ex); + if (!(flags & FI_MORE)) { + ret = ibv_wr_complete(qp->ibv_qp_ex); + ep->base_ep.is_wr_started = false; + } + if (OFI_UNLIKELY(ret)) { return ret; } @@ -506,6 +515,8 @@ int
efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, * This function posts one write request. * * @param[in] pkt_entry write_entry that has information of the write request. + * @note This function takes no flags argument: when FI_MORE is set in + * txe->fi_flags, it doesn't ring the doorbell (ibv_wr_complete). * @return On success, return 0 * On failure, return a negative error code. */ @@ -543,7 +554,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) pkt_entry->flags |= EFA_RDM_PKE_LOCAL_WRITE; qp = ep->base_ep.qp; - ibv_wr_start(qp->ibv_qp_ex); + if (!ep->base_ep.is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + ep->base_ep.is_wr_started = true; + } qp->ibv_qp_ex->wr_id = (uintptr_t)pkt_entry; if (txe->fi_flags & FI_REMOTE_CQ_DATA) { @@ -574,7 +588,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) conn->ep_addr->qpn, conn->ep_addr->qkey); } - err = ibv_wr_complete(qp->ibv_qp_ex); + if (!(txe->fi_flags & FI_MORE)) { + err = ibv_wr_complete(qp->ibv_qp_ex); + ep->base_ep.is_wr_started = false; + } if (OFI_UNLIKELY(err)) return err; diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 50dcd187a0c..1072e376613 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -227,7 +227,7 @@ struct efa_rdm_pke *efa_rdm_pke_clone(struct efa_rdm_pke *src, struct efa_rdm_pke *efa_rdm_pke_get_unexp(struct efa_rdm_pke **pkt_entry_ptr); ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, - int pkt_entry_cnt); + int pkt_entry_cnt, uint64_t flags); int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, void *local_buf, size_t len, void *desc,