Skip to content

Commit

Permalink
prov/efa: Implement FI_MORE for send and rdma-write
Browse files Browse the repository at this point in the history
For send, FI_MORE is respected only for the eager pkt type,
because:
1. For some non-REQ pkts such as CTSDATA, the current implementation
relies on efa_rdm_ope_post_send always ringing the doorbell:
the ep progress call keeps calling this function until
ope->window is 0, but ope->window is only decremented after
the CTSDATA pkts are actually posted to rdma-core.
2. For non-eager REQ packets, we already send multiple pkts that contain
data and saturate the firmware, so there is no benefit in queuing
pkts in this case.

Signed-off-by: Shi Jin <[email protected]>
  • Loading branch information
shijin-aws committed Jun 7, 2024
1 parent bd4da6e commit e24e9ce
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 10 deletions.
1 change: 1 addition & 0 deletions prov/efa/src/efa_base_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct efa_base_ep {

bool util_ep_initialized;
bool efa_qp_enabled;
bool is_wr_started;

struct ibv_send_wr xmit_more_wr_head;
struct ibv_send_wr *xmit_more_wr_tail;
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_ep_progress.c
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep,
assert(((struct efa_rdm_rma_context_pkt *)pkt_entry->wiredata)->context_type == EFA_RDM_RDMA_WRITE_CONTEXT);
ret = efa_rdm_pke_write(pkt_entry);
} else {
ret = efa_rdm_pke_sendv(&pkt_entry, 1);
ret = efa_rdm_pke_sendv(&pkt_entry, 1, 0);
}

if (ret) {
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_ep_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ ssize_t efa_rdm_ep_post_handshake(struct efa_rdm_ep *ep, struct efa_rdm_peer *pe

efa_rdm_pke_init_handshake(pkt_entry, addr);

ret = efa_rdm_pke_sendv(&pkt_entry, 1);
ret = efa_rdm_pke_sendv(&pkt_entry, 1, 0);
if (OFI_UNLIKELY(ret)) {
efa_rdm_pke_release_tx(pkt_entry);
}
Expand Down
17 changes: 16 additions & 1 deletion prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -1691,6 +1691,7 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type)
size_t segment_offset;
int pkt_entry_cnt, pkt_entry_cnt_allocated = 0, pkt_entry_data_size_vec[EFA_RDM_EP_MAX_WR_PER_IBV_POST_SEND];
int i;
uint64_t flags = 0;

err = efa_rdm_ope_prepare_to_post_send(ope, pkt_type, &pkt_entry_cnt, pkt_entry_data_size_vec);
if (err)
Expand Down Expand Up @@ -1726,7 +1727,21 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type)

assert(pkt_entry_cnt == pkt_entry_cnt_allocated);

err = efa_rdm_pke_sendv(pkt_entry_vec, pkt_entry_cnt);
/**
* We currently respect FI_MORE only for eager pkt type because
* 1. For some non-REQ pkts like CTSDATA, its current implementation
* relies on the logic that efa_rdm_ope_post_send always rings the doorbell,
* because the ep progress call will keep calling this function until
* ope->window is 0, but ope->window will only be decremented after
* the CTSDATA pkts are actually posted to rdma-core.
* 2. For non-eager REQ packets, we already send multiple pkts that contain
* data and make the firmware saturated, there is no meaning to queue
* pkts in this case.
*/
if (ope->fi_flags & FI_MORE && efa_rdm_pkt_type_is_eager(pkt_type))
flags |= FI_MORE;

err = efa_rdm_pke_sendv(pkt_entry_vec, pkt_entry_cnt, flags);
if (err)
goto handle_err;

Expand Down
29 changes: 23 additions & 6 deletions prov/efa/src/rdm/efa_rdm_pke.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,13 @@ void efa_rdm_pke_append(struct efa_rdm_pke *dst,
*
* @param[in] pkt_entry_vec an array of packet entries to be sent
* @param[in] pkt_entry_cnt number of packet entries to be sent
* @param[in] flags flags; currently only 0 or FI_MORE is accepted. When FI_MORE
* is passed, the doorbell is not rung (ibv_wr_complete is not called).
* @return 0 on success
* On error, a negative value corresponding to fabric errno
*/
ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
int pkt_entry_cnt)
int pkt_entry_cnt, uint64_t flags)
{
struct efa_qp *qp;
struct efa_conn *conn;
Expand All @@ -363,7 +365,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
struct efa_rdm_peer *peer;
struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */
struct ibv_data_buf inline_data_list[2];
int ret, pkt_idx, iov_cnt;
int ret = 0, pkt_idx, iov_cnt;

assert(pkt_entry_cnt);
ep = pkt_entry_vec[0]->ep;
Expand All @@ -379,7 +381,10 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
assert(conn && conn->ep_addr);

qp = ep->base_ep.qp;
ibv_wr_start(qp->ibv_qp_ex);
if (!ep->base_ep.is_wr_started) {
ibv_wr_start(qp->ibv_qp_ex);
ep->base_ep.is_wr_started = true;
}
for (pkt_idx = 0; pkt_idx < pkt_entry_cnt; ++pkt_idx) {
pkt_entry = pkt_entry_vec[pkt_idx];
assert(pkt_entry->pkt_size);
Expand Down Expand Up @@ -429,7 +434,11 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
#endif
}

ret = ibv_wr_complete(qp->ibv_qp_ex);
if (!(flags & FI_MORE)) {
ret = ibv_wr_complete(qp->ibv_qp_ex);
ep->base_ep.is_wr_started = false;
}

if (OFI_UNLIKELY(ret)) {
return ret;
}
Expand Down Expand Up @@ -506,6 +515,8 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry,
* This function posts one write request.
*
* @param[in] pkt_entry write_entry that has information of the write request.
* @note This function has no flags argument. When FI_MORE is set in
* txe->fi_flags, the doorbell is not rung (ibv_wr_complete is not called).
* @return On success, return 0
* On failure, return a negative error code.
*/
Expand Down Expand Up @@ -543,7 +554,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry)
pkt_entry->flags |= EFA_RDM_PKE_LOCAL_WRITE;

qp = ep->base_ep.qp;
ibv_wr_start(qp->ibv_qp_ex);
if (!ep->base_ep.is_wr_started) {
ibv_wr_start(qp->ibv_qp_ex);
ep->base_ep.is_wr_started = true;
}
qp->ibv_qp_ex->wr_id = (uintptr_t)pkt_entry;

if (txe->fi_flags & FI_REMOTE_CQ_DATA) {
Expand Down Expand Up @@ -574,7 +588,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry)
conn->ep_addr->qpn, conn->ep_addr->qkey);
}

err = ibv_wr_complete(qp->ibv_qp_ex);
if (!(txe->fi_flags & FI_MORE)) {
err = ibv_wr_complete(qp->ibv_qp_ex);
ep->base_ep.is_wr_started = false;
}

if (OFI_UNLIKELY(err))
return err;
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_pke.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ struct efa_rdm_pke *efa_rdm_pke_clone(struct efa_rdm_pke *src,
struct efa_rdm_pke *efa_rdm_pke_get_unexp(struct efa_rdm_pke **pkt_entry_ptr);

ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
int pkt_entry_cnt);
int pkt_entry_cnt, uint64_t flags);

int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry,
void *local_buf, size_t len, void *desc,
Expand Down

0 comments on commit e24e9ce

Please sign in to comment.