From cdbf7ad795cdc9a83ff03f21eb4883eb36ad033a Mon Sep 17 00:00:00 2001 From: Sylvain Didelot Date: Mon, 13 Nov 2023 14:11:54 +0000 Subject: [PATCH] prov/verbs: Async route resolution and non-blocking EP creation The patch implements the last changes to make the EP creation non-blocking. The address is now resolved asynchronously when connection is established, which does no longer block the caller. The connection initiator creates the QP when the route is resolved. As for the connection target, it creates the QP when the EP is enabled (same as it was before). Note that the call to rdma_resolve_addr() in vrb_msg_ep_connect() becomes asynchronous because the EP CM id is associated with an event channel when the endpoint is bound to an event queue, which happend before the EP is enabled. Signed-off-by: Sylvain Didelot --- prov/verbs/src/verbs_cm.c | 12 ++++++++---- prov/verbs/src/verbs_ep.c | 20 ++++++++++++-------- prov/verbs/src/verbs_eq.c | 1 + prov/verbs/src/verbs_init.c | 20 -------------------- 4 files changed, 21 insertions(+), 32 deletions(-) mode change 100644 => 100755 prov/verbs/src/verbs_eq.c diff --git a/prov/verbs/src/verbs_cm.c b/prov/verbs/src/verbs_cm.c index 4ff3dcd5e2b..bf34b9f9500 100644 --- a/prov/verbs/src/verbs_cm.c +++ b/prov/verbs/src/verbs_cm.c @@ -199,11 +199,15 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); assert(ep->state == VRB_IDLE); - ep->state = VRB_RESOLVE_ROUTE; - ret = rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT); - if (ret) { + ep->state = VRB_RESOLVE_ADDR; + if (rdma_resolve_addr(ep->id, ep->info_attr.src_addr, + ep->info_attr.dest_addr, VERBS_RESOLVE_TIMEOUT)) { ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_route"); + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "src addr", ep->info_attr.src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "dst addr", ep->info_attr.dest_addr); free(ep->cm_priv_data); ep->cm_priv_data = NULL; ep->state = VRB_IDLE; diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index 641dbe0a81b..4b99e091065 100755 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -1033,15 +1033,19 @@ static int vrb_ep_enable(struct fid_ep *ep_fid) return -FI_EINVAL; } - ret = rdma_create_qp(ep->id, domain->pd, &attr); - if (ret) { - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); - return -errno; - } + /* Server-side QP creation, after RDMA_CM_EVENT_CONNECT_REQUEST + * is recevied */ + if (ep->id->verbs && ep->ibv_qp == NULL) { + ret = rdma_create_qp(ep->id, domain->pd, &attr); + if (ret) { + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); + return -errno; + } - /* Allow shared XRC INI QP not controlled by RDMA CM - * to share same post functions as RC QP. */ - ep->ibv_qp = ep->id->qp; + /* Allow shared XRC INI QP not controlled by RDMA CM + * to share same post functions as RC QP. */ + ep->ibv_qp = ep->id->qp; + } break; case FI_EP_DGRAM: assert(domain); diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c old mode 100644 new mode 100755 index 03a3b9f87a3..6d6ea1e5009 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -875,6 +875,7 @@ vrb_eq_addr_resolved_event(struct vrb_ep *ep) if (ep->util_ep.type == FI_EP_MSG) { vrb_msg_ep_get_qp_attr(ep, &attr); + /* Client-side QP creation */ if (rdma_create_qp(ep->id, vrb_ep2_domain(ep)->pd, &attr)) { ep->state = VRB_DISCONNECTED; ret = -errno; diff --git a/prov/verbs/src/verbs_init.c b/prov/verbs/src/verbs_init.c index 6069dd3d46c..05183d0f9d4 100644 --- a/prov/verbs/src/verbs_init.c +++ b/prov/verbs/src/verbs_init.c @@ -338,29 +338,9 @@ int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps, goto err1; } - /* TODO convert this call to non-blocking (use event channel) as well: - * This may likely be needed for better scaling when running large - * MPI jobs. - * Making this non-blocking would mean we can't create QP at EP enable - * time. We need to wait for RDMA_CM_EVENT_ADDR_RESOLVED event before - * creating the QP using rdma_create_qp. It would also require a SW - * receive queue to store recvs posted by app after enabling the EP. - */ - if (rdma_resolve_addr(*id, rai->ai_src_addr, rai->ai_dst_addr, - VERBS_RESOLVE_TIMEOUT)) { - ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "src addr", rai->ai_src_addr); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "dst addr", rai->ai_dst_addr); - goto err2; - } rdma_freeaddrinfo(rai); return 0; -err2: - rdma_destroy_id(*id); err1: rdma_freeaddrinfo(rai); return ret;