Skip to content

Commit

Permalink
UCP/WIREUP: Don't discard CM lane
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitrygx committed Nov 5, 2020
1 parent f7d0c13 commit bfb2958
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 25 deletions.
24 changes: 18 additions & 6 deletions src/ucp/core/ucp_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,12 +481,24 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg)
continue;
}

ucs_trace("ep %p: discard uct_ep[%d]=%p", ucp_ep, lane,
ucp_ep->uct_eps[lane]);
ucp_worker_discard_uct_ep(ucp_ep->worker, ucp_ep->uct_eps[lane],
UCT_FLUSH_FLAG_CANCEL,
ucp_ep_err_pending_purge,
UCS_STATUS_PTR(status));
if (lane == ucp_ep_get_cm_lane(ucp_ep)) {
/* Don't schedule the UCT CM lane to be discarded: the UCP EP will be
* destroyed due to peer failure, and ucp_cm_disconnect_cb() could be
* invoked on the async thread after the UCP EP is destroyed but before
* the UCT CM EP is destroyed by the discarding functionality. In that
* case, the UCP EP would be passed as a corrupted argument to
* ucp_cm_disconnect_cb() */
ucs_trace("ep %p: destroy uct_ep[%d]=%p", ucp_ep, lane,
ucp_ep->uct_eps[lane]);
uct_ep_destroy(ucp_ep->uct_eps[lane]);
} else {
ucs_trace("ep %p: discard uct_ep[%d]=%p", ucp_ep, lane,
ucp_ep->uct_eps[lane]);
ucp_worker_discard_uct_ep(ucp_ep->worker, ucp_ep->uct_eps[lane],
UCT_FLUSH_FLAG_CANCEL,
ucp_ep_err_pending_purge,
UCS_STATUS_PTR(status));
}

ucp_ep->uct_eps[lane] = &ucp_failed_tl_ep;
}

Expand Down
1 change: 1 addition & 0 deletions src/ucp/wireup/wireup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,7 @@ ucp_wireup_check_config_intersect(ucp_ep_h ep,
reuse_lane = reuse_lane_map[lane];
if (reuse_lane == UCP_NULL_RESOURCE) {
if (ep->uct_eps[lane] != NULL) {
ucs_assert(lane != ucp_ep_get_cm_lane(ep));
ucp_worker_discard_uct_ep(worker, ep->uct_eps[lane],
UCT_FLUSH_FLAG_LOCAL,
ucp_wireup_pending_purge_cb,
Expand Down
21 changes: 2 additions & 19 deletions src/ucp/wireup/wireup_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -761,37 +761,20 @@ static void ucp_cm_disconnect_cb(uct_ep_h uct_cm_ep, void *arg)
uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL;
ucp_worker_h worker = ucp_ep->worker;
uct_ep_h uct_ep;
int discard_uct_ep;

ucs_trace("ep %p: CM remote disconnect callback invoked, flags 0x%x",
ucp_ep, ucp_ep->flags);

uct_ep = ucp_ep_get_cm_uct_ep(ucp_ep);
if (uct_ep == NULL) {
UCS_ASYNC_BLOCK(&worker->async);
discard_uct_ep = ucp_worker_is_uct_ep_discarding(worker, uct_cm_ep);
UCS_ASYNC_UNBLOCK(&worker->async);

if (discard_uct_ep) {
/* The CM lane may no longer exist if the error was detected on the
* transport lane and all UCT lanes have already been discarded */
ucs_diag("ep %p: UCT EP %p for CM lane doesn't exist, it"
" has already been discarded", ucp_ep, uct_cm_ep);
return;
}

ucs_fatal("ep %p: UCT EP for CM lane doesn't exist", ucp_ep);
}

ucs_assertv_always(uct_cm_ep == uct_ep,
"%p: uct_cm_ep=%p vs found_uct_ep=%p",
ucp_ep, uct_cm_ep, uct_ep);

uct_worker_progress_register_safe(ucp_ep->worker->uct,
uct_worker_progress_register_safe(worker->uct,
ucp_ep_cm_disconnect_progress,
ucp_ep, UCS_CALLBACKQ_FLAG_ONESHOT,
&prog_id);
ucp_worker_signal_internal(ucp_ep->worker);
ucp_worker_signal_internal(worker);
}

ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep)
Expand Down

0 comments on commit bfb2958

Please sign in to comment.