Skip to content

Commit

Permalink
DAOS-16971 utils: two patches from Jerome to fix progress hang
Browse files Browse the repository at this point in the history
Apply two patches from Jerome to fix a progress hang: utils/mrecv_err.patch for mercury and utils/ofi_tcp.patch for ofi.

Signed-off-by: Di Wang <[email protected]>
  • Loading branch information
wangdi1 committed Jan 23, 2025
1 parent 8ddde42 commit 64ae3f6
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 1 deletion.
1 change: 1 addition & 0 deletions site_scons/prereq_tools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,7 @@ def _resolve_patches(self):
if "^" in raw:
(patch_subdir, raw) = raw.split("^")
if "https://" not in raw:
raw = os.path.join(Dir('#').abspath, raw)
patches[raw] = patch_subdir
continue
patch_name = f"{self.name}_{self._sanitize_patch_path(raw)}_{patchnum:d}"
Expand Down
3 changes: 2 additions & 1 deletion utils/build.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@ ucx=https://github.com/openucx/ucx.git
[patch_versions]
spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch,utils/mrecv_err.patch
pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
ofi=utils/ofi_tcp.patch
60 changes: 60 additions & 0 deletions utils/mrecv_err.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
diff --git a/src/mercury_core.c b/src/mercury_core.c
index 3c177630..c6150852 100644
--- a/src/mercury_core.c
+++ b/src/mercury_core.c
@@ -4727,11 +4727,13 @@ hg_core_multi_recv_input_cb(const struct na_cb_info *callback_info)
} else if (callback_info->ret == NA_CANCELED) {
HG_LOG_SUBSYS_DEBUG(
rpc, "NA_CANCELED event on multi-recv op %d", multi_recv_op->id);
- hg_atomic_decr32(&context->multi_recv_op_count);
+ if (na_cb_info_multi_recv_unexpected->last)
+ hg_atomic_decr32(&context->multi_recv_op_count);
} else {
HG_LOG_SUBSYS_ERROR(rpc, "NA callback returned error (%s)",
NA_Error_to_string(callback_info->ret));
- hg_atomic_decr32(&context->multi_recv_op_count);
+ if (na_cb_info_multi_recv_unexpected->last)
+ hg_atomic_decr32(&context->multi_recv_op_count);
/* TODO can an unexpected multi-recv operation ever fail? */
}

diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c
index 92ddacc4..ea1187be 100644
--- a/src/na/na_ofi.c
+++ b/src/na/na_ofi.c
@@ -6558,9 +6558,21 @@ na_ofi_cq_readerr(struct fid_cq *cq, struct fi_cq_tagged_entry *cq_event,
!(hg_atomic_get32(&na_ofi_op_id->status) & NA_OFI_OP_CANCELED),
"Operation ID was not canceled by user");
*/
+ // if (na_ofi_op_id->type == NA_CB_MULTI_RECV_UNEXPECTED) {
+ // if (cq_err.flags & FI_MULTI_RECV)
+ // NA_LOG_SUBSYS_WARNING(
+ // op, "FI_ECANCELED reported on multi-recv (completed)");
+ // else
+ // NA_LOG_SUBSYS_WARNING(op,
+ // "FI_ECANCELED reported on multi-recv (not completed)");
+ // }

/* Complete operation in canceled state */
- na_ofi_op_id->complete(na_ofi_op_id, true, NA_CANCELED);
+ na_ofi_op_id->complete(na_ofi_op_id,
+ (na_ofi_op_id->type == NA_CB_MULTI_RECV_UNEXPECTED)
+ ? cq_err.flags & FI_MULTI_RECV
+ : true,
+ NA_CANCELED);
} break;

case FI_EADDRNOTAVAIL:
@@ -6623,7 +6635,11 @@ na_ofi_cq_readerr(struct fid_cq *cq, struct fi_cq_tagged_entry *cq_event,
na_ofi_op_id->addr->fi_addr, NA_HOSTUNREACH);

/* Complete operation in error state */
- na_ofi_op_id->complete(na_ofi_op_id, true, na_ret);
+ na_ofi_op_id->complete(na_ofi_op_id,
+ (na_ofi_op_id->type == NA_CB_MULTI_RECV_UNEXPECTED)
+ ? cq_err.flags & FI_MULTI_RECV
+ : true,
+ na_ret);
}
break;
}
26 changes: 26 additions & 0 deletions utils/ofi_tcp.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
diff --git a/prov/tcp/src/xnet_cq.c b/prov/tcp/src/xnet_cq.c
index 03ea97537..2090bdf71 100644
--- a/prov/tcp/src/xnet_cq.c
+++ b/prov/tcp/src/xnet_cq.c
@@ -202,13 +202,15 @@ void xnet_report_error(struct xnet_xfer_entry *xfer_entry, int err)

err_entry.flags = xfer_entry->cq_flags & ~FI_COMPLETION;
if (err_entry.flags & FI_RECV) {
- if (xfer_entry->ctrl_flags & XNET_MULTI_RECV &&
- xfer_entry->mrecv) {
- xfer_entry->mrecv->ref_cnt--;
- if (!xfer_entry->mrecv->ref_cnt) {
+ if (xfer_entry->ctrl_flags & XNET_MULTI_RECV) {
+ if (xfer_entry->mrecv) {
+ xfer_entry->mrecv->ref_cnt--;
+ if (!xfer_entry->mrecv->ref_cnt) {
+ err_entry.flags |= FI_MULTI_RECV;
+ free(xfer_entry->mrecv);
+ }
+ } else
err_entry.flags |= FI_MULTI_RECV;
- free(xfer_entry->mrecv);
- }
}
xnet_get_cq_info(xfer_entry, &err_entry.flags, &err_entry.data,
&err_entry.tag);

0 comments on commit 64ae3f6

Please sign in to comment.