diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index e39553f99ee..a7705098403 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -84,6 +84,8 @@ No support for counters for the DGRAM endpoint. No support for inject. +No support for `fi_cancel()` in [zero-copy receive mode](https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#48-user-receive-qp-feature--request-and-zero-copy-receive). + When using FI_HMEM for AWS Neuron or Habana SynapseAI buffers, the provider requires peer to peer transaction support between the EFA and the FI_HMEM device. Therefore, the FI_HMEM_P2P_DISABLED option is not supported by the EFA diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 2cbd80c561e..33954a25777 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -1599,6 +1599,10 @@ If the receiver supports it, sender will then send packets with user data to the there is no ordering or tagging requirement, and the receiver already knows the sender, sender can send packets without any headers in the payload. If the receiver doesn't support this extra feature, the sender will continue send packets with headers to the receiver's default QP. + +On the receiver side, the user receive buffer is posted directly to the user recv QP when the user +calls fi_recv(). Currently such a receive cannot be cancelled, so fi_cancel() is not supported in +zero-copy receive mode. If a receiver gets RTM packets delivered to its default QP, it raises an error because it requests all RTM packets must be delivered to its user recv QP. 
diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index f0eaa928ceb..39cc5ce13ae 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1313,6 +1313,11 @@ ssize_t efa_rdm_ep_cancel(fid_t fid_ep, void *context) struct efa_rdm_ep *ep; ep = container_of(fid_ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); + if (ep->use_zcpy_rx) { /* zero-copy receive mode: fail here instead of forwarding the cancel to peer_srx_ep */ + EFA_WARN(FI_LOG_EP_CTRL, "fi_cancel is not supported in zero-copy receive mode.\n"); + return -FI_EOPNOTSUPP; + } + return ep->peer_srx_ep->ops->cancel(&ep->peer_srx_ep->fid, context); } diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 8ba0041620e..d05672d4c6f 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1017,4 +1017,35 @@ void test_efa_rdm_ep_close_discard_posted_recv(struct efa_resource **state) /* Reset to NULL to avoid test reaper closing again */ resource->ep = NULL; -} \ No newline at end of file +} + /* Verify that fi_cancel() on a receive posted in zero-copy receive mode returns -FI_EOPNOTSUPP. */ +void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fi_context cancel_context = {0}; + struct efa_unit_test_buff recv_buff; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->caps = FI_MSG; + + /* enable zero-copy recv mode in ep */ + test_efa_rdm_ep_use_zcpy_rx_impl(resource, true); + + /* Construct a recv buffer with mr */ + efa_unit_test_buff_construct(&recv_buff, resource, 16); + + assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, &cancel_context), 0); + + assert_int_equal(fi_cancel((struct fid *)resource->ep, &cancel_context), -FI_EOPNOTSUPP); + + /** + * fi_cancel() was refused, so the buffer is still posted to rdma-core and + * closing the MR can return non-zero. That failure is deliberately ignored for now. 
+ */ + (void) fi_close(&recv_buff.mr->fid); + free(recv_buff.buff); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index d32b074622a..e5fadf9fecf 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -109,6 +109,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_close_discard_posted_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 5eae3cf25c6..2f445bb3637 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -117,6 +117,7 @@ void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); void test_efa_rdm_ep_user_zcpy_rx_happy(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(); void test_efa_rdm_ep_close_discard_posted_recv(); +void test_efa_rdm_ep_zcpy_recv_cancel(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll();