From 2be8604de759c2dfe3fa3e2b2109247f4c761b59 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 23 Apr 2024 21:15:51 +0000 Subject: [PATCH] prov/efa: Do not assert tx and rx msg order must equal The tx_attr at the sender should align with the rx_attr at the receiver. But that's an application level responsibility to handle. The tx_attr and rx_attr of a single endpoint are likely to be consistent, but that's not required. Efa doesn't have to generate an error when an inconsitency is detected, but can generate a warning for it. Also added unit tests. Signed-off-by: Shi Jin --- prov/efa/src/efa_user_info.c | 5 ++ prov/efa/src/rdm/efa_rdm_ep.h | 5 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 3 -- prov/efa/test/efa_unit_test_ep.c | 78 ++++++++++++++++++++++++++++- prov/efa/test/efa_unit_tests.c | 3 ++ prov/efa/test/efa_unit_tests.h | 3 ++ 6 files changed, 89 insertions(+), 8 deletions(-) diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 17fae308b97..1d56554ea0d 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -434,6 +434,11 @@ int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_i info->rx_attr->msg_order &= hints->rx_attr->msg_order; } + if (info->tx_attr->msg_order != info->rx_attr->msg_order) + EFA_INFO(FI_LOG_EP_CTRL, "Inconsistent tx/rx msg order. Tx msg order: %lu, Rx msg order: %lu. " + "Libfabric can proceed but it is recommended to align the tx and rx msg order.\n", + info->tx_attr->msg_order, info->rx_attr->msg_order); + /* We only support manual progress for RMA operations */ if (hints->caps & FI_RMA) { info->domain_attr->control_progress = FI_PROGRESS_MANUAL; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 2b52d9ad467..4469a27d29d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -75,9 +75,6 @@ struct efa_rdm_ep { /* Resource management flag */ uint64_t rm_full; - /* application's ordering requirements */ - uint64_t msg_order; - /* Application's maximum msg size hint */ size_t max_msg_size; @@ -244,7 +241,7 @@ static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep) static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep) { - return ep->msg_order & FI_ORDER_SAS; + return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS)); } diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 90e696bbb9a..0585338d7b9 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -471,9 +471,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); efa_rdm_ep->shm_permitted = true; - - assert(info->tx_attr->msg_order == info->rx_attr->msg_order); - efa_rdm_ep->msg_order = info->rx_attr->msg_order; efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size; efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size(); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 2dfc84548a3..50d84f36023 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -633,4 +633,80 @@ void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(struct efa_resourc test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(state, -FI_EOPNOTSUPP, true); } -#endif \ No newline at end of file +#endif + +static void +test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, bool expected_use_zcpy_rx) { + struct efa_rdm_ep *ep; + + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, false, false); + + ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_true(ep->use_zcpy_rx == expected_use_zcpy_rx); +} + +/** + * @brief Verify zcpy_rx is enabled when the following requirements are met: + * 1. app doesn't require FI_ORDER_SAS in tx or rx's msg_order + * 2. app uses FI_MSG_PREFIX mode + * 3. app's max msg size is smaller than mtu_size - prefix_size + * 4. app doesn't use FI_DIRECTED_RECV, FI_TAGGED, FI_ATOMIC capability + */ +void test_efa_rdm_ep_user_zcpy_rx_happy(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->mode = FI_MSG_PREFIX; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, true); +} + +/** + * @brief When sas is requested for either tx or rx. zcpy will be disabled + */ +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + + resource->hints->tx_attr->msg_order = FI_ORDER_SAS; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->mode = FI_MSG_PREFIX; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false); +} + +/** + * @brief zcpy will be disabled if app doesn't use FI_MSG_PREFIX mode. + */ +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 2210ba668af..fe075e57919 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -95,6 +95,9 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_send_with_shm_no_copy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_atomic_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 013e61c69a8..07259f285ed 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -109,6 +109,9 @@ void test_efa_rdm_ep_atomic_without_caps(); void test_efa_rdm_ep_setopt_shared_memory_permitted(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); +void test_efa_rdm_ep_user_zcpy_rx_happy(); +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(); +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll();