diff --git a/prov/opx/include/fi_opx_tid.h b/prov/opx/include/fi_opx_tid.h
index 29d1e46e4f5..436c640209d 100644
--- a/prov/opx/include/fi_opx_tid.h
+++ b/prov/opx/include/fi_opx_tid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2023 Cornelis Networks.
+ * Copyright (C) 2022-2024 Cornelis Networks.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -111,20 +111,11 @@ ctx->__hfi_tidexpcnt)
 	 */
 #define OPX_MAX_TID_COUNT 2048
 
-#define OPX_TID_VADDR(tid_reuse_cache) (tid_reuse_cache->tid_vaddr)
-#define OPX_TID_LENGTH(tid_reuse_cache) (tid_reuse_cache->tid_length)
-#define OPX_TID_NINFO(tid_reuse_cache) (tid_reuse_cache->ninfo)
-#define OPX_TID_INFO(tid_reuse_cache, idx) (tid_reuse_cache->info[idx])
-#define OPX_TID_NPAIRS(tid_reuse_cache) (tid_reuse_cache->npairs)
-#define OPX_TID_PAIR(tid_reuse_cache, idx) (tid_reuse_cache->pairs[idx])
-#define OPX_TID_IS_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid)
-#define OPX_TID_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid = 1)
-#define OPX_TID_VALID(tid_reuse_cache) (tid_reuse_cache->invalid = 0)
 #define OPX_TID_NPAGES(tid_reuse_cache, npages)                           \
 	do {                                                              \
 		npages = 0;                                               \
-		const uint32_t *tids = &OPX_TID_INFO(tid_reuse_cache, 0); \
-		const uint32_t ntids = OPX_TID_NINFO(tid_reuse_cache);    \
+		const uint32_t *tids = &tid_reuse_cache->info[0];         \
+		const uint32_t ntids = tid_reuse_cache->ninfo;            \
 		for (int i = 0; i < ntids; ++i) {                         \
 			npages += (int)FI_OPX_EXP_TID_GET(tids[i], LEN);  \
 			FI_DBG(fi_opx_global.prov, FI_LOG_MR,             \
@@ -211,10 +202,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 			__func__, __LINE__,                               \
 			string,                                           \
 			tid_vaddr, tid_vaddr + tid_length, tid_length,    \
-			OPX_TID_VADDR(tid_reuse_cache),                   \
-			OPX_TID_VADDR(tid_reuse_cache) +                  \
-				OPX_TID_LENGTH(tid_reuse_cache),          \
-			OPX_TID_LENGTH(tid_reuse_cache), count);          \
+			tid_reuse_cache->tid_vaddr,                       \
+			tid_reuse_cache->tid_vaddr +                      \
+				tid_reuse_cache->tid_length,              \
+			tid_reuse_cache->tid_length, count);              \
 		last_vaddr = tid_vaddr;                                   \
 		last_length = tid_length;                                 \
 		count = 0;                                                \
@@ -226,10 +217,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 			"tid vaddr [%#lx - %#lx] length %lu\n",           \
 			string, tid_vaddr,                                \
 			tid_vaddr + tid_length, tid_length,               \
-			OPX_TID_VADDR(tid_reuse_cache),                   \
-			OPX_TID_VADDR(tid_reuse_cache) +                  \
-				OPX_TID_LENGTH(tid_reuse_cache),          \
-			OPX_TID_LENGTH(tid_reuse_cache));                 \
+			tid_reuse_cache->tid_vaddr,                       \
+			tid_reuse_cache->tid_vaddr +                      \
+				tid_reuse_cache->tid_length,              \
+			tid_reuse_cache->tid_length);                     \
 	} while (0)
 #else
 /* noisier regular debug logging */
@@ -240,10 +231,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 		"tid vaddr [%#lx - %#lx] length %lu\n",                   \
 		string, tid_vaddr,                                        \
 		tid_vaddr + tid_length, tid_length,                       \
-		OPX_TID_VADDR(tid_reuse_cache),                           \
-		OPX_TID_VADDR(tid_reuse_cache) +                          \
-			OPX_TID_LENGTH(tid_reuse_cache),                  \
-		OPX_TID_LENGTH(tid_reuse_cache));
+		tid_reuse_cache->tid_vaddr,                               \
+		tid_reuse_cache->tid_vaddr +                              \
+			tid_reuse_cache->tid_length,                      \
+		tid_reuse_cache->tid_length);
 #endif
 
 /* Special debug for expected receive data ONLY */
@@ -253,8 +244,8 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 	static int count = 0;                                             \
 	static uint64_t last_vaddr = 0UL;                                 \
 	static int32_t last_length = 0;                                   \
-	if ((last_vaddr != OPX_TID_VADDR(tid_reuse_cache)) ||             \
-	    (last_length != OPX_TID_LENGTH(tid_reuse_cache))) {           \
+	if ((last_vaddr != tid_reuse_cache->tid_vaddr) ||                 \
+	    (last_length != tid_reuse_cache->tid_length)) {               \
 		fprintf(stderr,                                           \
 			"## %s:%u OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs "   \
 			"input vaddr [%#lx - %#lx] length %lu, "          \
@@ -262,13 +253,13 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 			"last count %u\n",                                \
 			__func__, __LINE__,                               \
 			string,                                           \
-			OPX_TID_VADDR(tid_reuse_cache),                   \
-			OPX_TID_VADDR(tid_reuse_cache) +                  \
-				OPX_TID_LENGTH(tid_reuse_cache),          \
-			OPX_TID_LENGTH(tid_reuse_cache), last_vaddr,      \
+			tid_reuse_cache->tid_vaddr,                       \
+			tid_reuse_cache->tid_vaddr +                      \
+				tid_reuse_cache->tid_length,              \
+			tid_reuse_cache->tid_length, last_vaddr,          \
 			last_vaddr + last_length, last_length, count);    \
-		last_vaddr = OPX_TID_VADDR(tid_reuse_cache);              \
-		last_length = OPX_TID_LENGTH(tid_reuse_cache);            \
+		last_vaddr = tid_reuse_cache->tid_vaddr;                  \
+		last_length = tid_reuse_cache->tid_length;                \
 		count = 0;                                                \
 	}                                                                 \
 	++count;                                                          \
@@ -279,10 +270,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
 		"OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs "                    \
 		"tid vaddr [%#lx - %#lx] length %lu\n",                   \
 		string,                                                   \
-		OPX_TID_VADDR(tid_reuse_cache),                           \
-		OPX_TID_VADDR(tid_reuse_cache) +                          \
-			OPX_TID_LENGTH(tid_reuse_cache),                  \
-		OPX_TID_LENGTH(tid_reuse_cache))
+		tid_reuse_cache->tid_vaddr,                               \
+		tid_reuse_cache->tid_vaddr +                              \
+			tid_reuse_cache->tid_length,                      \
+		tid_reuse_cache->tid_length)
 #endif
 
 #endif /* _FI_PROV_OPX_TID_H_ */
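Note: OPX_TID_NPAGES above totals the LEN field of each packed tidinfo word the driver returned. A minimal decode sketch, assuming the PSM2-style packing that FI_OPX_EXP_TID_GET expands to (the field positions below are an assumption; the authoritative shifts and masks live in the OPX headers):

#include <stdint.h>

/* Hypothetical decode of one packed tidinfo word (layout assumed). */
static inline uint32_t tid_len_pages(uint32_t tid)
{
	return tid & 0x7FF;            /* LEN: length in pages (assumed bits 0-10) */
}
static inline uint32_t tid_ctrl(uint32_t tid)
{
	return (tid >> 20) & 0x3;      /* CTRL: TID control value (assumed bits 20-21) */
}
static inline uint32_t tid_idx(uint32_t tid)
{
	return (tid >> 22) & 0x3FF;    /* IDX: RcvArray entry index (assumed bits 22-31) */
}
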
diff --git a/prov/opx/include/fi_opx_tid_cache.h b/prov/opx/include/fi_opx_tid_cache.h
index 387632863bb..3c365f620f5 100644
--- a/prov/opx/include/fi_opx_tid_cache.h
+++ b/prov/opx/include/fi_opx_tid_cache.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2023 Cornelis Networks.
+ * Copyright (C) 2022-2024 Cornelis Networks.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -96,6 +96,8 @@ struct fi_opx_hfi1_rx_rzv_rts_params;
  * returns non-zero on failure (fallback to Eager rendezvous)
  */
 int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params,
-			 const uint64_t tid_vaddr, const uint64_t tid_length);
+			 const uint64_t tid_vaddr, const uint64_t tid_length,
+			 const enum fi_hmem_iface tid_iface,
+			 const uint64_t tid_device);
 
 #endif /* _FI_PROV_OPX_TID_CACHE_H_ */
diff --git a/prov/opx/include/opa_service.h b/prov/opx/include/opa_service.h
index b597b5ac4d6..4ae6044035e 100644
--- a/prov/opx/include/opa_service.h
+++ b/prov/opx/include/opa_service.h
@@ -6,7 +6,7 @@
   GPL LICENSE SUMMARY
 
   Copyright(c) 2015 Intel Corporation.
-  Copyright(c) 2021-2023 Cornelis Networks.
+  Copyright(c) 2021-2024 Cornelis Networks.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of version 2 of the GNU General Public License as
@@ -23,7 +23,7 @@
   BSD LICENSE
 
   Copyright(c) 2015 Intel Corporation.
-  Copyright(c) 2021 Cornelis Networks.
+  Copyright(c) 2021-2024 Cornelis Networks.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -111,8 +111,8 @@ enum OPX_HFI_CMD {
 	OPX_HFI_CMD_TID_INVAL_READ,	/* read TID cache invalidations */
 	OPX_HFI_CMD_GET_VERS,		/* get the version of the user cdev */
 
-#ifdef PSM_CUDA
-	OPX_HFI_CMD_TID_UPDATE_V2 = 28,
+#ifdef OPX_HMEM
+	OPX_HFI_CMD_TID_UPDATE_V3,
 #endif
 	OPX_HFI_CMD_LAST,
 };
diff --git a/prov/opx/include/opa_user_gen1.h b/prov/opx/include/opa_user_gen1.h
index 5321d996581..06167be9d8e 100644
--- a/prov/opx/include/opa_user_gen1.h
+++ b/prov/opx/include/opa_user_gen1.h
@@ -6,7 +6,7 @@
   GPL LICENSE SUMMARY
 
   Copyright(c) 2015 Intel Corporation.
-  Copyright(c) 2021-2023 Cornelis Networks.
+  Copyright(c) 2021-2024 Cornelis Networks.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of version 2 of the GNU General Public License as
@@ -23,7 +23,7 @@
   BSD LICENSE
 
   Copyright(c) 2015 Intel Corporation.
-  Copyright(c) 2021-2022 Cornelis Networks.
+  Copyright(c) 2021-2024 Cornelis Networks.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -542,13 +542,17 @@ static __inline__ void opx_hfi_hdrset_seq(__le32 *rbuf, uint32_t val)
    See full description at declaration */
 static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
 					     uint64_t vaddr, uint32_t *length,
-					     uint64_t tidlist, uint32_t *tidcnt, uint16_t flags)
+					     uint64_t tidlist, uint32_t *tidcnt,
+					     uint64_t flags)
 {
 	struct hfi1_cmd cmd;
+
+#ifdef OPX_HMEM
+	struct hfi1_tid_info_v3 tidinfo;
+#else
 	struct hfi1_tid_info tidinfo;
-#ifdef PSM_CUDA
-	struct hfi1_tid_info_v2 tidinfov2;
 #endif
+
 	int err;
 
 	tidinfo.vaddr = vaddr;		/* base address for this send to map */
@@ -557,25 +561,20 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
 	tidinfo.tidlist = tidlist;	/* driver copies tids back directly */
 	tidinfo.tidcnt = 0;		/* clear to zero */
 
-	FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n", (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096);
-
+#ifdef OPX_HMEM
+	cmd.type = OPX_HFI_CMD_TID_UPDATE_V3;
+	tidinfo.flags = flags;
+	tidinfo.context = 0ull;
+#else
 	cmd.type = OPX_HFI_CMD_TID_UPDATE; /* HFI1_IOCTL_TID_UPDATE */
+#endif
+
+	FI_DBG(&fi_opx_provider, FI_LOG_MR,
+	       "OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n",
+	       (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / 4096);
+
 	cmd.len = sizeof(tidinfo);
 	cmd.addr = (__u64) &tidinfo;
 
-#ifdef PSM_CUDA
-	if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
-		/* Copy values to v2 struct */
-		tidinfov2.vaddr = tidinfo.vaddr;
-		tidinfov2.length = tidinfo.length;
-		tidinfov2.tidlist = tidinfo.tidlist;
-		tidinfov2.tidcnt = tidinfo.tidcnt;
-		tidinfov2.flags = flags;
-
-		cmd.type = OPX_HFI_CMD_TID_UPDATE_V2;
-		cmd.len = sizeof(tidinfov2);
-		cmd.addr = (__u64) &tidinfov2;
-	}
-#endif
+	errno = 0;
 	err = opx_hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
 	__attribute__((__unused__)) int saved_errno = errno;
@@ -584,15 +583,25 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
 		struct hfi1_tid_info *rettidinfo = (struct hfi1_tid_info *)cmd.addr;
 		if ((rettidinfo->length != *length) || (rettidinfo->tidcnt == 0) ) {
-			FI_WARN(&fi_opx_provider, FI_LOG_MR,"PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, (void*)vaddr,(void*)(vaddr + rettidinfo->length), rettidinfo->length, rettidinfo->length/4096, rettidinfo->tidcnt);
+			FI_WARN(&fi_opx_provider, FI_LOG_MR,
+				"PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n",
+				saved_errno, strerror(saved_errno), (void*)vaddr,
+				(void*)(vaddr + *length), *length, (*length)/4096,
+				(void*)rettidinfo->vaddr,(void*)(rettidinfo->vaddr + rettidinfo->length),
+				rettidinfo->length, rettidinfo->length/4096,
+				rettidinfo->tidcnt);
 		}
 		/* Always update outputs, even on soft errors */
 		*length = rettidinfo->length;
 		*tidcnt = rettidinfo->tidcnt;
-		FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, *tidcnt);
-
+		FI_DBG(&fi_opx_provider, FI_LOG_MR,
+		       "OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n",
+		       saved_errno, strerror(saved_errno), (void*)vaddr,
+		       (void*)(vaddr + *length), *length, (*length)/4096, *tidcnt);
 	} else {
-		FI_WARN(&fi_opx_provider, FI_LOG_MR,"FAILED ERR %d errno %d \"%s\"\n", err, saved_errno, strerror(saved_errno));
+		FI_WARN(&fi_opx_provider, FI_LOG_MR,
+			"FAILED ERR %d errno %d \"%s\"\n",
+			err, saved_errno, strerror(saved_errno));
 		/* Hard error, we can't trust these */
 		*length = 0;
 		*tidcnt = 0;
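Note: the rewritten opx_hfi_update_tid() keeps its contract: *length and *tidcnt are outputs even on soft (partial) errors and are zeroed on hard errors. A minimal caller sketch, where ctrl, tid_vaddr, and tid_length are placeholders from the surrounding receive path; the flags word is only consumed by the OPX_HMEM/V3 build, where it is passed through tidinfo.flags:

/* Hypothetical caller: pin a page-aligned region for expected receive. */
uint32_t tidlist[OPX_MAX_TID_COUNT];      /* driver writes packed tidinfo words here */
uint32_t length = (uint32_t) tid_length;  /* page-aligned request; updated by driver */
uint32_t tidcnt = 0;                      /* number of TID entries the driver used */

int32_t err = opx_hfi_update_tid(ctrl, tid_vaddr, &length,
				 (uint64_t) tidlist, &tidcnt, 0 /* flags */);
if (err < 0) {
	/* hard error: length and tidcnt were zeroed; fall back to eager rendezvous */
} else if (tidcnt == 0 || length < (uint32_t) tid_length) {
	/* partial update: only 'length' bytes were registered; caller must cope */
}
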
diff --git a/prov/opx/include/rdma/opx/fi_opx_debug_counters.h b/prov/opx/include/rdma/opx/fi_opx_debug_counters.h
index 401f69ddb9e..78fd7678d67 100644
--- a/prov/opx/include/rdma/opx/fi_opx_debug_counters.h
+++ b/prov/opx/include/rdma/opx/fi_opx_debug_counters.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2023 Cornelis Networks.
+ * Copyright (C) 2021-2024 Cornelis Networks.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -169,12 +169,22 @@ struct fi_opx_debug_counters {
 	} sdma;
 
 	struct {
-		uint64_t total_requests;
 		uint64_t tid_updates;
 		uint64_t tid_resource_limit;
+		uint64_t tid_resource_limit_length_chunk_short;
+		uint64_t tid_resource_limit_length_chunk_long;
+		uint64_t tid_resource_limit_tidcnt_chunk_zero;
 		uint64_t tid_invalidate_needed;
-		uint64_t tid_replays;
-		uint64_t rts_fallback_eager;
+		uint64_t tid_rcv_pkts;
+		uint64_t tid_rcv_pkts_replays;
+		uint64_t rts_tid_ineligible;
+		uint64_t rts_tid_eligible;
+		uint64_t rts_fallback_eager_immediate;
+		uint64_t rts_fallback_eager_misaligned_thrsh;
+		uint64_t rts_fallback_eager_reg_rzv;
+		uint64_t rts_tid_setup_retries;
+		uint64_t rts_tid_setup_retry_success;
+		uint64_t rts_tid_setup_success;
 		uint64_t tid_buckets[4];
 		uint64_t first_tidpair_minlen;
 		uint64_t first_tidpair_maxlen;
@@ -245,6 +255,9 @@ struct fi_opx_debug_counters {
 		uint64_t rma_atomic_fetch_intranode;
 		uint64_t rma_atomic_cmp_fetch_hfi;
 		uint64_t rma_atomic_cmp_fetch_intranode;
+
+		uint64_t tid_update;
+		uint64_t tid_recv;
 	} hmem;
 };
@@ -331,12 +344,31 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters)
 #endif
 
 #ifdef OPX_DEBUG_COUNTERS_EXPECTED_RECEIVE
-	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.total_requests);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_updates);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_length_chunk_short);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_tidcnt_chunk_zero);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_invalidate_needed);
-	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_replays);
-	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts_replays);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_ineligible);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_eligible);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_immediate);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_misaligned_thrsh);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_reg_rzv);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retries);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retry_success);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_success);
+	uint64_t rts_sum = counters->expected_receive.rts_fallback_eager_immediate +
+			   counters->expected_receive.rts_fallback_eager_misaligned_thrsh +
+			   counters->expected_receive.rts_fallback_eager_reg_rzv +
+			   counters->expected_receive.rts_tid_setup_success;
+	if (rts_sum != counters->expected_receive.rts_tid_eligible) {
+		fprintf(stderr,
+			"(%d) ### WARN: rts_tid_eligible (%lu) != SUM(rts_tid_setup_success + rts_fallback*) (%lu)! Accounting error?\n\n",
+			pid,
+			counters->expected_receive.rts_tid_eligible, rts_sum);
+	}
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER_ARR(pid, expected_receive.tid_buckets, 4);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_minlen);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_maxlen);
@@ -433,6 +465,9 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters)
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_fetch_hfi);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_intranode);
 	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_hfi);
+
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_update);
+	FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_recv);
 #endif
 }
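Note: the new warning encodes an accounting invariant: every RTS counted in rts_tid_eligible must be resolved exactly once, by a successful TID setup or by one of the three eager fallbacks. Retries are deliberately excluded from the sum, since a retried setup still terminates in rts_tid_setup_success or rts_fallback_eager_reg_rzv. Restated compactly (c is a struct fi_opx_debug_counters pointer):

/* Invariant behind the WARN above: eligible == resolved-exactly-once. */
uint64_t resolved = c->expected_receive.rts_tid_setup_success
		  + c->expected_receive.rts_fallback_eager_immediate
		  + c->expected_receive.rts_fallback_eager_misaligned_thrsh
		  + c->expected_receive.rts_fallback_eager_reg_rzv;
assert(resolved == c->expected_receive.rts_tid_eligible);
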
diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h
index 30fbf56a45d..583ea2c0df8 100644
--- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h
+++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h
@@ -2136,6 +2136,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep,
 		break;
 	case FI_OPX_HFI_DPUT_OPCODE_RZV_TID:
 	{
+		FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts);
 		struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr);
 		union fi_opx_context *target_context = rzv_comp->context;
 		assert(target_context);
@@ -2182,7 +2183,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep,
 			} else {
 				memcpy(rbuf_qws, sbuf_qws, bytes);
 			}
-			FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_replays);
+			FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts_replays);
 		}
 #ifndef NDEBUG
 		else {	/* Debug, tracking where the TID wrote even though we don't memcpy here */
diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h
index 7e2562cea1c..721d714f19c 100644
--- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h
+++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2016 by Argonne National Laboratory.
- * Copyright (C) 2021-2023 Cornelis Networks.
+ * Copyright (C) 2021-2024 Cornelis Networks.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -203,6 +203,21 @@ static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OP
 //Version 1, SDMA replays - EAGER opcode (1)(byte 0), 2 iovectors (byte 1)
 #define FI_OPX_HFI1_SDMA_REQ_HEADER_REPLAY_EAGER_FIXEDBITS (0x0211)
 
+#ifndef OPX_RTS_TID_SETUP_MAX_TRIES
+#define OPX_RTS_TID_SETUP_MAX_TRIES (1)
+#endif
+
+/*
+ * Minimum page sizes to use for different memory types.
+ * The array is indexed by the values defined in
+ * enum fi_hmem_iface. Some values are not supported.
+ */
+static const uint64_t OPX_TID_PAGE_SIZE[4] = {
+	PAGE_SIZE,	/* FI_HMEM_SYSTEM */
+	64 * 1024,	/* FI_HMEM_CUDA */
+	PAGE_SIZE,	/* FI_HMEM_ROCR */
+	PAGE_SIZE	/* FI_HMEM_ZE */
+};
 
 static inline uint32_t fi_opx_addr_calculate_base_rx (const uint32_t process_id, const uint32_t processes_per_node) {
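Note: OPX_TID_PAGE_SIZE keys off enum fi_hmem_iface (libfabric defines FI_HMEM_SYSTEM=0, FI_HMEM_CUDA=1, FI_HMEM_ROCR=2, FI_HMEM_ZE=3), so CUDA buffers get 64 KiB TID pages while the rest use the host PAGE_SIZE. A sketch of the alignment math it feeds, mirroring opx_hfi1_rx_rzv_rts_tid_eligible() later in this patch (dst_vaddr is a placeholder):

/* Illustrative only: derive the TID-aligned addresses for one iface. */
const enum fi_hmem_iface iface = FI_HMEM_CUDA;
const uint64_t page_mask = -(int64_t) OPX_TID_PAGE_SIZE[iface];  /* 64 KiB mask here */

const uint64_t vaddr     = ((uint64_t) dst_vaddr) & -64;  /* TID writes: 64-byte aligned */
const uint64_t tid_vaddr = vaddr & page_mask;             /* TID update: page aligned */
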
diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h
index e71090f414d..a3f4ea0d70b 100644
--- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h
+++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h
@@ -59,15 +59,6 @@ static const uint16_t OPX_SDMA_REQ_SET_MEMINFO[2] = {0,
 static const size_t OPX_SDMA_REQ_HDR_SIZE[2] = {FI_OPX_HFI1_SDMA_HDR_SIZE,
 						FI_OPX_HFI1_SDMA_HDR_SIZE + OPX_SDMA_MEMINFO_SIZE};
 
-static const unsigned OPX_SDMA_OFI_TO_KERN_MEM_TYPE[4] = {
-	#ifdef OPX_HMEM
-	HFI1_MEMINFO_TYPE_SYSTEM,
-	HFI1_MEMINFO_TYPE_NVIDIA,
-	2, /* HFI1_MEMINFO_TYPE_AMD */
-	1 /* HFI1_MEMINFO_TYPE_DMABUF */
-	#endif
-	};
-
 struct fi_opx_hfi1_sdma_header_vec {
 	union {
 		struct {
@@ -515,7 +506,7 @@ void fi_opx_hfi1_sdma_set_meminfo(struct sdma_req_info *req_info,
 	// setting meminfo, and it will be the fist one:
 	// index 0 (the first payload IOV, or iov[1]).
 	const unsigned meminfo_idx = 0;
-	const unsigned type = OPX_SDMA_OFI_TO_KERN_MEM_TYPE[iface];
+	const unsigned type = OPX_HMEM_KERN_MEM_TYPE[iface];
 	struct sdma_req_meminfo *meminfo = (struct sdma_req_meminfo *) (req_info + 1);
 	meminfo->types = 0;
 	HFI1_MEMINFO_TYPE_ENTRY_SET(meminfo->types, meminfo_idx, type);
@@ -759,10 +750,15 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep,
 	unsigned int tidiovec_idx = 2; /* tid info iovec*/
 	uint32_t *tidpairs = NULL;
 
-	// TODO: GPU support for TID
-	assert(we->hmem.iface == FI_HMEM_SYSTEM);
-	const uint64_t set_meminfo = 0;
+	const uint64_t set_meminfo =
+	#ifdef OPX_HMEM
+		(we->hmem.iface > FI_HMEM_SYSTEM) ? 1 : 0;
+	#else
+		0;
+	#endif
+
 	struct sdma_req_info *req_info = OPX_SDMA_REQ_INFO_PTR(&we->header_vec, set_meminfo);
+	fi_opx_hfi1_sdma_set_meminfo(req_info, set_meminfo, we->hmem.iface, we->hmem.device);
 
 	/* Since we already verified that enough PSNs were available for
 	   the send we're about to do, we shouldn't need to check the
@@ -775,7 +771,7 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep,
 	we->num_iovs = 3; /* request and data and tids*/
 	/* no padding for tid, should have been aligned.*/
 	assert(we->total_payload == ((we->total_payload) & -4));
-	;
+
 	we->iovecs[1].iov_len = (we->total_payload + 3) & -4;
 	we->iovecs[1].iov_base = we->packets[0].replay->iov[0].iov_base;
@@ -800,8 +796,8 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep,
 		we->packets[i].replay->scb.hdr.qw[2] |= (uint64_t)htonl((uint32_t)psn);
 		we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count;
 		we->packets[i].replay->sdma_we = replay_back_ptr;
-		we->packets[i].replay->hmem_iface = FI_HMEM_SYSTEM;
-		we->packets[i].replay->hmem_device = 0;
+		we->packets[i].replay->hmem_iface = we->hmem.iface;
+		we->packets[i].replay->hmem_device = we->hmem.device;
 		fi_opx_reliability_client_replay_register_with_update(
 			&opx_ep->reliability->state, we->dlid, we->rs, we->rx,
 			we->psn_ptr, we->packets[i].replay, cc,
@@ -816,7 +812,8 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep,
 	we->iovecs[tidiovec_idx].iov_len = tid_iov->iov_len - (tid_idx * sizeof(uint32_t));
 	we->iovecs[tidiovec_idx].iov_base = &tidpairs[tid_idx];
 	req_info->ctrl = FI_OPX_HFI1_SDMA_REQ_HEADER_EXPECTED_FIXEDBITS |
-			(((uint16_t)we->num_iovs) << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+			(((uint16_t)we->num_iovs) << HFI1_SDMA_REQ_IOVCNT_SHIFT) |
+			OPX_SDMA_REQ_SET_MEMINFO[set_meminfo];
 
 	uint32_t tidpair = tidpairs[tid_idx];
 	uint32_t kdeth = (FI_OPX_HFI1_KDETH_TIDCTRL & FI_OPX_EXP_TID_GET((tidpair), CTRL))
@@ -856,8 +853,7 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep,
 	*fill_index = ((*fill_index) + 1) % (opx_ep->hfi->info.sdma.queue_size);
 	--opx_ep->hfi->info.sdma.available_counter;
 
-	FI_OPX_DEBUG_COUNTERS_INC(
-		opx_ep->debug_counters.sdma.writev_calls[we->num_packets]);
+	FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.writev_calls[we->num_packets]);
 	ssize_t rc = writev(opx_ep->hfi->fd, we->iovecs, we->num_iovs);
 	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
 	       "===================================== SDMA_WE -- called writev rc=%ld Params were: fd=%d iovecs=%p num_iovs=%d \n",
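Note: when meminfo is in play the SDMA request grows by OPX_SDMA_MEMINFO_SIZE and the ctrl word must advertise it; OPX_SDMA_REQ_SET_MEMINFO[0] is 0, so the plain path composes the same bits as before. A sketch of the composition used by opx_hfi1_sdma_do_sdma_tid() above (iface and num_iovs stand in for the work-entry fields):

/* Illustrative ctrl-word composition for a TID SDMA request. */
const uint64_t set_meminfo = (iface > FI_HMEM_SYSTEM) ? 1 : 0;  /* OPX_HMEM builds */

const uint16_t ctrl = FI_OPX_HFI1_SDMA_REQ_HEADER_EXPECTED_FIXEDBITS
		    | (((uint16_t) num_iovs) << HFI1_SDMA_REQ_IOVCNT_SHIFT)
		    | OPX_SDMA_REQ_SET_MEMINFO[set_meminfo];    /* 0 when unset */
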
diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h
index e6e62c2f353..1d320ced2d9 100644
--- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h
+++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h
@@ -429,9 +429,13 @@ struct fi_opx_hfi1_rx_rzv_rts_params {
 	uintptr_t origin_byte_counter_vaddr;
 	struct fi_opx_rzv_completion* rzv_comp;
 	uintptr_t dst_vaddr;		/* bumped past immediate data */
-	uint64_t immediate_data;
-	uint64_t immediate_end_block_count;
+	uint64_t tid_pending_vaddr;
+	uint64_t tid_pending_tid_vaddr;
+	int64_t tid_pending_length;
+	int64_t tid_pending_tid_length;
+	int64_t tid_pending_alignment_adjustment;
 
+	uint32_t tid_setup_retries;
 	uint32_t ntidpairs;
 	uint32_t tid_offset;
 	uint32_t u32_extended_rx;
@@ -442,13 +446,13 @@ struct fi_opx_hfi1_rx_rzv_rts_params {
 	uint16_t origin_rx;
 	uint8_t opcode;
-	uint8_t fallback_opcode;
 	uint8_t u8_rx;
 	uint8_t target_hfi_unit;
 
+	/* Either FI_OPX_MAX_DPUT_IOV iov's or 1 iov and FI_OPX_MAX_DPUT_TIDPAIRS tidpairs */
 	union {
-		union fi_opx_hfi1_dput_iov src_iov[FI_OPX_MAX_DPUT_IOV];
+		union fi_opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV];
 		struct {
 			union fi_opx_hfi1_dput_iov reserved;/* skip 1 iov */
 			uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS];
diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h
index d4c9adb039c..bf4b7d89566 100644
--- a/prov/opx/include/rdma/opx/fi_opx_hmem.h
+++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2023 by Cornelis Networks.
+ * Copyright (C) 2023-2024 by Cornelis Networks.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #define _FI_PROV_OPX_HMEM_H_
 
 #include
+#include
 #include "rdma/opx/fi_opx_compiler.h"
 #include "rdma/opx/fi_opx_rma_ops.h"
 #include "ofi_hmem.h"
@@ -127,6 +128,15 @@ unsigned fi_opx_hmem_iov_init(const void *buf,
 #endif
 }
 
+static const unsigned OPX_HMEM_KERN_MEM_TYPE[4] = {
+	#ifdef OPX_HMEM
+	HFI1_MEMINFO_TYPE_SYSTEM,
+	HFI1_MEMINFO_TYPE_NVIDIA,
+	2,	/* HFI1_MEMINFO_TYPE_AMD */
+	1	/* HFI1_MEMINFO_TYPE_DMABUF */
+	#endif
+};
+
 #ifdef OPX_HMEM
 #define OPX_HMEM_COPY_FROM(dst, src, len, src_iface, src_device)	\
 	do {								\
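Note: OPX_HMEM_KERN_MEM_TYPE moved here from fi_opx_hfi1_sdma.h (where it was OPX_SDMA_OFI_TO_KERN_MEM_TYPE) so the SDMA meminfo path and the new TID paths can share one OFI-iface-to-kernel-type mapping. Usage sketch (iface is a placeholder):

/* OFI iface -> hfi1 kernel meminfo type, shared by SDMA and TID paths. */
const enum fi_hmem_iface iface = FI_HMEM_CUDA;
const unsigned kern_type = OPX_HMEM_KERN_MEM_TYPE[iface];  /* HFI1_MEMINFO_TYPE_NVIDIA */
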
diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c
index a265d26544c..f3206695483 100644
--- a/prov/opx/src/fi_opx_hfi1.c
+++ b/prov/opx/src/fi_opx_hfi1.c
@@ -159,7 +159,7 @@ static enum opx_hfi1_type opx_hfi1_check_hwversion (const uint32_t hw_version) {
 // Used by fi_opx_hfi1_context_open as a convenience.
 static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl,
 				    struct fi_opx_hfi1_context_internal *internal,
-				    uuid_t unique_job_key,
+				    uuid_t unique_job_key,
 				    int hfi_unit_number)
 {
 	int fd;
@@ -190,6 +190,7 @@ static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl,
 				  hfi_unit_number);
 			fd = -1;
 		}
+		assert((*ctrl)->__hfi_pg_sz == OPX_HFI1_TID_PAGESIZE);
 	}
 	return fd;
 }
@@ -963,7 +964,7 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer)
 	return rc;
 }
 
-int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work)
+int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work)
 {
 	struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts;
 
@@ -1008,13 +1009,13 @@ int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work)
 	uintptr_t vaddr_with_offset = params->dst_vaddr;	/* receive buffer virtual address */
 	for(int i = 0; i < params->niov; i++) {
 		tx_payload->cts.iov[i].rbuf = vaddr_with_offset;
-		tx_payload->cts.iov[i].sbuf = (uintptr_t)params->src_iov[i].sbuf;
-		tx_payload->cts.iov[i].bytes = params->src_iov[i].bytes;
-		tx_payload->cts.iov[i].rbuf_device = params->src_iov[i].rbuf_device;
-		tx_payload->cts.iov[i].sbuf_device = params->src_iov[i].sbuf_device;
-		tx_payload->cts.iov[i].rbuf_iface = params->src_iov[i].rbuf_iface;
-		tx_payload->cts.iov[i].sbuf_iface = params->src_iov[i].sbuf_iface;
-		vaddr_with_offset += params->src_iov[i].bytes;
+		tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf;
+		tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes;
+		tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device;
+		tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device;
+		tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface;
+		tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface;
+		vaddr_with_offset += params->dput_iov[i].bytes;
 	}
 
 	opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos);
@@ -1025,8 +1026,7 @@ int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work)
 	return FI_SUCCESS;
 }
 
-/* Rendezvous to eager ring buffers (not directly to user buffers) */
-int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work)
+int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work)
 {
 	struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts;
 	struct fi_opx_ep *opx_ep = params->opx_ep;
@@ -1034,8 +1034,10 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work)
 	const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56;
 
 	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-		"===================================== RECV, HFI -- RENDEZVOUS EAGER RTS (begin)\n");
-	const uint64_t payload_bytes = params->niov * sizeof(union fi_opx_hfi1_dput_iov);
+		"===================================== RECV, HFI -- RENDEZVOUS %s RTS (begin)\n",
+		params->ntidpairs ? "EXPECTED TID" : "EAGER");
+	const uint64_t tid_payload = params->ntidpairs ? ((params->ntidpairs + 2) * sizeof(uint32_t)) : 0;
+	const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload;
 	const uint64_t pbc_dws =
 		2 + /* pbc */
 		2 + /* lrh */
@@ -1066,45 +1068,65 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work)
 	union fi_opx_reliability_tx_psn *psn_ptr;
 	int64_t psn;
 
-	psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid,
-		params->u8_rx, params->origin_rs, &psn_ptr, &replay, params->reliability);
+	psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid,
+					    &opx_ep->reliability->state,
+					    params->slid,
+					    params->u8_rx,
+					    params->origin_rs,
+					    &psn_ptr,
+					    &replay,
+					    params->reliability);
 	if(OFI_UNLIKELY(psn == -1)) {
 		FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n");
 		return -FI_EAGAIN;
 	}
 
 	assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU);
+
 	// The "memcopy first" code is here as an alternative to the more complicated
 	// direct write to pio followed by memory copy of the reliability buffer
-	replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws |
-		((opx_ep->tx->force_credit_return & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT);
+	replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws;
 
 	replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid |
-		((uint64_t)lrh_dws << 32);
+				((uint64_t) lrh_dws << 32);
 	replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx;
 	replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn;
 	replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3];
 	replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] |
+				((uint64_t) params->ntidpairs << 32) |
 				(params->niov << 48) | params->opcode;
 	replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr;
-	replay->scb.hdr.qw[6] = (uint64_t)params->rzv_comp;
+	replay->scb.hdr.qw[6] = (uint64_t) params->rzv_comp;
 
 	union fi_opx_hfi1_packet_payload *const tx_payload =
-		(union fi_opx_hfi1_packet_payload *)replay->payload;
+		(union fi_opx_hfi1_packet_payload *) replay->payload;
 	assert(((uint8_t *)tx_payload) == ((uint8_t *)&replay->data));
 
-	uintptr_t vaddr_with_offset = params->dst_vaddr;	/* receive buffer virtual address */
+	uintptr_t vaddr_with_offset = params->ntidpairs ?
+		((uint64_t)params->dst_vaddr & -64) :
+		params->dst_vaddr;	/* receive buffer virtual address */
+
 	for (int i = 0; i < params->niov; i++) {
 		tx_payload->cts.iov[i].rbuf = vaddr_with_offset;
-		tx_payload->cts.iov[i].sbuf = (uintptr_t)params->src_iov[i].sbuf;
-		tx_payload->cts.iov[i].bytes = params->src_iov[i].bytes;
-		tx_payload->cts.iov[i].sbuf_device = params->src_iov[i].sbuf_device;
-		tx_payload->cts.iov[i].rbuf_device = params->src_iov[i].rbuf_device;
-		tx_payload->cts.iov[i].sbuf_iface = params->src_iov[i].sbuf_iface;
-		tx_payload->cts.iov[i].rbuf_iface = params->src_iov[i].rbuf_iface;
-		vaddr_with_offset += params->src_iov[i].bytes;
+		tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf;
+		tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes;
+		tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device;
+		tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device;
+		tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface;
+		tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface;
+		vaddr_with_offset += params->dput_iov[i].bytes;
 	}
 
-	FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep);
+	/* copy tidpairs to packet */
+	if (params->ntidpairs) {
+		assert(params->niov == 1);
+
+		/* coverity[missing_lock] */
+		tx_payload->tid_cts.tid_offset = params->tid_offset;
+		tx_payload->tid_cts.ntidpairs = params->ntidpairs;
+		assert(params->tidpairs[0] != 0);
+		memcpy(&tx_payload->tid_cts.tidpairs, params->tidpairs,
+		       params->ntidpairs * sizeof(uint32_t));
+	}
 
 	fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay);
 	fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state,
@@ -1114,254 +1136,174 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work)
 		psn_ptr, replay, params->reliability);
-	FI_DBG(
-		fi_opx_global.prov, FI_LOG_EP_DATA,
-		"===================================== RECV, HFI -- RENDEZVOUS EAGER RTS (end)\n");
-
+	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+		"===================================== RECV, HFI -- RENDEZVOUS %s RTS (end)\n",
+		params->ntidpairs ? "EXPECTED TID" : "EAGER");
 	return FI_SUCCESS;
 }
 
-/* RTS TID falling back to RTS eager ring */
 __OPX_FORCE_INLINE__
-int opx_fallback_eager_ring(union fi_opx_hfi1_deferred_work *work, int line)
+int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep,
+				     struct fi_opx_hfi1_rx_rzv_rts_params *params,
+				     const uint64_t niov,
+				     const uint64_t immediate_data,
+				     const uint64_t immediate_end_block_count,
+				     const uint64_t is_hmem,
+				     const uint64_t is_intranode,
+				     const enum fi_hmem_iface iface,
+				     uint8_t opcode)
 {
-	struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts;
-
-	FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,
-		"RENDEZVOUS EXPECTED TID CTS fallback to EAGER CTS (%u)\n",
-		line);
-#ifdef OPX_TID_FALLBACK_DEBUG
-	fprintf(stderr,
-		"## OPX_TID_FALLBACK_DEBUG: RENDEZVOUS EXPECTED TID CTS fallback to EAGER CTS (%u)\n",
-		line);
-#endif
-	params->ntidpairs = 0;
-	params->opcode = params->fallback_opcode; /* fallback */
-	params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring;
-	FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters
-					  .expected_receive.rts_fallback_eager);
-	return params->work_elem.work_fn(work);
-}
-
-/* Rendezvous directly to user buffers (using TID) (not to eager buffers) */
-int fi_opx_hfi1_do_rx_rzv_rts_tid(union fi_opx_hfi1_deferred_work *work)
-{
-	struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts;
-	struct fi_opx_ep *opx_ep = params->opx_ep;
-	const uint64_t lrh_dlid = params->lrh_dlid;
-	const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56;
-
-	FI_DBG(
-		fi_opx_global.prov, FI_LOG_EP_DATA,
-		"===================================== RECV, HFI -- RENDEZVOUS EXPECTED TID RTS (begin)\n");
-
-	/* If tidpairs is set, this is FI_EAGAIN so skip TID processing as we're committed to TID (not eager) */
-	if (!params->ntidpairs) {
-		/*******************************************************************************/
-		/* If there's no immediate data, the peer must have
-		 * dynamically disabled expected receive tid so fallback.
-		 */
-		const uint64_t immediate_data = params->immediate_data;
-		const uint64_t immediate_end_block_count = params->immediate_end_block_count;
-		if ((immediate_data == 0) || (immediate_end_block_count == 0)) {
-			return opx_fallback_eager_ring(work, __LINE__);
-		}
-
-		/* Caller adjusted pointers and lengths past the immediate data.
-		 * Now align the destination buffer to be page aligned for expected TID writes
-		 * This should point/overlap into the immediate data area.
-		 * Then realign source buffer and lengths appropriately.
-		 */
-		const uint64_t page_alignment_mask = -(int64_t)OPX_HFI1_TID_PAGESIZE;
-		/* TID writes must start on 64 byte boundaries */
-		const uint64_t vaddr = ((uint64_t)params->dst_vaddr) & -64;
-		/* TID updates require page alignment*/
-		const uint64_t tid_vaddr = (uint64_t)vaddr & (uint64_t)page_alignment_mask;
-
-		/* If adjusted pointer doesn't fall into the immediate data region, can't
-		 * continue with TID. Fallback to eager.
-		 */
-		if (!((vaddr >= ((uint64_t)params->dst_vaddr -params->immediate_data)) &&
-		      (vaddr <= ((uint64_t)params->dst_vaddr)))) {
-			return opx_fallback_eager_ring(work, __LINE__);
-		}
-
-		/* First adjust for the start page alignment, using immediate data that was sent.*/
-		const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr;
+	if (is_intranode
+	    || !opx_ep->use_expected_tid_rzv
+	    || (niov != 1)
+	    || (opcode != FI_OPX_HFI_DPUT_OPCODE_RZV &&
+		opcode != FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG)
+	    || !fi_opx_hfi1_sdma_use_sdma(opx_ep, params->dput_iov[0].bytes,
+					  opcode, is_hmem, OPX_INTRANODE_FALSE)
+	    || (immediate_data == 0)
+	    || (immediate_end_block_count == 0)) {
+
+		FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_ineligible);
+		return 0;
+	}
 
-		/* Adjust length for aligning the buffer and adjust again for total length,
-		   aligning to SDMA header auto-generation payload requirements. */
-		const int64_t length = (params->src_iov[0].bytes + alignment_adjustment) & -64;
+	FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_eligible);
 
-		/* Tune for unaligned buffers. Buffers misaligned more than the threshold on
-		 * message sizes under the MSG threshold will fallback to eager.
-		 */
-		if ((length < FI_OPX_TID_MSG_MISALIGNED_THRESHOLD) &&
-		    ((vaddr - tid_vaddr) > FI_OPX_TID_MISALIGNED_THRESHOLD)) {
-			return opx_fallback_eager_ring(work, __LINE__);
-		}
+	/* Caller adjusted pointers and lengths past the immediate data.
+	 * Now align the destination buffer to be page aligned for expected TID writes
+	 * This should point/overlap into the immediate data area.
+	 * Then realign source buffer and lengths appropriately.
+	 */
+	const uint64_t page_alignment_mask = -(int64_t)OPX_TID_PAGE_SIZE[iface];
+	/* TID writes must start on 64 byte boundaries */
+	const uint64_t vaddr = ((uint64_t)params->dst_vaddr) & -64;
+	/* TID updates require page alignment*/
+	const uint64_t tid_vaddr = (uint64_t)vaddr & (uint64_t)page_alignment_mask;
+
+	/* If adjusted pointer doesn't fall into the immediate data region, can't
+	 * continue with TID. Fallback to eager.
+	 */
+	if (!((vaddr >= ((uint64_t)params->dst_vaddr - immediate_data)) &&
+	      (vaddr <= ((uint64_t)params->dst_vaddr)))) {
+		FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_fallback_eager_immediate);
+		return 0;
+	}
 
-		/* The tid length much account for starting at a page boundary and will be page aligned */
-		const int64_t tid_length = (uint64_t)(((vaddr + length) - tid_vaddr) +
-				(OPX_HFI1_TID_PAGESIZE - 1)) & (uint64_t)page_alignment_mask;
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-			"iov_len %#lX, length %#lX, tid_length %#lX, "
-			"params->dst_vaddr %p, iov_base %p, vaddr [%p - %p], tid_vaddr [%p - %p]\n",
-			params->src_iov[0].bytes, length, tid_length,
-			(void *)params->dst_vaddr, (void *) params->src_iov[0].sbuf,
-			(void *)vaddr, (void *)(vaddr + length),
-			(void *)tid_vaddr, (void *)(tid_vaddr + tid_length));
-
-		/* New params were checked above but
-		 * DO NOT CHANGE params->xxx or opx_ep->xxx until we know we will NOT fallback to eager rts */
-		if (opx_register_for_rzv(params, tid_vaddr, tid_length))
-			return opx_fallback_eager_ring(work, __LINE__);
-
-		/* Register was done based on tid_vaddr and the offset should be set to the page
-		 * offset into the TID now.
-		 * This was done under the mm_lock, but that lock is not required.
-		 * Stop the MISSING_LOCK false positives. */
-		/* coverity[missing_lock] */
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-			"vaddr %p, tid_vaddr %p, diff %#X, registered tid_offset %u/%#X, buffer tid_offset %u/%#X, tid_length %lu/%#lX \n",
-			(void *)vaddr, (void *)tid_vaddr,
-			(uint32_t)(vaddr - tid_vaddr), params->tid_offset,
-			params->tid_offset,
-			params->tid_offset + (uint32_t)(vaddr - tid_vaddr),
-			params->tid_offset + (uint32_t)(vaddr - tid_vaddr),
-			tid_length, tid_length);
+	/* First adjust for the start page alignment, using immediate data that was sent.*/
+	const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr;
+	/* Adjust length for aligning the buffer and adjust again for total length,
+	   aligning to SDMA header auto-generation payload requirements. */
+	const int64_t length = (params->dput_iov[0].bytes + alignment_adjustment) & -64;
 
-		/* Adjust the offset for vaddr byte offset into the tid. */
-		/* coverity[missing_lock] */
-		params->tid_offset += (uint32_t)(vaddr - tid_vaddr);
+	/* Tune for unaligned buffers. Buffers misaligned more than the threshold on
+	 * message sizes under the MSG threshold will fallback to eager.
+	 */
+	if ((length < FI_OPX_TID_MSG_MISALIGNED_THRESHOLD) &&
+	    ((vaddr - tid_vaddr) > FI_OPX_TID_MISALIGNED_THRESHOLD)) {
+		FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_fallback_eager_misaligned_thrsh);
+		return 0;
+	}
 
-		/* Now there is no fallback to eager so we can change params in case of FI_EAGAIN */
-		const uint64_t iov_adj = ((uint64_t)params->dst_vaddr - vaddr);
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-			" ==== iov[%u].base %p len %zu/%#lX iov_adj %lu/%#lX alignment_adjustment %lu/%#lX\n",
-			0, (void *) params->src_iov[0].sbuf,
-			params->src_iov[0].bytes, params->src_iov[0].bytes,
-			iov_adj, iov_adj, alignment_adjustment, alignment_adjustment);
+	/* The tid length must account for starting at a page boundary and will be page aligned */
+	const int64_t tid_length = (uint64_t)(((vaddr + length) - tid_vaddr) +
+				   (OPX_TID_PAGE_SIZE[iface] - 1)) & (uint64_t)page_alignment_mask;
+	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+	       "iov_len %#lX, length %#lX, tid_length %#lX, "
+	       "params->dst_vaddr %p, iov_base %p, vaddr [%p - %p], tid_vaddr [%p - %p]\n",
+	       params->dput_iov[0].bytes, length, tid_length,
+	       (void *)params->dst_vaddr, (void *) params->dput_iov[0].sbuf,
+	       (void *)vaddr, (void *)(vaddr + length),
+	       (void *)tid_vaddr, (void *)(tid_vaddr + tid_length));
 
-		params->src_iov[0].sbuf -= iov_adj;
-		params->src_iov[0].bytes = (params->src_iov[0].bytes + iov_adj) & -64;
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-			" ==== iov[%u].base %p len %zu/%#lX alignment_adjustment %lu/%#lX\n",
-			0, (void *) params->src_iov[0].sbuf,
-			params->src_iov[0].bytes, params->src_iov[0].bytes,
-			alignment_adjustment, alignment_adjustment);
-		/* Adjust the (context) counter with the new length ... */
-		params->rzv_comp->context->byte_counter = length;
-		params->rzv_comp->tid_length = tid_length;
-		params->rzv_comp->tid_vaddr = tid_vaddr;
-	} else {
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "RETRY FI_EAGAIN\n");
-		OPX_DEBUG_TIDS("RTS retry tidpairs", params->ntidpairs, params->tidpairs);
-	}
+	params->tid_pending_vaddr = vaddr;
+	params->tid_pending_length = length;
+	params->tid_pending_tid_vaddr = tid_vaddr;
+	params->tid_pending_tid_length = tid_length;
+	params->tid_pending_alignment_adjustment = alignment_adjustment;
 
-	/*******************************************************************************************************************/
-	/* Committed to expected receive (TID) but might FI_EAGAIN out and retry */
-	/*******************************************************************************************************************/
-	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "ntidpairs %u\n",
-	       params->ntidpairs);
-	const uint64_t payload_bytes =
-		params->niov * sizeof(union fi_opx_hfi1_dput_iov) +
-		sizeof(uint32_t) /* tid_offset */ +
-		sizeof(uint32_t) /* ntidpairs */ +
-		params->ntidpairs * sizeof(uint32_t) /* tidpairs[]*/;
+	return 1;
+}
 
-	const uint64_t pbc_dws =
-		2 + /* pbc */
-		2 + /* lrh */
-		3 + /* bth */
-		9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */
-		((payload_bytes + 3) >> 2);
-	const uint16_t lrh_dws = htons(pbc_dws - 1);
-	union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state;
-	const uint16_t total_credits_needed = 1 + /* packet header */
-		((payload_bytes + 63) >> 6); /* payload blocks needed */
-	uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(
-		pio_state, &opx_ep->tx->force_credit_return,
-		total_credits_needed);
+int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work)
+{
+	struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts;
 
-	if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) {
-		fi_opx_compiler_msync_writes();
-		FI_OPX_HFI1_UPDATE_CREDITS(pio_state,
-					   opx_ep->tx->pio_credits_addr);
-		total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state,
-					   &opx_ep->tx->force_credit_return,
-					   total_credits_needed);
-		opx_ep->tx->pio_state->qw0 = pio_state.qw0;
-		if (total_credits_available < total_credits_needed) {
-			FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n");
+	if (opx_register_for_rzv(params, params->tid_pending_tid_vaddr,
+				 params->tid_pending_tid_length,
+				 params->dput_iov[0].rbuf_iface,
+				 params->dput_iov[0].rbuf_device)) {
+		/* Retry TID setup */
+		if (++params->tid_setup_retries < OPX_RTS_TID_SETUP_MAX_TRIES) {
+			FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters
+							.expected_receive.rts_tid_setup_retries);
 			return -FI_EAGAIN;
 		}
-	}
-
-	struct fi_opx_reliability_tx_replay *replay;
-	union fi_opx_reliability_tx_psn *psn_ptr;
-	int64_t psn;
-
-	psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid,
-		params->u8_rx, params->origin_rs, &psn_ptr, &replay, params->reliability);
-	if(OFI_UNLIKELY(psn == -1)) {
-		FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n");
-		return -FI_EAGAIN;
+		// Give up and fall back to non-TID
+		FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters
+						.expected_receive.rts_fallback_eager_reg_rzv);
+		params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts;
+		return opx_hfi1_rx_rzv_rts_send_cts(work);
 	}
 
-	assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU);
-	const uint64_t force_credit_return = (opx_ep->tx->force_credit_return & FI_OPX_HFI1_PBC_CR_MASK)
-						<< FI_OPX_HFI1_PBC_CR_SHIFT;
-	FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep);
-
-	// The "memcopy first" code is here as an alternative to the more complicated
-	// direct write to pio followed by memory copy of the reliability buffer
-	replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws | force_credit_return;
-
-	replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid |
-				((uint64_t)lrh_dws << 32);
-	replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx;
-	replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn;
-	replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3];
-	replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] |
-				(uint64_t)params->ntidpairs << 32 |
-				(params->niov << 48) | params->opcode;
-	replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr;
-	replay->scb.hdr.qw[6] = (uint64_t)params->rzv_comp;
-
-	union fi_opx_hfi1_packet_payload *const tx_payload =
-		(union fi_opx_hfi1_packet_payload *)replay->payload;
-	assert(((uint8_t *)tx_payload) == ((uint8_t *)&replay->data));
-
-	uintptr_t vaddr_with_offset = ((uint64_t)params->dst_vaddr & -64);
-
-	assert(params->niov == 1);
-
-	tx_payload->tid_cts.iov[0].rbuf = vaddr_with_offset; /* receive buffer virtual address */
-	tx_payload->tid_cts.iov[0].sbuf = (uintptr_t)params->src_iov[0].sbuf; /* send buffer virtual address */
-	tx_payload->tid_cts.iov[0].bytes = params->src_iov[0].bytes; /* number of bytes to transfer */
-	tx_payload->tid_cts.iov[0].rbuf_device = params->src_iov[0].rbuf_device;
-	tx_payload->tid_cts.iov[0].sbuf_device = params->src_iov[0].sbuf_device;
-	tx_payload->tid_cts.iov[0].rbuf_iface = params->src_iov[0].rbuf_iface;
-	tx_payload->tid_cts.iov[0].sbuf_iface = params->src_iov[0].sbuf_iface;
+	assert(params->ntidpairs);
 
-	/* copy tidpairs to packet */
+	const uint64_t vaddr = params->tid_pending_vaddr;
+	const uint64_t tid_vaddr = params->tid_pending_tid_vaddr;
+	const int64_t tid_length = params->tid_pending_tid_length;
+	const int64_t length = params->tid_pending_length;
+
+	/* Register was done based on tid_vaddr and the offset should be set to the page
+	 * offset into the TID now.
+	 * This was done under the mm_lock, but that lock is not required.
+	 * Stop the MISSING_LOCK false positives. */
 	/* coverity[missing_lock] */
-	tx_payload->tid_cts.tid_offset = params->tid_offset;
-	tx_payload->tid_cts.ntidpairs = params->ntidpairs;
-	assert(params->tidpairs[0] != 0);
-	memcpy(&tx_payload->tid_cts.tidpairs, params->tidpairs,
-	       params->ntidpairs * sizeof(uint32_t));
-
-	fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay);
-	fi_opx_reliability_client_replay_register_no_update(
-		&opx_ep->reliability->state, params->slid, params->origin_rs,
-		params->origin_rx, psn_ptr, replay, params->reliability);
 	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-		"===================================== RECV, HFI -- RENDEZVOUS EXPECTED TID RTS (end)\n");
-	return FI_SUCCESS;
+	       "vaddr %p, tid_vaddr %p, diff %#X, registered tid_offset %u/%#X, buffer tid_offset %u/%#X, tid_length %lu/%#lX \n",
+	       (void *)vaddr, (void *)tid_vaddr,
+	       (uint32_t)(vaddr - tid_vaddr), params->tid_offset,
+	       params->tid_offset,
+	       params->tid_offset + (uint32_t)(vaddr - tid_vaddr),
+	       params->tid_offset + (uint32_t)(vaddr - tid_vaddr),
+	       tid_length, tid_length);
+
+	/* Adjust the offset for vaddr byte offset into the tid. */
+	/* coverity[missing_lock] */
+	params->tid_offset += (uint32_t)(vaddr - tid_vaddr);
+
+	const uint64_t iov_adj = ((uint64_t)params->dst_vaddr - vaddr);
+	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+	       " ==== iov[%u].base %p len %zu/%#lX iov_adj %lu/%#lX alignment_adjustment %lu/%#lX\n",
+	       0, (void *) params->dput_iov[0].sbuf,
+	       params->dput_iov[0].bytes, params->dput_iov[0].bytes,
+	       iov_adj, iov_adj,
+	       params->tid_pending_alignment_adjustment,
+	       params->tid_pending_alignment_adjustment);
+
+	params->dput_iov[0].sbuf -= iov_adj;
+	params->dput_iov[0].bytes = (params->dput_iov[0].bytes + iov_adj) & -64;
+	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+	       " ==== iov[%u].base %p len %zu/%#lX alignment_adjustment %lu/%#lX\n",
+	       0, (void *) params->dput_iov[0].sbuf,
+	       params->dput_iov[0].bytes, params->dput_iov[0].bytes,
+	       params->tid_pending_alignment_adjustment,
+	       params->tid_pending_alignment_adjustment);
+
+	/* Adjust the (context) counter with the new length ... */
+	params->rzv_comp->context->byte_counter = length;
+	params->rzv_comp->tid_length = tid_length;
+	params->rzv_comp->tid_vaddr = tid_vaddr;
+	params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_TID;
+
+	FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters
+					.expected_receive.rts_tid_setup_success);
+	FI_OPX_DEBUG_COUNTERS_INC_COND(params->dput_iov[0].rbuf_iface,
+				       params->opx_ep->debug_counters.hmem.tid_recv);
+	FI_OPX_DEBUG_COUNTERS_INC_COND(params->tid_setup_retries > 0,
+				       params->opx_ep->debug_counters.expected_receive.rts_tid_setup_retry_success);
+
+	params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts;
+	return opx_hfi1_rx_rzv_rts_send_cts(work);
 }
 
 void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep,
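Note: opx_hfi1_rx_rzv_rts_tid_setup() leans on the deferred-work machinery: returning -FI_EAGAIN leaves the work item queued with its tid_pending_* fields intact, so the next progress sweep re-invokes the same work_fn, and only after OPX_RTS_TID_SETUP_MAX_TRIES failed registrations is work_fn swapped to the eager CTS path. A condensed sketch of that contract; the queue helpers here are hypothetical:

/* Hypothetical progress-loop fragment showing the work_fn contract. */
union fi_opx_hfi1_deferred_work *work = next_queued_work();  /* placeholder */
int rc = work->work_elem.work_fn(work);  /* tid_setup, later send_cts */
if (rc == -FI_EAGAIN) {
	requeue(work);    /* state (incl. tid_setup_retries) persists */
} else {
	complete(work);   /* FI_SUCCESS: the CTS went out */
}
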
@@ -1389,9 +1331,6 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep,
 	params->opx_ep = opx_ep;
 	params->work_elem.slist_entry.next = NULL;
 
-	params->opcode = opcode;
-	params->fallback_opcode = opcode;
-
 	assert(niov <= MIN(FI_OPX_MAX_HMEM_IOV, FI_OPX_MAX_DPUT_IOV));
 
 	const struct fi_opx_hmem_iov *src_iov = src_iovs;
@@ -1401,20 +1340,20 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep,
 #ifdef OPX_HMEM
 		is_hmem |= src_iov->iface;
 #endif
-		params->src_iov[i].sbuf = src_iov->buf;
-		params->src_iov[i].sbuf_iface = src_iov->iface;
-		params->src_iov[i].sbuf_device = src_iov->device;
-		params->src_iov[i].rbuf = dst_vaddr + rbuf_offset;
-		params->src_iov[i].rbuf_iface = dst_iface;
-		params->src_iov[i].rbuf_device = dst_device;
-		params->src_iov[i].bytes = src_iov->len;
+		params->dput_iov[i].sbuf = src_iov->buf;
+		params->dput_iov[i].sbuf_iface = src_iov->iface;
+		params->dput_iov[i].sbuf_device = src_iov->device;
+		params->dput_iov[i].rbuf = dst_vaddr + rbuf_offset;
+		params->dput_iov[i].rbuf_iface = dst_iface;
+		params->dput_iov[i].rbuf_device = dst_device;
+		params->dput_iov[i].bytes = src_iov->len;
 		rbuf_offset += src_iov->len;
 		++src_iov;
 	}
 
 	if (is_intranode) {
 		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u\n",is_intranode );
-		params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_intranode;
+		params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode;
 		if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) {
 			params->target_hfi_unit = opx_ep->rx->self.hfi1_unit;
 		} else {
@@ -1422,35 +1361,12 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep,
 			assert(hfi_lookup);
 			params->target_hfi_unit = hfi_lookup->hfi_unit;
 		}
-	} else if (is_hmem) {
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_hmem %lu\n",is_hmem);
-		params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring;
-	} else if (opx_ep->use_expected_tid_rzv) {
-		/* further checks on whether TID rts is supported */
-		if(niov != 1) {
-			/* TID rts only supports 1 iov, use eager rts */
-			FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "niov %lu\n", niov);
-			params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring;
-		} else if (!fi_opx_hfi1_sdma_use_sdma(opx_ep, params->src_iov[0].bytes, opcode, is_hmem, is_intranode)) {
-			/* TID rts requires SDMA, use eager rts */
-			FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-				"src_iov[0].bytes %zu, opcode %u, is_hmem %lu is_intranode %u\n",
-				params->src_iov[0].bytes, opcode, is_hmem, is_intranode);
-			params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring;
-		} else {
-			params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_TID;
-			FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-				"opx_ep->use_expected_tid_rzv %u, opcode %u, fallback opcode %u\n",
-				opx_ep->use_expected_tid_rzv, params->opcode, params->fallback_opcode);
-			FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.total_requests);
-			params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_tid;
-		}
-		params->target_hfi_unit = 0xFF;
 	} else {
 		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-			"opx_ep->use_expected_tid_rzv %u, opcode %u\n",
-			opx_ep->use_expected_tid_rzv, params->opcode);
-		params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring;
+			"opx_ep->use_expected_tid_rzv=%u niov=%lu opcode=%u\n",
+			opx_ep->use_expected_tid_rzv, niov, params->opcode);
+
+		params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts;
 		params->target_hfi_unit = 0xFF;
 	}
 	params->work_elem.completion_action = NULL;
@@ -1472,11 +1388,23 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep,
 	params->rzv_comp->context = target_context;
 	params->rzv_comp->invalidate_needed = false;
 	params->dst_vaddr = dst_vaddr;
-	params->immediate_data = immediate_data;
-	params->immediate_end_block_count = immediate_end_block_count,
 	params->is_intranode = is_intranode;
 	params->reliability = reliability;
+	params->tid_pending_vaddr = 0UL;
+	params->tid_pending_tid_vaddr = 0UL;
+	params->tid_pending_length = 0L;
+	params->tid_pending_tid_length = 0L;
+	params->tid_setup_retries = 0;
 	params->ntidpairs = 0;
+	params->opcode = opcode;
+
+	if (opx_hfi1_rx_rzv_rts_tid_eligible(opx_ep, params, niov,
+					     immediate_data,
+					     immediate_end_block_count,
+					     is_hmem, is_intranode,
+					     dst_iface, opcode)) {
+		params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_tid_setup;
+	}
 
 	int rc = params->work_elem.work_fn(work);
 	if(rc == FI_SUCCESS) {
@@ -2129,6 +2057,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 	const void* sbuf_start = (opx_mr == NULL) ? 0 : opx_mr->iov.iov_base;
 	const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf;
 	assert(params->ntidpairs != 0);
+	assert(niov == 1);
 
 	/* Note that lrh_dlid is just the version of params->slid shifted so
 	   that it can be OR'd into the correct position in the packet header */
@@ -2148,7 +2077,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 	const uint64_t max_dput_bytes = max_eager_bytes;
 
 	FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-		"%p:===================================== SEND DPUT SDMA TID, opcode %X -- (begin)\n", params, opcode);
+		"%p:===================================== SEND DPUT SDMA TID, opcode %X -- (begin)\n",
+		params, opcode);
 
 	for (i=params->cur_iov; i<niov; i++) {
-		FI_OPX_DEBUG_COUNTERS_INC_COND_N((opx_ep->debug_counters.expected_receive.first_tidpair_minoffset == 0), params->tidoffset, opx_ep->debug_counters.expected_receive.first_tidpair_minoffset);
-		FI_OPX_DEBUG_COUNTERS_MIN_OF(opx_ep->debug_counters.expected_receive.first_tidpair_minoffset, params->tidoffset);
-		FI_OPX_DEBUG_COUNTERS_MAX_OF(opx_ep->debug_counters.expected_receive.first_tidpair_maxoffset, params->tidoffset);
+		FI_OPX_DEBUG_COUNTERS_INC_COND_N((opx_ep->debug_counters.expected_receive.first_tidpair_minoffset == 0),
+						 params->tidoffset,
+						 opx_ep->debug_counters.expected_receive.first_tidpair_minoffset);
+		FI_OPX_DEBUG_COUNTERS_MIN_OF(opx_ep->debug_counters.expected_receive.first_tidpair_minoffset,
+					     params->tidoffset);
+		FI_OPX_DEBUG_COUNTERS_MAX_OF(opx_ep->debug_counters.expected_receive.first_tidpair_maxoffset,
+					     params->tidoffset);
 
 		tididx = 0;
 		tidpairs = (uint32_t *)params->tid_iov.iov_base;
@@ -2178,7 +2112,11 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 			tidlen_consumed =  params->tidoffset / OPX_HFI1_TID_PAGESIZE ;
 			tidlen_remaining -= tidlen_consumed;
 			if (tidlen_consumed) {
-				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "params->tidoffset %u, tidlen_consumed %u, tidlen_remaining %u, length %llu\n", params->tidoffset, tidlen_consumed, tidlen_remaining, FI_OPX_EXP_TID_GET(tidpairs[0],LEN));
+				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+				       "params->tidoffset %u, tidlen_consumed %u, tidlen_remaining %u, length %llu\n",
+				       params->tidoffset, tidlen_consumed,
+				       tidlen_remaining,
+				       FI_OPX_EXP_TID_GET(tidpairs[0],LEN));
 			}
 		} else { /* eagain retry, restore previous TID state */
 			tidpairs = (uint32_t *)params->tid_iov.iov_base;
@@ -2190,15 +2128,25 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 
 		uint32_t starting_tid_idx = tididx;
 
-		assert(i == 0);
 		uint8_t * sbuf = (uint8_t*)((uintptr_t)sbuf_start + (uintptr_t)dput_iov[i].sbuf + params->bytes_sent);
 		uintptr_t rbuf = dput_iov[i].rbuf + params->bytes_sent;
 		uint64_t bytes_to_send = dput_iov[i].bytes - params->bytes_sent;
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, " sbuf %p, sbuf_start %p, dput_iov[%u].sbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, origin_byte_counter %ld\n", sbuf, sbuf_start, i, (void*)dput_iov[i].sbuf, dput_iov[i].bytes, dput_iov[i].bytes, params->bytes_sent, params->bytes_sent, bytes_to_send, bytes_to_send, params->origin_byte_counter? *(params->origin_byte_counter):-1UL);
-		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, " rbuf %p, dput_iov[%u].rbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, first_tidoffset %u/%#X first_tidoffset_page_adj %u/%#X \n", (void*)rbuf, i, (void *)dput_iov[i].rbuf, dput_iov[i].bytes, dput_iov[i].bytes, params->bytes_sent, params->bytes_sent, bytes_to_send, bytes_to_send, first_tidoffset, first_tidoffset, first_tidoffset_page_adj, first_tidoffset_page_adj);
+		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+		       " sbuf %p, sbuf_start %p, dput_iov[%u].sbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, origin_byte_counter %ld\n",
		       sbuf, sbuf_start, i, (void*)dput_iov[i].sbuf,
+		       dput_iov[i].bytes, dput_iov[i].bytes,
+		       params->bytes_sent, params->bytes_sent,
+		       bytes_to_send, bytes_to_send,
+		       params->origin_byte_counter ? *(params->origin_byte_counter) : -1UL);
+		FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+		       " rbuf %p, dput_iov[%u].rbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, first_tidoffset %u/%#X first_tidoffset_page_adj %u/%#X \n",
+		       (void*)rbuf, i, (void *)dput_iov[i].rbuf,
+		       dput_iov[i].bytes, dput_iov[i].bytes,
+		       params->bytes_sent, params->bytes_sent,
+		       bytes_to_send, bytes_to_send,
+		       first_tidoffset, first_tidoffset,
+		       first_tidoffset_page_adj, first_tidoffset_page_adj);
 		while (bytes_to_send > 0) {
 			fi_opx_hfi1_sdma_poll_completion(opx_ep);
 			if (!params->sdma_we) {
@@ -2222,7 +2170,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 							params->slid,
 							params->origin_rs,
 							params->u8_rx,
-							FI_HMEM_SYSTEM, 0);
+							dput_iov[i].sbuf_iface,
+							(int) dput_iov[i].sbuf_device);
 				}
 				assert(!fi_opx_hfi1_sdma_has_unsent_packets(params->sdma_we));
@@ -2256,7 +2205,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 			if (psns_avail < (int64_t) packet_count) {
 				FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.eagain_psn);
 				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
-					"%p:===================================== SEND DPUT SDMA TID, !PSN FI_EAGAIN\n",params);
+					"%p:===================================== SEND DPUT SDMA TID, !PSN FI_EAGAIN\n",
+					params);
 				return -FI_EAGAIN;
 			}
 #ifndef OPX_RELIABILITY_TEST /* defining this will force reliability replay of some packets */
@@ -2292,7 +2242,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 				OPX_HMEM_COPY_FROM(params->sdma_we->bounce_buf.buf, sbuf,
 						   MIN((packet_count * max_dput_bytes), bytes_to_send),
-						   FI_HMEM_SYSTEM, 0ul);
+						   dput_iov[i].sbuf_iface,
+						   dput_iov[i].sbuf_device);
 				sbuf_tmp = params->sdma_we->bounce_buf.buf;
 			} else {
 				sbuf_tmp = sbuf;
 			}
@@ -2310,18 +2261,26 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 				assert(packet_bytes <= FI_OPX_HFI1_PACKET_MTU);
 				if (p == 0) { /* First packet header is user's responsibility even with SDMA header auto-generation*/
 					/* set fields for first header */
+					unsigned offset_shift;
 					starting_tid_idx = tididx; /* first tid this write() */
-					if ((FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN)) >= KDETH_OM_MAX_SIZE/OPX_HFI1_TID_PAGESIZE) {
+					if ((FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN)) >=
+					    (KDETH_OM_MAX_SIZE / OPX_HFI1_TID_PAGESIZE)) {
 						tidOMshift = (1 << HFI_KHDR_OM_SHIFT);
-						tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE) + first_tidoffset_page_adj) >> KDETH_OM_LARGE_SHIFT;
-						FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n",params,tidoffset, tidoffset << KDETH_OM_LARGE_SHIFT, first_tidoffset, first_tidoffset_page_adj);
+						offset_shift = KDETH_OM_LARGE_SHIFT;
 					} else {
 						tidOMshift = 0;
-						tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE) + first_tidoffset_page_adj) >> KDETH_OM_SMALL_SHIFT;
-						FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n",params,tidoffset, tidoffset << KDETH_OM_SMALL_SHIFT, first_tidoffset, first_tidoffset_page_adj);
+						offset_shift = KDETH_OM_SMALL_SHIFT;
 					}
+					tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE)
+						     + first_tidoffset_page_adj)
+						    >> offset_shift;
+					FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+					       "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n",
+					       params, tidoffset,
+					       tidoffset << offset_shift,
+					       first_tidoffset,
+					       first_tidoffset_page_adj);
 				}
-				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count);
 
 				/* Save current values in case we can't process this packet (!REPLAY)
 				   and need to restore state */
@@ -2339,11 +2298,17 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 					} else {
 						packet_bytes = MIN(packet_bytes, FI_OPX_HFI1_PACKET_MTU-first_tidoffset_page_adj);
 					}
-					FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count);
 					assert(tididx == 0);
 					first_tidoffset = 0; /* offset ONLY for first tid from cts*/
 					first_tidoffset_page_adj = 0;
 				}
+				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+				       "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",
+				       params, tididx, tidlen_remaining,
+				       packet_bytes, first_tidoffset,
+				       first_tidoffset_page_adj,
+				       packet_count);
+
 				/* Check tid for each packet and determine if SDMA header auto-generation will
 				   use 4k or 8k packet */
 				/* Assume any CTRL 3 tidpair optimizations were already done, or are not wanted,
@@ -2364,12 +2329,16 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work)
 					if(tididx == 0) first_tid_last_packet = true;/* First tid even though tididx ++*/
 #endif
 					tididx++;
-					FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u/%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,params->ntidpairs, tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count);
 					tidlen_remaining = FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN);
 					tidlen_consumed =  0;
-				} else {
-					FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count);
 				}
+				FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA,
+				       "%p:tid[%u/%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",
+				       params, tididx, params->ntidpairs,
+				       tidlen_remaining, packet_bytes,
+				       first_tidoffset,
+				       first_tidoffset_page_adj,
+				       packet_count);
 
 				struct fi_opx_reliability_tx_replay *replay;
 				replay = fi_opx_reliability_client_replay_allocate(
@@ -2381,8 +2350,9 @@ int
fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) tidlen_consumed = prev_tidlen_consumed; tidlen_remaining = prev_tidlen_remaining; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:!REPLAY on packet %u out of %lu, params->sdma_we->num_packets %u\n", - params, p, packet_count, params->sdma_we->num_packets); + "%p:!REPLAY on packet %u out of %lu, params->sdma_we->num_packets %u\n", + params, p, packet_count, + params->sdma_we->num_packets); break; } replay->use_sdma = true; /* Always replay TID packets with SDMA */ @@ -2430,7 +2400,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) if (OFI_UNLIKELY(params->sdma_we->num_packets == 0)) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.eagain_replay); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:===================================== SEND DPUT SDMA TID, !REPLAY FI_EAGAIN\n",params); + "%p:===================================== SEND DPUT SDMA TID, !REPLAY FI_EAGAIN\n", + params); return -FI_EAGAIN; } @@ -2456,7 +2427,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:===================================== SEND DPUT SDMA TID, finished IOV=%d(%d) bytes_sent=%ld\n", - params,params->cur_iov, niov, params->bytes_sent); + params,params->cur_iov, niov, params->bytes_sent); params->bytes_sent = 0; params->cur_iov++; @@ -2566,7 +2537,6 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ uint32_t *tidpairs = NULL; if (hfi1_hdr->cts.target.vaddr.opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) { - assert(!is_hmem); ntidpairs = hfi1_hdr->cts.target.vaddr.ntidpairs; if (ntidpairs) { tidpairs = ((union fi_opx_hfi1_packet_payload *)payload)->tid_cts.tidpairs; @@ -2905,6 +2875,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, #endif /* Expected tid needs to send a leading data block and a trailing * data block for alignment. Limit this to SDMA (8K+) for now */ + const uint64_t immediate_block_count = (len > FI_OPX_SDMA_MIN_LENGTH && opx_ep->use_expected_tid_rzv) ? 1 : 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 00305790a3b..1d1b91f7b42 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -645,6 +645,8 @@ static void do_static_assert_tests() "sizeof(fi_opx_context_ext) should be a multiple of 32") ; OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_hmem_info) >> 3) == OPX_HMEM_SIZE_QWS, "sizeof(fi_opx_hmem_info) >> 3 != OPX_HMEM_SIZE_QWS") ; + OPX_COMPILE_TIME_ASSERT(OPX_HFI1_TID_PAGESIZE == 4096, + "OPX_HFI1_TID_PAGESIZE must be 4K!"); } #pragma GCC diagnostic pop diff --git a/prov/opx/src/fi_opx_service.c b/prov/opx/src/fi_opx_service.c index 3d3f1d20161..e6ccfc2f4bf 100644 --- a/prov/opx/src/fi_opx_service.c +++ b/prov/opx/src/fi_opx_service.c @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2023 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -23,7 +23,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2022 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -345,8 +345,8 @@ int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) [OPX_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, [OPX_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, [OPX_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, -#ifdef PSM_CUDA - [OPX_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, +#ifdef OPX_HMEM + [OPX_HFI_CMD_TID_UPDATE_V3] = {HFI1_IOCTL_TID_UPDATE_V3 , 0}, #endif }; _HFI_INFO("command OPX_HFI_CMD %#X, HFI1_IOCTL %#X\n",cmd->type, cmdTypeToIoctlNum[cmd->type].ioctlCmd); diff --git a/prov/opx/src/fi_opx_tid_cache.c b/prov/opx/src/fi_opx_tid_cache.c index a647e2b9d6f..667c77a9bd3 100644 --- a/prov/opx/src/fi_opx_tid_cache.c +++ b/prov/opx/src/fi_opx_tid_cache.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * Copyright (C) 2022-2023 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * Copyright (c) 2016-2017 Cray Inc. All rights reserved. * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. @@ -83,7 +83,7 @@ * struct opx_tid_mr *opx_mr = (struct opx_tid_mr *)entry->data; * * The TID memory region (mr) has TID info for that mr that is - * registered/ioctl(update) and deregisered/ioctl(free) + * registered/ioctl(update) and deregistered/ioctl(free) * * struct opx_mr_tid_info * tid_info = &opx_mr->tid_info; * @@ -243,15 +243,15 @@ static int opx_util_mr_find_within(struct ofi_rbmap *map, void *key, void *data) void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { - uint32_t *tidinfo = (uint32_t *)&OPX_TID_INFO(tid_reuse_cache, 0); - uint32_t ntidinfo = OPX_TID_NINFO(tid_reuse_cache); - uint32_t *tidpairs = &OPX_TID_PAIR(tid_reuse_cache, 0); - OPX_TID_NPAIRS(tid_reuse_cache) = 0; + uint32_t *tidinfo = (uint32_t *)&tid_reuse_cache->info[0]; + uint32_t ntidinfo = tid_reuse_cache->ninfo; + uint32_t *tidpairs = &tid_reuse_cache->pairs[0]; + tid_reuse_cache->npairs = 0; size_t accumulated_len = 0; int32_t tid_idx = 0, pair_idx = -1; unsigned int npages = 0; OPX_DEBUG_TIDS("Input tidinfo", ntidinfo, tidinfo); - uint32_t tid_length = OPX_TID_LENGTH(tid_reuse_cache); + uint32_t tid_length = tid_reuse_cache->tid_length; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY tid_idx %u, ntidinfo %u, accumulated_len %zu, length_pages %u\n", tid_idx, ntidinfo, accumulated_len, tid_length); @@ -272,73 +272,47 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, (len >= 128), opx_ep->debug_counters.expected_receive.tid_buckets[3]); #endif - if (FI_OPX_EXP_TID_GET(tidinfo[tid_idx], CTRL) == 1) { - npages += - (int)FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN) * - OPX_HFI1_TID_PAGESIZE; + size_t tid_pages = FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN); + size_t tid_pages_len = tid_pages * OPX_HFI1_TID_PAGESIZE; + uint64_t tid_ctrl = FI_OPX_EXP_TID_GET(tidinfo[tid_idx], CTRL); + /* Starts with CTRL 1 *or* it's the first entry (tid_idx == 0) + and starts with ONLY CTRL 2, just accumulate it, no previous + CTRL 1 to pair */ + if (tid_idx == 0 || tid_ctrl == 1) { + npages += (int) tid_pages; + accumulated_len += tid_pages_len; pair_idx++; tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { - if (tid_idx == 0) { - /* Starts with ONLY CTRL 2, just accumulate it - - no previous CTRL 1 to pair */ - npages += (int)FI_OPX_EXP_TID_GET( - 
tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN) * - OPX_HFI1_TID_PAGESIZE; + } else { /* possible CTRL 1/2 tid pair */ + assert(tid_ctrl == 2); + npages += tid_pages; + accumulated_len += tid_pages_len; + if ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], IDX) != + FI_OPX_EXP_TID_GET(tidinfo[tid_idx], IDX)) + || (FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], CTRL) != 1) + || ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], LEN) + + tid_pages) > 512)) { + /* Can't combine into CTRL 3 if : + - not the same IDX or + - previous was not CTRL 1 or + - combined LEN > 512 + + Offset field (OFFSET): For expected receive packets this offset is added to the address field + associated with the specified TID to determine a physical address. This physical address is then + used to DMA the data portion of the received packet to system memory. If OM is 0 the 15-bit + OFFSET can address a 128KB mapping in DW multiples. If OM is 1 the 15-bit OFFSET can address a + 2MB mapping in 64B multiples. + + 512 pages is 2MB. So even if a "tid pair" *seems* to be available, it won't work over 512 pages + so keep ctrl 1 tid and ctrl 2 tid separate, do not optimize into ctrl 3 tidpair + */ pair_idx++; tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { /* possible CTRL 1/2 tid pair */ - assert(FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - CTRL) == 2); - npages += (int)FI_OPX_EXP_TID_GET( - tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN) * - OPX_HFI1_TID_PAGESIZE; - if ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - IDX) != - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - IDX)) || - (FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - CTRL) != 1) || - ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - LEN) + - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN)) > 512)) { - /* Can't combine into CTRL 3 if : - - not the same IDX or - - previous was not CTRL 1 or - - combined LEN > 512 - - Offset field (OFFSET): For expected receive packets this offset is added to the address field - associated with the specified TID to determine a physical address. This physical address is then - used to DMA the data portion of the received packet to system memory. If OM is 0 the 15-bit - OFFSET can address a 128KB mapping in DW multiples. If OM is 1 the 15-bit OFFSET can address a - 2MB mapping in 64B multiples. - - 512 pages is 2MB. 
So even if a "tid pair" *seems* to be available, it won't work over 512 pages - so keep ctrl 1 tid and ctrl 2 tid separate, do not optimize into ctrl 3 tidpair - */ - pair_idx++; - tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { - FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], - CTRL, 0x3); - int32_t len = - FI_OPX_EXP_TID_GET( - tidinfo[tid_idx - 1], - LEN) + - FI_OPX_EXP_TID_GET( - tidinfo[tid_idx], LEN); - FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], - LEN, len); - } + } else { + FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], CTRL, 0x3); + int32_t len = tid_pages + + FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], LEN); + FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], LEN, len); } } tid_idx++; @@ -360,9 +334,9 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, opx_ep->debug_counters.expected_receive.first_tidpair_maxlen, first_pair_len); #endif - OPX_TID_NPAIRS(tid_reuse_cache) = pair_idx + 1; - OPX_DEBUG_TIDS("Regen tidpairs", OPX_TID_NPAIRS(tid_reuse_cache), - &OPX_TID_PAIR(tid_reuse_cache, 0)); + tid_reuse_cache->npairs = pair_idx + 1; + OPX_DEBUG_TIDS("Regen tidpairs", tid_reuse_cache->npairs, + &tid_reuse_cache->pairs[0]); (void) npages; } @@ -371,82 +345,75 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, * * Hold the cache->lock across registering the TIDs */ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, + enum fi_hmem_iface tid_iface, + uint64_t tid_device, struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { + uint64_t flags = (uint64_t) OPX_HMEM_KERN_MEM_TYPE[tid_iface]; + /* Parameters must be aligned for expected receive */ - assert(tid_length == (tid_length & -64)); - assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_HFI1_TID_PAGESIZE)); - assert(tid_length == (tid_length & -(int64_t)OPX_HFI1_TID_PAGESIZE)); + assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); + assert(tid_length == (tid_length & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); /* Assert precondition that the lock is held with a trylock assert */ assert(pthread_mutex_trylock(&opx_ep->tid_domain->tid_cache->lock) == EBUSY); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, "vaddr %p, length %lu/%lu\n", - (void *)tid_vaddr, tid_length, - (tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) & - -OPX_HFI1_TID_PAGESIZE); - tid_length = (tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) & - -OPX_HFI1_TID_PAGESIZE; + FI_DBG(fi_opx_global.prov, FI_LOG_MR, "vaddr %p, length %lu\n", + (void *)tid_vaddr, tid_length); /* TODO: Remove opx_ep - we aren't registering for an ep, it's domain-wide */ struct _hfi_ctrl *ctx = opx_ep->hfi->ctrl; -#ifndef NDEBUG - /* switching to use the #define more consistently, but assert it's correct - with respect to hfi configuration */ - const uint32_t pg_sz = ctx->__hfi_pg_sz; - assert(pg_sz == OPX_HFI1_TID_PAGESIZE); - assert(sysconf(_SC_PAGESIZE) == OPX_HFI1_TID_PAGESIZE); - /* Unfortunately, for now, we assume 2 TID pages per 8K packet */ - assert(OPX_HFI1_TID_PAGESIZE == 4096); -#endif const uint32_t max_tidcnt = ctx->__hfi_tidexpcnt; - assert(ctx->__hfi_tidexpcnt <= OPX_MAX_TID_COUNT); - if (OFI_UNLIKELY(tid_length > - (max_tidcnt * OPX_HFI1_TID_PAGESIZE))) { + assert(max_tidcnt <= OPX_MAX_TID_COUNT); + + const uint64_t max_tidlen = max_tidcnt * OPX_TID_PAGE_SIZE[tid_iface]; + if (OFI_UNLIKELY(tid_length > max_tidlen)) { /* This is somewhat arbitrary - if we "chunk" the TID updates we might be able * to do larger buffers using multiple update calls. 
*/ FI_WARN(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_EXIT Max length exceeded, %lu\n", - tid_length); - OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, - "UPDATE LENGTH EXCEEDED"); + "OPX_DEBUG_EXIT Max TID length exceeded, %lu > %lu\n", + tid_length, max_tidlen); + OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE LENGTH EXCEEDED"); return -1; } - uint32_t tidcnt = - (uint32_t)((tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) >> 12); - /* Eventually we might need to "chunk" updates, thus the naming here */ - uint32_t tidcnt_chunk = tidcnt; - uint32_t length_chunk = OPX_HFI1_TID_PAGESIZE * tidcnt; /* tid update takes uint32_t, not uint64_t length */ + + uint32_t tidcnt = (uint32_t) (tid_length / OPX_TID_PAGE_SIZE[tid_iface]); if (OFI_UNLIKELY(tidcnt > max_tidcnt)) { FI_WARN(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_EXIT Max TIDs exceeded, %u > %u\n", tidcnt, - max_tidcnt); + "OPX_DEBUG_EXIT Max TIDs exceeded, %u > %u\n", + tidcnt, max_tidcnt); OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE NTIDS EXCEEDED"); OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u OPX_TID_CACHE_DEBUG Update number of TIDs (%u) exceeded\n", __func__, __LINE__, tidcnt); return -1; } + + /* Eventually we might need to "chunk" updates, thus the naming here */ + uint32_t length_chunk = (uint32_t) tid_length; /* new (cumulative) vaddr/length of this operation*/ uint64_t new_vaddr = tid_vaddr; - uint64_t new_length = length_chunk; /* page aligned length */ - assert((OPX_TID_LENGTH(tid_reuse_cache) == 0) && - (OPX_TID_VADDR(tid_reuse_cache) == 0)); + assert((tid_reuse_cache->tid_length == 0) && + (tid_reuse_cache->tid_vaddr == 0)); - uint64_t *tidlist = (uint64_t *)&OPX_TID_INFO(tid_reuse_cache, 0); + uint64_t *tidlist = (uint64_t *)&tid_reuse_cache->info[0]; FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_ENTRY buffer range [%#lx - %#lx] length %lu %u, new range [%#lx - %#lx] length %lu %u, tidcnt %u, tidlist %p\n", + "OPX_DEBUG_ENTRY buffer range [%#lx - %#lx] length %lu %u, new range [%#lx - %#lx] length %u, tidcnt %u, tidlist %p iface %u flags %#lx\n", tid_vaddr, tid_vaddr + tid_length, tid_length, length_chunk, - new_vaddr, new_vaddr + new_length, new_length, length_chunk, - tidcnt, tidlist); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "update tid length %#X, pages (tidcnt) %u\n", length_chunk, - tidcnt); - assert(tid_vaddr + tid_length <= - tid_vaddr + (tidcnt * OPX_HFI1_TID_PAGESIZE)); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "opx_hfi_update_tid vaddr [%#lx - %#lx], length %u\n", tid_vaddr, - tid_vaddr + length_chunk, length_chunk); + new_vaddr, new_vaddr + length_chunk, length_chunk, + tidcnt, tidlist, tid_iface, flags); + + if (tid_iface == FI_HMEM_CUDA) { + int err = cuda_set_sync_memops((void *) tid_vaddr); + if (OFI_UNLIKELY(err != 0)) { + FI_WARN(fi_opx_global.prov, FI_LOG_MR, + "cuda_set_sync_memops(%p) FAILED (returned %d)\n", + (void *) tid_vaddr, err); + return -1; + } + } + + uint32_t tidcnt_chunk; /* return code is ignored in favor of length/tidcnt checks * because the ioctl can "succeed" (return code 0) within * resource limitations and the updated length/tidcnt will @@ -457,9 +424,9 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, &length_chunk, /* input/output*/ (uint64_t)tidlist, /* input/output ptr cast as uint64_t */ &tidcnt_chunk, /* output */ - 0); - FI_OPX_DEBUG_COUNTERS_INC( - opx_ep->debug_counters.expected_receive.tid_updates); + flags); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_updates); + FI_OPX_DEBUG_COUNTERS_INC_COND(tid_iface > FI_HMEM_SYSTEM, 
opx_ep->debug_counters.hmem.tid_update); FI_DBG(fi_opx_global.prov, FI_LOG_MR, "opx_hfi_update_tid return length %u, tidcnt %u\n", length_chunk, tidcnt_chunk); @@ -469,10 +436,14 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, if (OFI_UNLIKELY(((uint64_t)length_chunk < tid_length) || (tidcnt_chunk == 0))) { /* errors generally mean we hit the TID resource limit */ FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_resource_limit); + FI_OPX_DEBUG_COUNTERS_INC_COND(length_chunk < tid_length, + opx_ep->debug_counters.expected_receive.tid_resource_limit_length_chunk_short); + FI_OPX_DEBUG_COUNTERS_INC_COND(tidcnt_chunk == 0, + opx_ep->debug_counters.expected_receive.tid_resource_limit_tidcnt_chunk_zero); FI_WARN(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_EXIT opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", tid_vaddr, tid_length, length_chunk, tidcnt_chunk); - if(tidcnt_chunk == 0) { + if (tidcnt_chunk == 0) { /* The return length is untouched, so update it for the recovery calculations below */ length_chunk = 0; @@ -483,37 +454,41 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, pthread_mutex_lock(&opx_ep->tid_domain->tid_cache->lock); /* Attempt one recovery ioctl()*/ - uint32_t new_length_chunk = (OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk; + uint32_t new_length_chunk = tid_length - length_chunk; uint32_t new_tidcnt_chunk = 0; /* Frustrating mix of uint32_t/uint64_t*/ - uint32_t *new_tidinfo = &OPX_TID_INFO(tid_reuse_cache, tidcnt_chunk); + uint32_t *new_tidinfo = &tid_reuse_cache->info[tidcnt_chunk]; opx_hfi_update_tid( ctx, (tid_vaddr + length_chunk), /* input */ &new_length_chunk, /* input/output*/ (uint64_t)new_tidinfo, /* input/output ptr cast as uint64_t */ &new_tidcnt_chunk, /* output */ - 0); + flags); FI_OPX_DEBUG_COUNTERS_INC( opx_ep->debug_counters.expected_receive.tid_updates); + FI_OPX_DEBUG_COUNTERS_INC_COND(tid_iface > FI_HMEM_SYSTEM, + opx_ep->debug_counters.hmem.tid_update); FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "opx_hfi_update_tid return length %u, tidcnt %u\n", new_length_chunk, - new_tidcnt_chunk); - if (OFI_UNLIKELY(((uint64_t)length_chunk + (uint64_t)new_length_chunk) < tid_length) || - (new_tidcnt_chunk == 0)) { + "opx_hfi_update_tid return length %u, tidcnt %u\n", + new_length_chunk, new_tidcnt_chunk); + if (OFI_UNLIKELY((length_chunk + new_length_chunk) < tid_length) || + (new_tidcnt_chunk == 0)) { #ifdef OPX_IOCTL_DEBUG fprintf(stderr, "## FAILED RECOVERY opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", tid_vaddr, tid_length, length_chunk, tidcnt_chunk); fprintf(stderr, "## FAILED RECOVERY opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", - (tid_vaddr + length_chunk),(OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk, new_length_chunk, new_tidcnt_chunk); + (tid_vaddr + length_chunk), + (OPX_TID_PAGE_SIZE[tid_iface] * tidcnt) - length_chunk, + new_length_chunk, new_tidcnt_chunk); #endif OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE/NEW FAILED"); /* free first partial update, it's not useful */ if (length_chunk) { OPX_FPRINTF_TIDS("Partially updated tidinfo", (tidcnt_chunk + new_tidcnt_chunk), - &OPX_TID_INFO(tid_reuse_cache, 0)); + &tid_reuse_cache->info[0]); opx_hfi_free_tid(ctx, (uint64_t)tidlist, tidcnt_chunk); } OPX_TID_CACHE_DEBUG_FPRINTF( @@ -536,23 +511,31 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, tid_vaddr, tid_length, length_chunk, tidcnt_chunk); fprintf(stderr, "## SUCCESS RECOVERY 
opx_hfi_update_tid on vaddr %#lX, length %lu/%u, tidcnt %u\n", - (tid_vaddr + length_chunk),(OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk, new_length_chunk, new_tidcnt_chunk); + (tid_vaddr + length_chunk), + (OPX_TID_PAGE_SIZE[tid_iface] * tidcnt) - length_chunk, + new_length_chunk, new_tidcnt_chunk); } #endif /* Successfully recovered */ tidcnt_chunk += new_tidcnt_chunk; + length_chunk += new_length_chunk; OPX_FPRINTF_TIDS("Recovered partially updated tidinfo", tidcnt_chunk, - &OPX_TID_INFO(tid_reuse_cache, 0)); + &tid_reuse_cache->info[0]); + } else if (length_chunk > tid_length) { + FI_DBG(fi_opx_global.prov, FI_LOG_MR, + "opx_hfi_update_tid gave larger than requested range! requested length %lu, return length %u, tidcnt %u\n", + tid_length, length_chunk, tidcnt_chunk); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive + .tid_resource_limit_length_chunk_long); } assert(tidcnt_chunk <= FI_OPX_MAX_DPUT_TIDPAIRS); OPX_DEBUG_TIDS("Updated tidinfo", tidcnt_chunk, - (&(OPX_TID_INFO(tid_reuse_cache, 0)))); - OPX_TID_VADDR(tid_reuse_cache) = new_vaddr; - OPX_TID_LENGTH(tid_reuse_cache) = new_length; - OPX_TID_NINFO(tid_reuse_cache) += - tidcnt_chunk; /* appended or replaced */ - OPX_TID_VALID(tid_reuse_cache); + (&(tid_reuse_cache->info[0]))); + tid_reuse_cache->tid_vaddr = new_vaddr; + tid_reuse_cache->tid_length = length_chunk; + tid_reuse_cache->ninfo += tidcnt_chunk; /* appended or replaced */ + tid_reuse_cache->invalid = 0; OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE/NEW"); @@ -566,9 +549,9 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, "OPX_DEBUG_EXIT UPDATED TIDs vaddr [%#lx - %#lx] length %lu, tid vaddr [%#lx - %#lx] , tid length %lu, number of TIDs %u\n", tid_vaddr, tid_vaddr + tid_length, tid_length, - OPX_TID_VADDR(tid_reuse_cache), - OPX_TID_VADDR(tid_reuse_cache) + OPX_TID_LENGTH(tid_reuse_cache), - OPX_TID_LENGTH(tid_reuse_cache), OPX_TID_NINFO(tid_reuse_cache)); + tid_reuse_cache->tid_vaddr, + tid_reuse_cache->tid_vaddr + tid_reuse_cache->tid_length, + tid_reuse_cache->tid_length, tid_reuse_cache->ninfo); opx_regen_tidpairs(opx_ep, tid_reuse_cache); return 0; @@ -582,8 +565,8 @@ void opx_deregister_tid_region(struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { struct _hfi_ctrl *ctx = opx_ep->hfi->ctrl; - uint32_t old_ntidinfo = OPX_TID_NINFO(tid_reuse_cache); - uint64_t *old_tidlist = (uint64_t *)&OPX_TID_INFO(tid_reuse_cache, 0); + uint32_t old_ntidinfo = tid_reuse_cache->ninfo; + uint64_t *old_tidlist = (uint64_t *)&tid_reuse_cache->info[0]; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY vaddr %p, length %lu, opx_hfi_free_tid %u tidpairs\n", (void *)tid_reuse_cache->tid_vaddr, tid_reuse_cache->tid_length, @@ -658,9 +641,9 @@ void opx_tid_cache_delete_region(struct ofi_mr_cache *cache, const size_t iov_len = entry->info.iov.iov_len; assert(entry->use_cnt == 0); /* Is this region current? 
deregister it */ - if (!OPX_TID_IS_INVALID(tid_reuse_cache) && - (OPX_TID_LENGTH(tid_reuse_cache) == iov_len) && - (OPX_TID_VADDR(tid_reuse_cache) == (uint64_t)iov_base)) { + if (!tid_reuse_cache->invalid && + (tid_reuse_cache->tid_length == iov_len) && + (tid_reuse_cache->tid_vaddr == (uint64_t)iov_base)) { FI_DBG(cache->domain->prov, FI_LOG_MR, "ENTRY cache %p, entry %p, data %p, iov_base %p, iov_len %zu\n", cache, entry, opx_mr, iov_base, iov_len); @@ -668,7 +651,7 @@ void opx_tid_cache_delete_region(struct ofi_mr_cache *cache, } else { FI_DBG(cache->domain->prov, FI_LOG_MR, "ENTRY OPX_TID_IS_INVALID==%u cache %p, entry %p, data %p, iov_base %p, iov_len %zu\n", - OPX_TID_IS_INVALID(tid_reuse_cache), cache, entry, opx_mr, iov_base, iov_len); + tid_reuse_cache->invalid, cache, entry, opx_mr, iov_base, iov_len); } memset(opx_mr, 0x00, sizeof(*opx_mr)); @@ -798,7 +781,7 @@ int opx_tid_cache_init(struct util_domain *domain, __OPX_FORCE_INLINE__ struct ofi_mr_entry *opx_mr_rbt_find(struct ofi_rbmap *tree, - const struct ofi_mr_info *key) + const struct ofi_mr_info *key) { struct ofi_rbnode *node; @@ -927,7 +910,6 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, /* drop the mm lock across alloc/register */ pthread_mutex_unlock(&mm_lock); *entry = opx_mr_entry_alloc(cache); - assert((*entry)->use_cnt == 0); if (!*entry) { FI_DBG(cache->domain->prov, FI_LOG_MR, "OPX_DEBUG_ENTRY FI_NOMEM [%p - %p] (len: %zu/%#lX) \n", @@ -938,6 +920,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, pthread_mutex_lock(&mm_lock); return -FI_ENOMEM; } + assert((*entry)->use_cnt == 0); (*entry)->node = NULL; (*entry)->info = *info; @@ -964,15 +947,15 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, struct opx_mr_tid_info *tid_info = &opx_mr->tid_info; - OPX_TID_NINFO(tid_info) = 0; - OPX_TID_NPAIRS(tid_info) = 0; - OPX_TID_VADDR(tid_info) = 0UL; - OPX_TID_LENGTH(tid_info) = 0UL; + tid_info->ninfo = 0; + tid_info->npairs = 0; + tid_info->tid_vaddr = 0UL; + tid_info->tid_length = 0UL; #ifndef NDEBUG for (int i = 0; i < FI_OPX_MAX_DPUT_TIDPAIRS; ++i) { - OPX_TID_INFO(tid_info, i) = -1U; - OPX_TID_PAIR(tid_info, i) = -1U; + tid_info->info[i] = -1U; + tid_info->pairs[i] = -1U; } #endif @@ -982,11 +965,12 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, /* Hold the cache->lock across registering the TIDs */ pthread_mutex_lock(&cache->lock); if (opx_register_tid_region((uint64_t)info->iov.iov_base, - (uint64_t)info->iov.iov_len, opx_ep, - tid_info)) { + (uint64_t)info->iov.iov_len, + info->iface, info->device, + opx_ep, tid_info)) { FI_DBG(fi_opx_global.prov, FI_LOG_MR, "opx_register_tid_region failed\n"); - /* Failed, OPX_TID_NINFO(tid_info) will be zero */ + /* Failed, tid_info->ninfo will be zero */ FI_DBG(fi_opx_global.prov, FI_LOG_MR, "FREE node %p\n", (*entry)->node); pthread_mutex_unlock(&cache->lock); @@ -1002,9 +986,9 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, "NEW vaddr [%#lx - %#lx] length %lu, tid vaddr [%#lx - %#lx] , tid length %lu\n", (uint64_t)info->iov.iov_base, (uint64_t)info->iov.iov_base + (uint64_t)info->iov.iov_len, - (uint64_t)info->iov.iov_len, OPX_TID_VADDR(tid_info), - OPX_TID_VADDR(tid_info) + OPX_TID_LENGTH(tid_info), - OPX_TID_LENGTH(tid_info)); + (uint64_t)info->iov.iov_len, tid_info->tid_vaddr, + tid_info->tid_vaddr + tid_info->tid_length, + tid_info->tid_length); if (opx_tid_cache_full(cache)) { opx_tid_cache_flush_all(cache, true, true); @@ -1049,7 +1033,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, info->iov.iov_len, info->iov.iov_len); 
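A note on the opx_tid_cache_crte hunk above: the use-count assert previously ran before the NULL check on the entry returned from opx_mr_entry_alloc(), so a failed allocation would dereference a NULL pointer in debug builds before the -FI_ENOMEM path could run; the hunk moves the assert below the check. A minimal standalone sketch of the corrected ordering, with a simplified struct and entry_alloc_may_fail() as a hypothetical stand-in for the real allocator:

#include <assert.h>
#include <stddef.h>

struct mr_entry { int use_cnt; };

/* hypothetical stand-in for opx_mr_entry_alloc(); may return NULL */
static struct mr_entry *entry_alloc_may_fail(void) { return NULL; }

static int cache_create(struct mr_entry **entry)
{
	*entry = entry_alloc_may_fail();
	/* old order: assert((*entry)->use_cnt == 0) sat here, which
	 * dereferences a NULL pointer whenever the allocation fails */
	if (!*entry)
		return -1; /* the -FI_ENOMEM path in the real code */
	assert((*entry)->use_cnt == 0); /* safe: *entry is non-NULL */
	return 0;
}

int main(void)
{
	struct mr_entry *e;
	return cache_create(&e) == -1 ? 0 : 1;
}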
assert((*entry)->use_cnt == 1); opx_tid_dec_use_cnt(*entry);/* traceable */ - OPX_TID_NINFO(tid_info) = 0; /* error == no tid pairs */ + tid_info->ninfo = 0; /* error == no tid pairs */ OPX_DEBUG_EXIT((*entry), 2); return 0; //TODO - handle case for free } @@ -1077,7 +1061,7 @@ int opx_tid_cache_find(struct fi_opx_ep *opx_ep, goto in_use; } ret = OPX_ENTRY_FOUND; - } else if (!ofi_iov_within(&info->iov, &(*entry)->info.iov)) { + } else { if (opx_mr->opx_ep != opx_ep) { FI_DBG(fi_opx_global.prov, FI_LOG_MR,"OPX_ENTRY_IN_USE %p/%p\n",opx_mr? opx_mr->opx_ep:NULL, opx_ep); goto in_use; @@ -1255,7 +1239,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, struct fi_opx_ep *const opx_ep = opx_mr->opx_ep; FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "OPX_TID_IS_INVALID %u->1, (%p/%p) insert lru [%p - %p] (len: %zu,%#lX) use_cnt %x\n", - OPX_TID_IS_INVALID(tid_info), + tid_info->invalid, entry, entry->data, entry->info.iov.iov_base, (char*)entry->info.iov.iov_base + entry->info.iov.iov_len, @@ -1269,7 +1253,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, /* Hold the cache->lock across de-registering the TIDs */ pthread_mutex_lock(&tid_cache->lock); opx_deregister_tid_region(opx_ep, tid_info); - OPX_TID_INVALID(tid_info); /* prevent double deregister later */ + tid_info->invalid = 1; /* prevent double deregister later */ pthread_mutex_unlock(&tid_cache->lock); /* re-acquire mm_lock */ @@ -1280,7 +1264,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, if (use_cnt == 0) { OPX_DEBUG_UCNT(entry); FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "invalidate %u, invalid %u, node %p, (%p/%p) insert lru [%p - %p] (len: %zu,%#lX) use_cnt %x\n", - invalidate, OPX_TID_IS_INVALID(tid_info), entry->node, + invalidate, tid_info->invalid, entry->node, entry, entry->data, entry->info.iov.iov_base, (char*)entry->info.iov.iov_base + entry->info.iov.iov_len, @@ -1296,7 +1280,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, pthread_mutex_lock(&mm_lock); return 0; } - if(OPX_TID_IS_INVALID(tid_info)) { /* it's dead, not just "least recently used */ + if(tid_info->invalid) { /* it's dead, not just "least recently used */ FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "DEAD entry %p\n",entry); opx_mr_uncache_entry_storage(tid_cache, entry); dlist_insert_tail(&entry->list_entry, &tid_cache->dead_region_list); @@ -1674,7 +1658,7 @@ int opx_process_entry(struct fi_opx_ep *opx_ep, int find, found_tid_entry->tid_length) - *vaddr)); - */ + */ assert(inout_info->iov.iov_base == (void *)(input_tid_info->tid_vaddr)); assert(inout_info->iov.iov_base == (void *)*vaddr); const uint64_t adj = *length; @@ -1817,7 +1801,9 @@ int opx_process_entry(struct fi_opx_ep *opx_ep, int find, } int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, - const uint64_t tid_vaddr, const uint64_t tid_length) + const uint64_t tid_vaddr, const uint64_t tid_length, + const enum fi_hmem_iface tid_iface, + const uint64_t tid_device) { struct fi_opx_ep *opx_ep = params->opx_ep; struct opx_tid_domain *tid_domain = opx_ep->domain->tid_domain; @@ -1826,13 +1812,15 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, struct ofi_mr_info find_info = {0}; int first_tid_index = -1, last_tid_index = -1, page_offset_in_tid = -1; - assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_HFI1_TID_PAGESIZE)); - assert(tid_length == (tid_length & -(int64_t)OPX_HFI1_TID_PAGESIZE)); + assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); + assert(tid_length == 
(tid_length & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); pthread_mutex_lock(&mm_lock); find_info.iov.iov_base = (void *)tid_vaddr; find_info.iov.iov_len = tid_length; + find_info.iface = tid_iface; + find_info.device = tid_device; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY tid vaddr [%#lx - %#lx] , tid length %lu/%#lX\n", @@ -1874,7 +1862,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; /* opx_register_tid_region was done in add region, check result */ - if (OPX_TID_NINFO(cached_tid_entry) == 0) { /* failed */ + if (cached_tid_entry->ninfo == 0) { /* failed */ OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u return -FI_EFAULT\n", __func__, __LINE__); /*crte returns an entry even if tid update failed */ @@ -1892,11 +1880,11 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, } /* Copy the tid info to params list for further modifications */ - params->ntidpairs = OPX_TID_NPAIRS(cached_tid_entry); + params->ntidpairs = cached_tid_entry->npairs; assert(params->ntidpairs != 0); - memcpy(params->tidpairs, &OPX_TID_PAIR(cached_tid_entry, 0), - (OPX_TID_NPAIRS(cached_tid_entry) * - sizeof(OPX_TID_PAIR(cached_tid_entry, 0)))); + memcpy(params->tidpairs, &cached_tid_entry->pairs[0], + (cached_tid_entry->npairs * + sizeof(cached_tid_entry->pairs[0]))); params->tid_offset = 0; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "tid_offset %u/%#X\n", params->tid_offset, params->tid_offset); @@ -1907,7 +1895,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; assert(cached_tid_entry->tid_length != 0); - if (OPX_TID_IS_INVALID(cached_tid_entry)) { + if (cached_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -1918,10 +1906,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(cached_tid_entry), - OPX_TID_VADDR(cached_tid_entry) + OPX_TID_LENGTH(cached_tid_entry), - OPX_TID_LENGTH(cached_tid_entry), - OPX_TID_LENGTH(cached_tid_entry)); + cached_tid_entry->tid_vaddr, + cached_tid_entry->tid_vaddr + cached_tid_entry->tid_length, + cached_tid_entry->tid_length, + cached_tid_entry->tid_length); return -FI_EINVAL; } /* Entry was found. 
Our search is completely contained in this region */ @@ -1929,22 +1917,21 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, opx_tid_inc_use_cnt(entry); OPX_DEBUG_TIDS("REUSE FULL LIST", - OPX_TID_NPAIRS(cached_tid_entry), - &OPX_TID_PAIR(cached_tid_entry, 0)); + cached_tid_entry->npairs, + &cached_tid_entry->pairs[0]); opx_return_offset_for_new_cache_entry( (uint64_t)find_info.iov.iov_base, (uint64_t)find_info.iov.iov_len, cached_tid_entry, &first_tid_index, &page_offset_in_tid, &last_tid_index); OPX_DEBUG_TIDS("REUSE SUBSET LIST", (last_tid_index - first_tid_index + 1), - &OPX_TID_PAIR(cached_tid_entry, - first_tid_index)); + &cached_tid_entry->pairs[first_tid_index]); /* Copy the tid info to params list for further modifications */ params->ntidpairs = last_tid_index - first_tid_index + 1; assert(params->ntidpairs != 0); memcpy(params->tidpairs, - &OPX_TID_PAIR(cached_tid_entry, first_tid_index), + &cached_tid_entry->pairs[first_tid_index], params->ntidpairs * sizeof(params->tidpairs[0])); params->tid_offset = page_offset_in_tid * OPX_HFI1_TID_PAGESIZE; @@ -1960,7 +1947,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; assert(overlap_tid_entry->tid_length != 0); - if (OPX_TID_IS_INVALID(overlap_tid_entry)) { + if (overlap_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -1971,10 +1958,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(overlap_tid_entry), - OPX_TID_VADDR(overlap_tid_entry) + OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry)); + overlap_tid_entry->tid_vaddr, + overlap_tid_entry->tid_vaddr + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length); return -FI_EINVAL; } /* Partial/overlapping memory region found */ @@ -1984,10 +1971,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "overlap vaddr [%#lx - %#lx] , tid length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(overlap_tid_entry), - OPX_TID_VADDR(overlap_tid_entry) + OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry)); + overlap_tid_entry->tid_vaddr, + overlap_tid_entry->tid_vaddr + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length); uint64_t remaining_vaddr = tid_vaddr; int64_t remaining_length = tid_length; @@ -2113,7 +2100,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *) create_entry->data) ->tid_info; - if (OPX_TID_NINFO(create_tid_entry) == + if (create_tid_entry->ninfo == 0) { /* failed */ OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u return -FI_EFAULT\n", __func__, __LINE__); @@ -2165,9 +2152,9 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, /* Copy the tid info to params list for further modifications */ const uint32_t created_ntidpairs = - (int)OPX_TID_NPAIRS(create_tid_entry); + (int)create_tid_entry->npairs; const uint32_t *created_tidpairs = - &OPX_TID_PAIR(create_tid_entry, 0); + &create_tid_entry->pairs[0]; OPX_DEBUG_TIDS("Created tidpairs", created_ntidpairs, created_tidpairs); @@ -2186,7 +2173,7 @@ int 
opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *) entry->data) ->tid_info; - if (OPX_TID_IS_INVALID(found_tid_entry)) { + if (found_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -2196,10 +2183,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "vaddr [%#lx - %#lx] length %lu/%#lX " "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(found_tid_entry), - OPX_TID_VADDR(found_tid_entry) + OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry)); + found_tid_entry->tid_vaddr, + found_tid_entry->tid_vaddr + found_tid_entry->tid_length, + found_tid_entry->tid_length, + found_tid_entry->tid_length); return -FI_EINVAL; } opx_tid_inc_use_cnt(entry); @@ -2209,19 +2196,18 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found vaddr [%#lx - %#lx] %lu/%#lX\n", remaining_vaddr, remaining_vaddr + remaining_length, remaining_length, remaining_length, - OPX_TID_VADDR(found_tid_entry), - OPX_TID_VADDR(found_tid_entry) + - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry)); + found_tid_entry->tid_vaddr, + found_tid_entry->tid_vaddr + + found_tid_entry->tid_length, + found_tid_entry->tid_length, + found_tid_entry->tid_length); first_tid_index = 0; last_tid_index = - (int)OPX_TID_NPAIRS(found_tid_entry); + (int)found_tid_entry->npairs; page_offset_in_tid = 0; OPX_DEBUG_TIDS("OVERLAP REUSE FULL LIST", - OPX_TID_NPAIRS(found_tid_entry), - &OPX_TID_PAIR(found_tid_entry, - 0)); + found_tid_entry->npairs, + &found_tid_entry->pairs[0]); if ((found_tid_entry->tid_vaddr < remaining_vaddr) || (remaining_length < @@ -2238,12 +2224,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, OPX_DEBUG_TIDS( "OVERLAP REUSE SUBSET LIST", (last_tid_index - first_tid_index + 1), - &OPX_TID_PAIR(found_tid_entry, - first_tid_index)); + &found_tid_entry->pairs[first_tid_index]); const uint32_t found_ntidpairs = last_tid_index - first_tid_index + 1; - const uint32_t *found_tidpairs = &OPX_TID_PAIR( - found_tid_entry, first_tid_index); + const uint32_t *found_tidpairs = &found_tid_entry->pairs[first_tid_index]; assert(found_ntidpairs && found_tidpairs && params && params->tidpairs);
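To make the subset-reuse arithmetic above concrete: on a fully-contained hit, the code selects the first overlapping TID pair, the page offset within that pair, and the pair count, then sets params->tid_offset = page_offset_in_tid * OPX_HFI1_TID_PAGESIZE. Below is a small self-contained model of that index walk; it assumes 4KB pages and a plain array of per-pair page counts, whereas the real code packs LEN into each pair (read via FI_OPX_EXP_TID_GET) and derives the indices in opx_return_offset_for_new_cache_entry:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u /* models OPX_HFI1_TID_PAGESIZE */

/* pages[i] models the LEN (page count) of cached TID pair i */
static void find_subset(const uint32_t *pages, int npairs,
			uint64_t cached_vaddr, uint64_t want_vaddr,
			int *first_pair, int *page_offset_in_pair)
{
	uint64_t skip = (want_vaddr - cached_vaddr) / PAGE_SIZE;
	int i = 0;

	/* consume whole pairs until the remaining skip lands inside one */
	while (i < npairs && skip >= pages[i])
		skip -= pages[i++];
	*first_pair = i;
	*page_offset_in_pair = (int)skip;
}

int main(void)
{
	const uint32_t pages[] = { 8, 512, 4 }; /* page count per cached pair */
	const uint64_t base = 0x10000000;
	int first, off;

	find_subset(pages, 3, base, base + 10 * PAGE_SIZE, &first, &off);
	/* pair 0 covers 8 pages, so page 10 lands 2 pages into pair 1 */
	printf("first_tid_index=%d page_offset_in_tid=%d tid_offset=%u\n",
	       first, off, (unsigned)(off * PAGE_SIZE));
	return 0;
}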