From 0fbf4f40c72d0cfce15b1fe7f4fb6fd297f8acdd Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 2 May 2024 18:48:10 -0400 Subject: [PATCH] prov/lnx: Introducing the LINKx (lnx) provider The LINKx (lnx) provider offers a framework by which multiple providers can be linked together and presented as one provider to the application. This abstracts away the details of the traffic providers from the application. This iteration of the provider allows linking only two providers, shm and another provider, ex; CXI or RXM. The composite providers which are linked together need to support the peer infrastructure. Currently the provider supports creating a unique chain of fabric->domain->ep. It doesn't support creating multiple domains per fabric and multiple endpoints per domain. This will be addresses in followup updates to the provider. This iteration mainly focuses on supporting open MPI's MTL path which uses the libfabric tagged APIs. It has been tested with linking shm and cxi and shm and rxm. Future work will include: - Supporting 1:N of fabric:domain and domain:endpoint, etc - Hardware offload support - Arbitrary provider linking - Memory caching and registration - Full libfabric API support - Multi-Rail feature In order to use the lnx provider the user needs to: export FI_LNX_PROV_LINKS="shm+" ex: export FI_LNX_PROV_LINKS="shm+cxi" or export FI_LNX_PROV_LINKS="shm+tcp;ofi_rxm" This results in the lnx provider returning all available links to the application, which can then select the most appropriate one to use. Signed-off-by: Amir Shehata --- Makefile.am | 1 + configure.ac | 1 + include/ofi.h | 1 + include/ofi_prov.h | 11 + include/ofi_util.h | 15 +- include/rdma/fabric.h | 1 + include/rdma/fi_errno.h | 2 +- man/fi_lnx.7.md | 157 +++++ man/man7/fi_lnx.7 | 173 ++++++ prov/lnx/Makefile.include | 61 ++ prov/lnx/configure.m4 | 15 + prov/lnx/include/lnx.h | 477 +++++++++++++++ prov/lnx/src/lnx_av.c | 702 ++++++++++++++++++++++ prov/lnx/src/lnx_cq.c | 234 ++++++++ prov/lnx/src/lnx_domain.c | 581 ++++++++++++++++++ prov/lnx/src/lnx_ep.c | 1181 +++++++++++++++++++++++++++++++++++++ prov/lnx/src/lnx_init.c | 884 +++++++++++++++++++++++++++ prov/lnx/src/lnx_ops.c | 1036 ++++++++++++++++++++++++++++++++ prov/util/src/util_attr.c | 15 +- src/fabric.c | 20 +- src/fi_tostr.c | 1 + 21 files changed, 5562 insertions(+), 7 deletions(-) create mode 100644 man/fi_lnx.7.md create mode 100644 man/man7/fi_lnx.7 create mode 100644 prov/lnx/Makefile.include create mode 100644 prov/lnx/configure.m4 create mode 100644 prov/lnx/include/lnx.h create mode 100644 prov/lnx/src/lnx_av.c create mode 100644 prov/lnx/src/lnx_cq.c create mode 100644 prov/lnx/src/lnx_domain.c create mode 100644 prov/lnx/src/lnx_ep.c create mode 100644 prov/lnx/src/lnx_init.c create mode 100644 prov/lnx/src/lnx_ops.c diff --git a/Makefile.am b/Makefile.am index de2158c5fc1..204352db93b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -485,6 +485,7 @@ include prov/sm2/Makefile.include include prov/tcp/Makefile.include include prov/ucx/Makefile.include include prov/lpp/Makefile.include +include prov/lnx/Makefile.include include prov/hook/Makefile.include include prov/hook/perf/Makefile.include include prov/hook/trace/Makefile.include diff --git a/configure.ac b/configure.ac index 2b476f7f9d2..8e222be2e38 100644 --- a/configure.ac +++ b/configure.ac @@ -1125,6 +1125,7 @@ FI_PROVIDER_SETUP([hook_debug]) FI_PROVIDER_SETUP([hook_hmem]) FI_PROVIDER_SETUP([dmabuf_peer_mem]) FI_PROVIDER_SETUP([opx]) +FI_PROVIDER_SETUP([lnx]) FI_PROVIDER_FINI dnl 
Configure the .pc file FI_PROVIDER_SETUP_PC diff --git a/include/ofi.h b/include/ofi.h index 7592281c766..9661a7553d9 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -297,6 +297,7 @@ enum ofi_prov_type { OFI_PROV_UTIL, OFI_PROV_HOOK, OFI_PROV_OFFLOAD, + OFI_PROV_LNX, }; /* Restrict to size of struct fi_provider::context (struct fi_context) */ diff --git a/include/ofi_prov.h b/include/ofi_prov.h index ccb3fbf616d..7ffcda76268 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -211,6 +211,17 @@ MRAIL_INI ; # define MRAIL_INIT NULL #endif +#if (HAVE_LNX) && (HAVE_LNX_DL) +# define LNX_INI FI_EXT_INI +# define LNX_INIT NULL +#elif (HAVE_LNX) +# define LNX_INI INI_SIG(fi_lnx_ini) +# define LNX_INIT fi_lnx_ini() +LNX_INI ; +#else +# define LNX_INIT NULL +#endif + #if (HAVE_PERF) && (HAVE_PERF_DL) # define HOOK_PERF_INI FI_EXT_INI # define HOOK_PERF_INIT NULL diff --git a/include/ofi_util.h b/include/ofi_util.h index 911a69893ba..dda5c903e6e 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -1172,9 +1172,11 @@ void ofi_fabric_remove(struct util_fabric *fabric); * Utility Providers */ -#define OFI_NAME_DELIM ';' +#define OFI_NAME_LNX_DELIM ':' +#define OFI_NAME_DELIM ';' #define OFI_UTIL_PREFIX "ofi_" #define OFI_OFFLOAD_PREFIX "off_" +#define OFI_LNX "lnx" static inline int ofi_has_util_prefix(const char *str) { @@ -1186,6 +1188,16 @@ static inline int ofi_has_offload_prefix(const char *str) return !strncasecmp(str, OFI_OFFLOAD_PREFIX, strlen(OFI_OFFLOAD_PREFIX)); } +static inline int ofi_is_lnx(const char *str) +{ + return !strncasecmp(str, OFI_LNX, strlen(OFI_LNX)); +} + +static inline int ofi_is_linked(const char *str) +{ + return (strcasestr(str, OFI_LNX)) ? 1 : 0; +} + int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, const struct fi_info *util_hints, @@ -1201,6 +1213,7 @@ int ofi_get_core_info_fabric(const struct fi_provider *prov, struct fi_info **core_info); +char *ofi_strdup_link_append(const char *head, const char *tail); char *ofi_strdup_append(const char *head, const char *tail); // char *ofi_strdup_head(const char *str); // char *ofi_strdup_tail(const char *str); diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index c96d2c79ddc..366e6b0402b 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -340,6 +340,7 @@ enum { FI_PROTO_SM2, FI_PROTO_CXI_RNR, FI_PROTO_LPP, + FI_PROTO_LNX, }; enum { diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h index f5af121ec79..b90dbd5f42d 100644 --- a/include/rdma/fi_errno.h +++ b/include/rdma/fi_errno.h @@ -114,7 +114,7 @@ extern "C" { //#define FI_EADV EADV /* Advertise error */ //#define FI_ESRMNT ESRMNT /* Srmount error */ //#define FI_ECOMM ECOMM /* Communication error on send */ -//#define FI_EPROTO EPROTO /* Protocol error */ +#define FI_EPROTO EPROTO /* Protocol error */ //#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */ //#define FI_EDOTDOT EDOTDOT /* RFS specific error */ //#define FI_EBADMSG EBADMSG /* Not a data message */ diff --git a/man/fi_lnx.7.md b/man/fi_lnx.7.md new file mode 100644 index 00000000000..f52a08840dc --- /dev/null +++ b/man/fi_lnx.7.md @@ -0,0 +1,157 @@ +--- +layout: page +title: fi_lnx(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_lnx \- The LINKx (LNX) Provider + +# OVERVIEW + +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. 
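As an illustration of the flow described in the commit message above (set FI_LNX_PROV_LINKS, then let fi_getinfo() report the available links), here is a minimal, hedged sketch. Filtering on the "lnx" provider name and reading the link name (e.g. shm+cxi0) from the domain name are assumptions inferred from this patch (cf. lnx_get_link_by_dom() in lnx.h below), not a documented contract; list_lnx_links() is only an example name.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Sketch: enumerate the links lnx reports, e.g. shm+cxi0 and shm+cxi1
 * on a node with two CXI interfaces.
 */
int list_lnx_links(void)
{
	struct fi_info *hints, *info, *cur;
	int rc;

	/* assumed equivalent of: export FI_LNX_PROV_LINKS="shm+cxi" */
	setenv("FI_LNX_PROV_LINKS", "shm+cxi", 1);

	hints = fi_allocinfo();
	if (!hints)
		return -FI_ENOMEM;

	hints->ep_attr->type = FI_EP_RDM;		/* lnx only supports RDM */
	hints->caps = FI_TAGGED;			/* tagged ops are supported */
	hints->fabric_attr->prov_name = strdup("lnx");	/* assumed provider name */

	rc = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info);
	if (rc)
		goto out;

	/* one fi_info per link permutation; the link is assumed to be
	 * identified by the domain name
	 */
	for (cur = info; cur; cur = cur->next)
		printf("link: %s\n", cur->domain_attr->name);

	fi_freeinfo(info);
out:
	fi_freeinfo(hints);
	return rc;
}
```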
This provider uses +the libfabric peer infrastructure to aid in the use of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. Future releases of the provider will allow linking any +number of providers and provide the users with the ability to influence +the way the providers are utilized for traffic load. + +# SUPPORTED FEATURES + +This release contains an initial implementation of the LNX provider that +offers the following support: + +*Endpoint types* +: The provider supports only endpoint type *FI_EP_RDM*. + +*Endpoint capabilities* +: LNX is a passthrough layer on the send path. On the receive path LNX + utilizes the peer infrastructure to create shared receive queues (SRQ). + Receive requests are placed on the SRQ instead of on the core provider + receive queue. When the provider receives a message it queries the SRQ for + a match. If one is found the receive request is completed, otherwise the + message is placed on the LNX shared unexpected queue (SUQ). Further receive + requests query the SUQ for matches. + The first release of the provider only supports tagged and RMA operations. + Other message types will be supported in future releases. + +*Modes* +: The provider does not require the use of any mode bits. + +*Progress* +: LNX utilizes the peer infrastructure to provide a shared completion + queue. Each linked provider still needs to handle its own progress. + Completion events will however be placed on the shared completion queue, + which is passed to the application for access. + +*Address Format* +: LNX wraps the linked providers addresses in one common binary blob. + It does not alter or change the linked providers address format. It wraps + them into a LNX structure which is then flattened and returned to the + application. This is passed between different nodes. The LNX provider + is able to parse the flattened format and operate on the different links. + This assumes that nodes in the same group are all using the same version of + the provider with the exact same links. IE: you can't have one node linking + SHM+CXI while another linking SHM+RXM. + +*Message Operations* +: LNX is designed to intercept message operations such as fi_tsenddata + and based on specific criteria forward the operation to the appropriate + provider. For the first release, LNX will only support linking SHM + provider for intra-node traffic and another provider (ex: CXI) for inter + node traffic. LNX send operation looks at the destination and based on + whether the destination is local or remote it will select the provider to + forward the operation to. The receive case has been described earlier. + +*Using the Provider* +: In order to use the provider the user needs to set FI_LNX_PROV_LINKS + environment variable to the linked providers in the following format + shm+. This will allow LNX to report back to the application in the + fi_getinfo() call the different links which can be selected. Since there are + multiple domains per provider LNX reports a permutation of all the + possible links. For example if there are two CXI interfaces on the machine + LNX will report back shm+cxi0 and shm+cxi1. The application can then + select based on its own criteria the link it wishes to use. + The application typically uses the PCI information in the fi_info + structure to select the interface to use. 
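To make the PCI-based selection concrete, the following sketch walks the links returned by fi_getinfo() and reads the inter-node NIC's PCI address through the standard fid_nic attributes (see fi_nic(7)). That lnx populates info->nic is taken from the statement above; the nearest-to-core policy described next is left to the application, and pick_link_by_pci() is only an illustrative name.

```c
#include <stdio.h>
#include <rdma/fabric.h>

/* Print the PCI address of each link's inter-node NIC and let the
 * caller apply its own locality policy.
 */
struct fi_info *pick_link_by_pci(struct fi_info *links)
{
	struct fi_info *cur;
	struct fi_pci_attr *pci;

	for (cur = links; cur; cur = cur->next) {
		/* assumes lnx forwards the core provider's NIC info here */
		if (!cur->nic || !cur->nic->bus_attr ||
		    cur->nic->bus_attr->bus_type != FI_BUS_PCI)
			continue;

		pci = &cur->nic->bus_attr->attr.pci;
		printf("%s: %04x:%02x:%02x.%x\n", cur->domain_attr->name,
		       pci->domain_id, pci->bus_id, pci->device_id,
		       pci->function_id);

		/* a real application would compare this address against the
		 * PCI locality of the core it is bound to and return the
		 * closest link; here we simply return the first one found
		 */
		return cur;
	}

	return links;
}
```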
A common selection criteria is + the interface nearest the core the process is bound to. In order to make + this determination, the application requires the PCI information about the + interface. For this reason LNX forwards the PCI information for the + inter-node provider in the link to the application. + +# LIMITATIONS AND FUTURE WORK + +*Hardware Support* +: LNX doesn't support hardware offload; ex hardware tag matching. This is + an inherit limitation when using the peer infrastructure. Due to the use + of a shared receive queue which linked providers need to query when + a message is received, any hardware offload which requires sending the + receive buffers to the hardware directly will not work with the shared + receive queue. The shared receive queue provides two advantages; 1) reduce + memory usage, 2) coordinate the receive operations. For #2 this is needed + when receiving from FI_ADDR_UNSPEC. In this case both providers which are + part of the link can race to gain access to the receive buffer. It is + a future effort to determine a way to use hardware tag matching and other + hardware offload capability with LNX + +*Limited Linking* +: This release of the provider supports linking SHM provider for intra-node + operations and another provider which supports the FI_PEER capability for + inter-node operations. It is a future effort to expand to link any + multiple sets of providers. + +*Memory Registration* +: As part of the memory registration operation, varying hardware can perform + hardware specific steps such as memory pinning. Due to the fact that + memory registration APIs do not specify the source or destination + addresses it is not possible for LNX to determine which provider to + forward the memory registration to. LNX, therefore, registers the memory + with all linked providers. This might not be efficient and might have + unforeseen side effects. A better method is needed to support memory + registration. One option is to have memory registration cache in lnx + to avoid expensive operations. + +*Operation Types* +: This release of LNX supports tagged and RMA operations only. Future + releases will expand the support to other operation types. + +*Multi-Rail* +: Future design effort is being planned to support utilizing multiple interfaces + for traffic simultaneously. This can be over homogeneous interfaces or over + heterogeneous interfaces. + +# RUNTIME PARAMETERS + +The *LNX* provider checks for the following environment variables: + +*FI_LNX_PROV_LINKS* +: This environment variable is used to specify which providers to link. This + must be set in order for the LNX provider to return a list of fi_info + blocks in the fi_getinfo() call. The format which must be used is: + ++... As mentioned earlier currently LNX supports linking + only two providers the first of which is SHM followed by one other + provider for inter-node operations + +*FI_LNX_DISABLE_SHM* +: By default this environment variable is set to 0. However, the user can + set it to one and then the SHM provider will not be used. This can be + useful for debugging and performance analysis. The SHM provider will + naturally be used for all intra-node operations. Therefore, to test SHM in + isolation with LNX, the processes can be limited to the same node only. + +*FI_LNX_USE_SRQ* +: Shared Receive Queues are integral part of the peer infrastructure, but + they have the limitation of not using hardware offload, such as tag + matching. SRQ is needed to support the FI_ADDR_UNSPEC case. 
If the application + is sure this will never be the case, then it can turn off SRQ support by + setting this environment variable to 0. It is 1 by default. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), +[`fi_getinfo`(3)](fi_getinfo.3.html) diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 new file mode 100644 index 00000000000..b30876e24e4 --- /dev/null +++ b/man/man7/fi_lnx.7 @@ -0,0 +1,173 @@ +.\" Automatically generated by Pandoc 2.9.2.1 +.\" +.TH "fi_lnx" "7" "" "" "" +.hy +.PP +{% include JB/setup %} +.SH NAME +.PP +fi_lnx - The LINKx (lnx) Provider +.SH OVERVIEW +.PP +The lnx provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. +This provider uses the libfabric peer infrastructure to aid in the use +of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. +Future releases of the provider will allow linking any number of +providers and provide the users with the ability to influence the way +the providers are utilized for traffic load. +.SH SUPPORTED FEATURES +.PP +This release contains an initial implementation of the lnx provider +that offers the following support: +.TP +\f[I]Endpoint types\f[R] +The provider supports only endpoint type \f[I]FI_EP_RDM\f[R]. +.TP +\f[I]Endpoint capabilities\f[R] +lnx is a passthrough layer on the send path. +On the receive path lnx utilizes the peer infrastructure to create +shared receive queues (SRQ). +Receive requests are placed on the SRQ instead of on the core provider +receive queue. +When the provider receives a message it queries the SRQ for a match. +If one is found the receive request is completed, otherwise the message +is placed on the lnx shared unexpected queue (SUQ). +Further receive requests query the SUQ for matches. +The first release of the provider only supports tagged and RMA +operations. +Other message types will be supported in future releases. +.TP +\f[I]Modes\f[R] +The provider does not require the use of any mode bits. +.TP +\f[I]Progress\f[R] +lnx utilizes the peer infrastructure to provide a shared completion +queue. +Each linked provider still needs to handle its own progress. +Completion events will however be placed on the shared completion queue, +which is passed to the application for access. +.TP +\f[I]Address Format\f[R] +lnx wraps the linked providers addresses in one common binary blob. +It does not alter or change the linked providers address format. +It wraps them into a lnx structure which is then flattened and +returned to the application. +This is passed between different nodes. +The lnx provider is able to parse the flattened format and operate on +the different links. +This assumes that nodes in the same group are all using the same version +of the provider with the exact same links. +IE: you can\[cq]t have one node linking SHM+CXI while another linking +SHM+RXM. +.TP +\f[I]Message Operations\f[R] +lnx is designed to intercept message operations such as fi_tsenddata +and based on specific criteria forward the operation to the appropriate +provider. +For the first release, lnx will only support linking SHM provider for +intra-node traffic and another provider (ex: CXI) for inter node +traffic. +lnx send operation looks at the destination and based on whether the +destination is local or remote it will select the provider to forward +the operation to. 
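For illustration, the send-side dispatch just described might look roughly like the sketch below, built on helpers this patch adds in prov/lnx/include/lnx.h (lnx_get_peer(), lnx_select_send_pathway()). The provider's real tagged-send handler lives in prov/lnx/src/lnx_ops.c and is more involved; lnx_example_tsend() is not part of the patch.

```c
#include <sys/uio.h>
#include <rdma/fi_tagged.h>
#include "lnx.h"	/* provider-internal header added by this patch */

static ssize_t lnx_example_tsend(struct lnx_ep *lep, const void *buf,
				 size_t len, fi_addr_t dest, uint64_t tag,
				 void *context)
{
	struct iovec iov = { .iov_base = (void *) buf, .iov_len = len };
	struct local_prov_ep *cep;
	struct ofi_mr_entry *mre = NULL;
	struct lnx_peer *lp;
	fi_addr_t core_addr;
	void *mem_desc;
	int rc;

	lp = lnx_get_peer(lep->le_peer_tbl->lpt_entries, dest);
	if (!lp)
		return -FI_ENOSYS;

	/* shm endpoint for an on-node peer, the inter-node provider's
	 * endpoint otherwise; also translates to the core fi_addr_t
	 */
	rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep,
				     &core_addr, &iov, 1, &mre, &mem_desc,
				     NULL);
	if (rc)
		return rc;

	/* forward the tagged send to the chosen core endpoint */
	return fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag,
			context);
}
```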
+The receive case has been described earlier. +.TP +\f[I]Using the Provider\f[R] +In order to use the provider the user needs to set FI_LNX_PROV_LINKS +environment variable to the linked providers in the following format +shm+. +This will allow lnx to report back to the application in the +fi_getinfo() call the different links which can be selected. +Since there are multiple domains per provider lnx reports a +permutation of all the possible links. +For example if there are two CXI interfaces on the machine lnx will +report back shm+cxi0 and shm+cxi1. +The application can then select based on its own criteria the link it +wishes to use. +The application typically uses the PCI information in the fi_info +structure to select the interface to use. +A common selection criteria is the interface nearest the core the +process is bound to. +In order to make this determination, the application requires the PCI +information about the interface. +For this reason lnx forwards the PCI information for the inter-node +provider in the link to the application. +.SH LIMITATIONS AND FUTURE WORK +.TP +\f[I]Hardware Support\f[R] +lnx doesn\[cq]t support hardware offload; ex hardware tag matching. +This is an inherit limitation when using the peer infrastructure. +Due to the use of a shared receive queue which linked providers need to +query when a message is received, any hardware offload which requires +sending the receive buffers to the hardware directly will not work with +the shared receive queue. +The shared receive queue provides two advantages; 1) reduce memory +usage, 2) coordinate the receive operations. +For #2 this is needed when receiving from FI_ADDR_UNSPEC. +In this case both providers which are part of the link can race to gain +access to the receive buffer. +It is a future effort to determine a way to use hardware tag matching +and other hardware offload capability with lnx +.TP +\f[I]Limited Linking\f[R] +This release of the provider supports linking SHM provider for +intra-node operations and another provider which supports the FI_PEER +capability for inter-node operations. +It is a future effort to expand to link any multiple sets of providers. +.TP +\f[I]Memory Registration\f[R] +As part of the memory registration operation, varying hardware can +perform hardware specific steps such as memory pinning. +Due to the fact that memory registration APIs do not specify the source +or destination addresses it is not possible for lnx to determine which +provider to forward the memory registration to. +LINkx, therefore, registers the memory with all linked providers. +This might not be efficient and might have unforeseen side effects. +A better method is needed to support memory registration. +.TP +\f[I]Operation Types\f[R] +This release of lnx supports tagged and RMA operations only. +Future releases will expand the support to other operation types. +.TP +\f[I]Multi-Rail\f[R] +Future design effort is being planned to support utilizing multiple +interfaces for traffic simultaneously. +This can be over homogeneous interfaces or over heterogeneous +interfaces. +.SH RUNTIME PARAMETERS +.PP +The \f[I]lnx\f[R] provider checks for the following environment +variables: +.TP +\f[I]FI_LNX_PROV_LINKS\f[R] +This environment variable is used to specify which providers to link. +This must be set in order for the lnx provider to return a list of +fi_info blocks in the fi_getinfo() call. +The format which must be used is: ++\&... 
As mentioned earlier currently +lnx supports linking only two providers the first of which is SHM +followed by one other provider for inter-node operations +.TP +\f[I]FI_LNX_DISABLE_SHM\f[R] +By default this environment variable is set to 0. +However, the user can set it to one and then the SHM provider will not +be used. +This can be useful for debugging and performance analysis. +The SHM provider will naturally be used for all intra-node operations. +Therefore, to test SHM in isolation with lnx, the processes can be +limited to the same node only. +.TP +\f[I]FI_LNX_USE_SRQ\f[R] +Shared Receive Queues are integral part of the peer infrastructure, but +they have the limitation of not using hardware offload, such as tag +matching. +SRQ is needed to support the FI_ADDR_UNSPEC case. +If the application is sure this will never be the case, then it can turn +off SRQ support by setting this environment variable to 0. +It is 1 by default. +.SH SEE ALSO +.PP +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) diff --git a/prov/lnx/Makefile.include b/prov/lnx/Makefile.include new file mode 100644 index 00000000000..cd23049e845 --- /dev/null +++ b/prov/lnx/Makefile.include @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 ORNL. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + + +if HAVE_LNX +_lnx_files = \ + prov/lnx/src/lnx_cq.c \ + prov/lnx/src/lnx_domain.c \ + prov/lnx/src/lnx_ep.c \ + prov/lnx/src/lnx_init.c \ + prov/lnx/src/lnx_ops.c \ + prov/lnx/src/lnx_av.c + +_lnx_headers = \ + prov/lnx/include/lnx.h + +if HAVE_LNX_DL +pkglib_LTLIBRARIES += liblnx-fi.la +liblnx_fi_la_SOURCES = $(_lnx_files) $(_lnx_headers) +liblnx_fi_la_LIBADD = $(linkback) $(lnx_LIBS) +liblnx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +liblnx_fi_la_DEPENDENCIES = $(linkback) +else +src_libfabric_la_SOURCES += $(_lnx_files) $(_lnx_headers) +src_libfabric_la_CPPFLAGS += -I$(top_srcdir)/prov/lnx/include +endif + +prov_install_man_pages += man/man7/fi_lnx.7 + +endif HAVE_LNX + +prov_dist_man_pages += man/man7/fi_lnx.7 diff --git a/prov/lnx/configure.m4 b/prov/lnx/configure.m4 new file mode 100644 index 00000000000..737b62bc46d --- /dev/null +++ b/prov/lnx/configure.m4 @@ -0,0 +1,15 @@ +dnl Configury specific to the libfabric lnx provider + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_LNX_CONFIGURE],[ + # Determine if we can support the lnx provider + lnx_happy=0 + AS_IF([test x"$enable_lnx" != x"no"], [lnx_happy=1]) + AS_IF([test $lnx_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h new file mode 100644 index 00000000000..b40c9ea3eca --- /dev/null +++ b/prov/lnx/include/lnx.h @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef LNX_H +#define LNX_H + +#define LNX_DEF_AV_SIZE 1024 +#define LNX_MAX_LOCAL_EPS 16 +#define LNX_IOV_LIMIT 4 + +#define lnx_ep_rx_flags(lnx_ep) ((lnx_ep)->le_ep.rx_op_flags) + +struct local_prov_ep; + +struct lnx_match_attr { + fi_addr_t lm_addr; + uint64_t lm_tag; + uint64_t lm_ignore; + struct lnx_peer *lm_peer; + struct local_prov_ep *lm_cep; +}; + +struct lnx_peer_cq { + struct lnx_cq *lpc_shared_cq; + struct fid_peer_cq lpc_cq; + struct fid_cq *lpc_core_cq; +}; + +struct lnx_queue { + struct dlist_entry lq_queue; + dlist_func_t *lq_match_func; + ofi_spin_t lq_qlock; +}; + +struct lnx_qpair { + struct lnx_queue lqp_recvq; + struct lnx_queue lqp_unexq; +}; + +struct lnx_peer_srq { + struct lnx_qpair lps_trecv; + struct lnx_qpair lps_recv; +}; + +struct local_prov_ep { + struct dlist_entry entry; + bool lpe_local; + char lpe_fabric_name[FI_NAME_MAX]; + struct fid_fabric *lpe_fabric; + struct fid_domain *lpe_domain; + struct fid_ep *lpe_ep; + struct fid_ep **lpe_txc; + struct fid_ep **lpe_rxc; + struct fid_av *lpe_av; + struct lnx_peer_cq lpe_cq; + struct fi_info *lpe_fi_info; + struct fid_peer_srx lpe_srx; + struct ofi_bufpool *lpe_recv_bp; + ofi_spin_t lpe_bplock; + struct local_prov *lpe_parent; +}; + +struct lnx_rx_entry { + /* the entry which will be passed to the core provider */ + struct fi_peer_rx_entry rx_entry; + /* iovec to use to point to receive buffers */ + struct iovec rx_iov[LNX_IOV_LIMIT]; + /* desc array to be used to point to the descs passed by the user */ + void *rx_desc[LNX_IOV_LIMIT]; + /* peer we expect messages from. + * This is available if the receive request provided a source address. + * Otherwise it will be NULL + */ + struct lnx_peer *rx_peer; + /* local prov endpoint receiving the message if this entry is + * added to the SUQ + */ + struct local_prov_ep *rx_cep; + /* match information which will be given to us by the core provider */ + struct fi_peer_match_attr rx_match_info; + /* ignore bit passed in by the user */ + uint64_t rx_ignore; + /* which pool this rx_entry came from. It's either from the global + * pool or some core provider pool + */ + bool rx_global; +}; + +OFI_DECLARE_FREESTACK(struct lnx_rx_entry, lnx_recv_fs); + +struct local_prov { + struct dlist_entry lpv_entry; + char lpv_prov_name[FI_NAME_MAX]; + int lpv_ep_count; + struct dlist_entry lpv_prov_eps; +}; + +struct lnx_address_prov { + char lap_prov[FI_NAME_MAX]; + /* an array of addresses of size count. */ + /* entry 0 is shm if available */ + /* array can't be larger than LNX_MAX_LOCAL_EPS */ + int lap_addr_count; + /* size as specified by the provider */ + int lap_addr_size; + /* payload */ + char lap_addrs[]; +}; + +struct lnx_addresses { + /* used to determine if the address is node local or node remote */ + char la_hostname[FI_NAME_MAX]; + /* number of providers <= LNX_MAX_LOCAL_EPS */ + int la_prov_count; + struct lnx_address_prov la_addr_prov[]; +}; + +struct lnx_local2peer_map { + struct dlist_entry entry; + struct local_prov_ep *local_ep; + int addr_count; + fi_addr_t peer_addrs[LNX_MAX_LOCAL_EPS]; +}; + +struct lnx_peer_prov { + struct dlist_entry entry; + + /* provider name */ + char lpp_prov_name[FI_NAME_MAX]; + + uint64_t lpp_flags; + + /* pointer to the local endpoint information to be used for + * communication with this peer. + * + * If the peer is on-node, then lp_endpoints[0] = shm + * + * if peer is off-node, then there could be up to LNX_MAX_LOCAL_EPS + * local endpoints we can use to reach that peer. 
+ */ + struct local_prov *lpp_prov; + + /* each peer can be reached from any of the local provider endpoints + * on any of the addresses which are given to us. It's an N:N + * relationship + */ + struct dlist_entry lpp_map; +}; + +struct lnx_peer { + /* true if peer can be reached over shared memory, false otherwise */ + bool lp_local; + + /* Each provider that we can reach the peer on will have an entry + * below. Each entry will contain all the local provider endpoints we + * can reach the peer through, as well as all the peer addresses on that + * provider. + * + * We can potentially multi-rail between the interfaces on the same + * provider, both local and remote. + * + * Or we can multi-rail across different providers. Although this + * might be more complicated due to the differences in provider + * capabilities. + */ + struct lnx_peer_prov *lp_shm_prov; + struct dlist_entry lp_provs; +}; + +struct lnx_peer_table { + struct util_av lpt_av; + int lpt_max_count; + int lpt_count; + struct lnx_domain *lpt_domain; + /* an array of peer entries */ + struct lnx_peer **lpt_entries; +}; + +struct lnx_ctx { + struct dlist_entry ctx_head; + int ctx_idx; + struct lnx_ep *ctx_parent; + struct fid_ep ctx_ep; +}; + +struct lnx_ep { + struct util_ep le_ep; + struct dlist_entry le_tx_ctx; + struct dlist_entry le_rx_ctx; + struct lnx_domain *le_domain; + size_t le_fclass; + struct lnx_peer_table *le_peer_tbl; + struct lnx_peer_srq le_srq; +}; + +struct lnx_srx_context { + struct lnx_ep *srx_lep; + struct local_prov_ep *srx_cep; +}; + +struct lnx_mem_desc_prov { + struct local_prov *prov; + struct fid_mr *core_mr; +}; + +struct lnx_mem_desc { + struct lnx_mem_desc_prov desc[LNX_MAX_LOCAL_EPS]; + int desc_count; +}; + +struct lnx_mr { + struct ofi_mr mr; + struct lnx_mem_desc desc; +}; + +struct lnx_domain { + struct util_domain ld_domain; + struct lnx_fabric *ld_fabric; + bool ld_srx_supported; + struct ofi_mr_cache ld_mr_cache; +}; + +struct lnx_cq { + struct util_cq util_cq; + struct lnx_domain *lnx_domain; +}; + +struct lnx_fabric { + struct util_fabric util_fabric; + /* providers linked by this fabric */ + struct dlist_entry local_prov_table; + /* memory registration buffer pool */ + struct ofi_bufpool *mem_reg_bp; + /* shared memory provider used in this link */ + struct local_prov *shm_prov; + /* peers associated with this link */ + struct lnx_peer_table *lnx_peer_tbl; +}; + +extern struct util_prov lnx_util_prov; +extern struct fi_provider lnx_prov; +extern struct ofi_bufpool *global_recv_bp; +extern ofi_spin_t global_bplock; + +struct fi_info *lnx_get_link_by_dom(char *domain_name); + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info); + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); +int lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context); + +void lnx_fini(void); + +int lnx_fabric_close(struct fid *fid); + +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, 
void *context); + +int lnx_cq2ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags); + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_queue_msg(struct fi_peer_rx_entry *entry); +int lnx_queue_tag(struct fi_peer_rx_entry *entry); +void lnx_free_entry(struct fi_peer_rx_entry *entry); +void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); + +static inline struct lnx_peer * +lnx_get_peer(struct lnx_peer **peers, fi_addr_t addr) +{ + if (!peers || addr == FI_ADDR_UNSPEC) + return NULL; + + return peers[addr]; +} + +static inline +void lnx_get_core_desc(struct lnx_mem_desc *desc, void **mem_desc) +{ + if (desc && desc->desc[0].core_mr) { + if (mem_desc) + *mem_desc = desc->desc[0].core_mr->mem_desc; + return; + } + + *mem_desc = NULL; +} + +static inline +int lnx_create_mr(const struct iovec *iov, fi_addr_t addr, + struct lnx_domain *lnx_dom, struct ofi_mr_entry **mre) +{ + struct ofi_mr *mr; + struct fi_mr_attr attr = {}; + struct fi_mr_attr cur_abi_attr; + struct ofi_mr_info info = {}; + uint64_t flags; + int rc; + + attr.iov_count = 1; + attr.mr_iov = iov; + *mre = ofi_mr_cache_find(&lnx_dom->ld_mr_cache, &attr, 0); + if (*mre) { + mr = (struct ofi_mr *)(*mre)->data; + goto out; + } + + attr.iface = ofi_get_hmem_iface(iov->iov_base, + &attr.device.reserved, &flags); + info.iov = *iov; + info.iface = attr.iface; + rc = ofi_hmem_dev_register(attr.iface, iov->iov_base, iov->iov_len, + (uint64_t *) &attr.hmem_data); + if (rc) + return rc; + + rc = ofi_mr_cache_search(&lnx_dom->ld_mr_cache, &info, mre); + if (rc) { + ofi_hmem_dev_unregister(attr.iface, (uint64_t)attr.hmem_data); + return rc; + } + + mr = (struct ofi_mr *)(*mre)->data; + ofi_mr_update_attr(lnx_dom->ld_domain.fabric->fabric_fid.api_version, + lnx_dom->ld_domain.info_domain_caps, &attr, &cur_abi_attr, 0); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr.context; + mr->domain = &lnx_dom->ld_domain; + mr->flags = flags; + mr->iface = cur_abi_attr.iface; + mr->device = cur_abi_attr.device.reserved; + mr->hmem_data = cur_abi_attr.hmem_data; + mr->mr_fid.mem_desc = (void*) mr; + +out: + return FI_SUCCESS; +} + +static inline +int lnx_select_send_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc, uint64_t *rkey) +{ + int idx = 0; + int rc; + struct lnx_peer_prov *prov; + struct lnx_local2peer_map *lpm; + struct ofi_mr *mr = NULL; + + if (lp->lp_local) { + prov = lp->lp_shm_prov; + } else { + prov = dlist_first_entry_or_null( + &lp->lp_provs, struct lnx_peer_prov, entry); + idx = 1; + } + + /* TODO when we support multi-rail we can have multiple maps */ + lpm = dlist_first_entry_or_null(&prov->lpp_map, + struct lnx_local2peer_map, entry); + *addr = lpm->peer_addrs[0]; + + /* TODO this will need to be expanded to handle Multi-Rail. 
For now + * the assumption is that local peers can be reached on shm and remote + * peers have only one interface, hence indexing on 0 and 1 + * + * If we did memory registration, then we've already figured out the + * pathway + */ + if (desc && desc->desc[idx].core_mr) { + *cep = dlist_first_entry_or_null( + &desc->desc[idx].prov->lpv_prov_eps, + struct local_prov_ep, entry); + if (mem_desc) + *mem_desc = fi_mr_desc(desc->desc[idx].core_mr); + if (rkey) + *rkey = fi_mr_key(desc->desc[idx].core_mr); + return 0; + } + + *cep = lpm->local_ep; + if (mem_desc) + *mem_desc = NULL; + + if (!lp->lp_local || !mem_desc || (mem_desc && *mem_desc) || + !iov || (iov && iov->iov_base == NULL)) + return 0; + + /* Look up the address in the cache: + * - if it's found then use the cached fid_mr + * - This will include the iface, which is really all we need + * - if it's not then lookup the iface, create the fid_mr and + * cache it. + */ + rc = lnx_create_mr(iov, *addr, lnx_dom, mre); + if (!rc && mre) { + mr = (struct ofi_mr *)(*mre)->data; + *mem_desc = mr->mr_fid.mem_desc; + } + + return rc; +} + +static inline +int lnx_select_recv_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc) +{ + /* if the src address is FI_ADDR_UNSPEC, then we'll need to trigger + * all core providers to listen for a receive, since we don't know + * which one will endup getting the message. + * + * For each core provider we're tracking, trigger the recv operation + * on it. + * + * if the src address is specified then we just need to select and + * exact core endpoint to trigger the recv on. + */ + if (!lp) + return -FI_ENOSYS; + + return lnx_select_send_pathway(lp, lnx_dom, desc, cep, addr, iov, + iov_count, mre, mem_desc, NULL); +} + +#endif /* LNX_H */ diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c new file mode 100644 index 00000000000..4e6ac0bebaf --- /dev/null +++ b/prov/lnx/src/lnx_av.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static void lnx_free_peer(struct lnx_peer *lp) +{ + struct lnx_peer_prov *lpp; + struct dlist_entry *tmp, *tmp2; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container_safe(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry, tmp) { + dlist_foreach_container_safe(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry, tmp2) { + dlist_remove(&lpm->entry); + free(lpm); + } + dlist_remove(&lpp->entry); + free(lpp); + } + + free(lp); +} + +#if ENABLE_DEBUG +static void lnx_print_peer(int idx, struct lnx_peer *lp) +{ + int k; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: lnx_peer[%d] is %s\n", getpid(), idx, + (lp->lp_local) ? "local" : "remote"); + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: peer[%p] provider %s\n", getpid(), lpp, + lpp->lpp_prov_name); + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: peer has %d mapped addrs\n", + getpid(), lpm->addr_count); + for (k = 0; k < lpm->addr_count; k++) + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: addr = %lu\n", + getpid(), lpm->peer_addrs[k]); + } + } +} +#endif /* ENABLE_DEBUG */ + +static int lnx_peer_insert(struct lnx_peer_table *tbl, + struct lnx_peer *lp) +{ + int i; + + if (tbl->lpt_max_count == 0 || + tbl->lpt_count >= tbl->lpt_max_count) + return -FI_ENOENT; + + for (i = 0; i < tbl->lpt_max_count; i++) { + if (!tbl->lpt_entries[i]) { + tbl->lpt_entries[i] = lp; +#if ENABLE_DEBUG + lnx_print_peer(i, lp); +#endif + tbl->lpt_count++; + return i; + } + } + + return -FI_ENOENT; +} + +static int lnx_peer_av_remove(struct lnx_peer *lp) +{ + int rc, frc = 0; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + /* if this is a remote peer then we didn't insert its shm address + * into our local shm endpoint, so no need to remove it + */ + if (!strncasecmp(lpp->lpp_prov_name, "shm", 3) && + !lp->lp_local) + continue; + + /* remove these address from all local providers */ + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + if (lpm->addr_count > 0) { + rc = fi_av_remove(lpm->local_ep->lpe_av, lpm->peer_addrs, + lpm->addr_count, lpp->lpp_flags); + if (rc) + frc = rc; + } + } + } + + return frc; +} + +static int lnx_peer_remove(struct lnx_peer_table *tbl, int idx) +{ + struct lnx_peer *lp = tbl->lpt_entries[idx]; + int rc = 0; + + if (!lp) + return 0; + + rc = lnx_peer_av_remove(lp); + + tbl->lpt_entries[idx] = NULL; + tbl->lpt_count--; + + return rc; +} + +static int lnx_cleanup_avs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_av->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static inline void lnx_free_peer_tbl(struct lnx_peer_table *peer_tbl) +{ + free(peer_tbl->lpt_entries); + free(peer_tbl); +} + +int lnx_av_close(struct fid *fid) +{ + int rc; + struct local_prov *entry; + struct lnx_fabric *fabric; + struct 
lnx_peer_table *peer_tbl; + + peer_tbl = container_of(fid, struct lnx_peer_table, lpt_av.av_fid.fid); + fabric = peer_tbl->lpt_domain->ld_fabric; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_avs(entry); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close av for %s\n", + entry->lpv_prov_name); + } + } + + ofi_av_close_lightweight(&peer_tbl->lpt_av); + + free(peer_tbl); + + return 0; +} + +static struct fi_ops lnx_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int lnx_get_or_create_peer_prov(struct dlist_entry *prov_table, + struct lnx_peer *lp, char *prov_name, + struct lnx_peer_prov **lpp) +{ + bool shm = false; + struct local_prov *entry; + struct lnx_peer_prov *peer_prov; + + if (!strcmp(prov_name, "shm")) { + if (lp->lp_shm_prov) + return -FI_ENOENT; + shm = true; + goto insert_prov; + } + + /* check if we already have a peer provider */ + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, peer_prov, entry) { + if (!strncasecmp(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX)) { + *lpp = peer_prov; + return 0; + } + } + +insert_prov: + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) { + peer_prov = calloc(sizeof(*peer_prov), 1); + if (!peer_prov) + return -FI_ENOMEM; + + dlist_init(&peer_prov->entry); + dlist_init(&peer_prov->lpp_map); + + strncpy(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX); + + peer_prov->lpp_prov = entry; + + if (shm) + lp->lp_shm_prov = peer_prov; + else + dlist_insert_tail(&peer_prov->entry, &lp->lp_provs); + + *lpp = peer_prov; + return 0; + } + } + + return -FI_ENOENT; +} + +static inline struct lnx_address_prov * +next_prov(struct lnx_address_prov *prov) +{ + uint8_t *ptr; + + ptr = (uint8_t*) prov; + + ptr += (sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size)); + + return (struct lnx_address_prov*)ptr; +} + +static inline size_t +get_lnx_addresses_size(struct lnx_addresses *addrs) +{ + int i; + size_t s = sizeof(*addrs); + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + s += sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size); + prov = next_prov(prov); + } + + return s; +} + +static inline struct lnx_addresses * +next_peer(struct lnx_addresses *addrs) +{ + uint8_t *ptr; + + ptr = (uint8_t*)addrs + get_lnx_addresses_size(addrs); + + return (struct lnx_addresses *)ptr; +} + +static struct lnx_address_prov * +lnx_get_peer_shm_addr(struct lnx_addresses *addrs) +{ + int i; + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + if (!strcmp(prov->lap_prov, "shm")) + return prov; + prov = next_prov(prov); + } + + return NULL; +} + +static int is_local_addr(struct local_prov **shm_prov, struct lnx_addresses *la) +{ + int rc; + char hostname[FI_NAME_MAX]; + struct lnx_address_prov *lap_shm; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + lap_shm = lnx_get_peer_shm_addr(la); + if (!lap_shm) + return -FI_EOPNOTSUPP; + + /* Shared memory address not provided or not local*/ + if ((lap_shm->lap_addr_count == 0) || + strncasecmp(hostname, la->la_hostname, FI_NAME_MAX)) + return -FI_EOPNOTSUPP; + + /* badly formed address */ + if (*shm_prov && (lap_shm->lap_addr_count > 1 || + lap_shm->lap_addr_count < 0)) + return -FI_EPROTO; + + return 0; +} + +static void +lnx_update_msg_entries(struct lnx_qpair *qp, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_queue *q = &qp->lqp_unexq; + struct lnx_rx_entry *rx_entry; + struct dlist_entry *item; + + ofi_spin_lock(&q->lq_qlock); + dlist_foreach(&q->lq_queue, item) { + rx_entry = (struct lnx_rx_entry *) item; + if (rx_entry->rx_entry.addr == FI_ADDR_UNSPEC) + rx_entry->rx_entry.addr = get_addr(&rx_entry->rx_entry); + } + ofi_spin_unlock(&q->lq_qlock); +} + +void +lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_srx_context *ctxt; + + ctxt = (struct lnx_srx_context *) srx->ep_fid.fid.context; + + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_trecv, get_addr); + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_recv, get_addr); +} + +static int lnx_peer_map_addrs(struct dlist_entry *prov_table, + struct lnx_peer *lp, struct lnx_addresses *la, + uint64_t flags, void *context) +{ + int i, j, rc; + struct lnx_peer_prov *lpp; + struct lnx_address_prov *lap; + struct local_prov_ep *lpe; + struct dlist_entry *eps; + + lap = &la->la_addr_prov[0]; + + for (i = 0; i < la->la_prov_count; i++) { + if (lap->lap_addr_count > LNX_MAX_LOCAL_EPS) + return -FI_EPROTO; + + rc = lnx_get_or_create_peer_prov(prov_table, lp, lap->lap_prov, + &lpp); + if (rc) + return rc; + + lpp->lpp_flags = flags; + + eps = &lpp->lpp_prov->lpv_prov_eps; + dlist_foreach_container(eps, struct local_prov_ep, lpe, + entry) { + struct lnx_local2peer_map *lpm; + + /* if this is a remote peer, don't insert the shm address + * since we will never talk to that peer over shm + */ + if (!strncasecmp(lpe->lpe_fabric_name, "shm", 3) && + !lp->lp_local) + continue; + + lpm = calloc(sizeof(*lpm), 1); + if (!lpm) + return -FI_ENOMEM; + + dlist_init(&lpm->entry); + dlist_insert_tail(&lpm->entry, &lpp->lpp_map); + + lpm->local_ep = lpe; + lpm->addr_count = lap->lap_addr_count; + for (j = 0; j < LNX_MAX_LOCAL_EPS; j++) + lpm->peer_addrs[j] = FI_ADDR_NOTAVAIL; + /* fi_av_insert returns the number of addresses inserted */ + rc = fi_av_insert(lpe->lpe_av, (void*)lap->lap_addrs, + lap->lap_addr_count, + lpm->peer_addrs, flags, context); + if (rc < 0) + return rc; + + /* should only insert the number of addresses indicated */ + assert(rc == lap->lap_addr_count); + } + + lap = next_prov(lap); + } + + return 0; +} + +/* + * count: number of LNX addresses + * addr: an array of addresses + * fi_addr: an out array of fi_addr)t + * + * Each LNX address can have multiple core provider addresses + * Check the hostname provided in each address to see if it's the same as + * me. If so, then we'll use the SHM address if available. + * + * ASSUMPTION: fi_av_insert() is called exactly once per peer. + * We're not handling multiple av_inserts on the same peer. If that + * happens then we will create multiple peers entries. 
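 * Wire format of each entry in 'addr' (see struct lnx_addresses and
 * struct lnx_address_prov in lnx.h): la_hostname, used for the locality
 * check above, then la_prov_count, followed by one variable-size
 * lnx_address_prov block per provider (provider name, lap_addr_count,
 * lap_addr_size, and the packed core provider addresses). next_prov()
 * and next_peer() above walk this variable-size layout.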
+ */ +int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + int i, rc, idx; + int disable_shm = 0; + struct lnx_peer *lp; + struct dlist_entry *prov_table; + struct lnx_peer_table *peer_tbl; + struct lnx_addresses *la = (struct lnx_addresses *)addr; + + fi_param_get_bool(&lnx_prov, "disable_shm", &disable_shm); + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + prov_table = &peer_tbl->lpt_domain->ld_fabric->local_prov_table; + + /* each entry represents a separate peer */ + for (i = 0; i < count; i++) { + /* can't have more providers than LNX_MAX_LOCAL_EPS */ + if (la->la_prov_count >= LNX_MAX_LOCAL_EPS || + la->la_prov_count <= 0) + return -FI_EPROTO; + + /* this is a local peer */ + lp = calloc(sizeof(*lp), 1); + if (!lp) + return -FI_ENOMEM; + + dlist_init(&lp->lp_provs); + + rc = is_local_addr(&peer_tbl->lpt_domain->ld_fabric->shm_prov, + la); + if (!rc) { + lp->lp_local = !disable_shm; + } else if (rc == -FI_EOPNOTSUPP) { + lp->lp_local = false; + } else if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to identify address\n"); + return rc; + } + + rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context); + if (rc) { + free(lp); + return rc; + } + + idx = lnx_peer_insert(peer_tbl, lp); + if (idx == -1) { + rc = lnx_peer_av_remove(lp); + lnx_free_peer(lp); + FI_INFO(&lnx_prov, FI_LOG_CORE, + "Peer table size exceeded. Removed = %d\n", rc); + return -FI_ENOENT; + } + + fi_addr[i] = (fi_addr_t) idx; + + la = next_peer(la); + } + + return i; +} + +int lnx_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct lnx_peer_table *peer_tbl; + int frc = 0, rc, i; + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + + for (i = 0; i < count; i++) { + rc = lnx_peer_remove(peer_tbl, (int)fi_addr[i]); + if (rc) + frc = rc; + } + + return frc; +} + +static const char * +lnx_av_straddr(struct fid_av *av, const void *addr, + char *buf, size_t *len) +{ + /* TODO: implement */ + return NULL; +} + +static int +lnx_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + /* TODO: implement */ + return -FI_EOPNOTSUPP; +} + +static struct fi_ops_av lnx_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = lnx_av_insert, + .remove = lnx_av_remove, + .insertsvc = fi_no_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .lookup = lnx_av_lookup, + .straddr = lnx_av_straddr, +}; + +static void lnx_get_core_av_attr(struct local_prov_ep *ep, + struct fi_av_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->type = ep->lpe_fi_info->domain_attr->av_type; +} + +static int lnx_open_avs(struct local_prov *prov, struct fi_av_attr *attr, + void *context) +{ + int rc = 0; + struct local_prov_ep *ep; + struct fi_av_attr core_attr; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + lnx_get_core_av_attr(ep, &core_attr); + if (ep->lpe_local) + core_attr.count = ep->lpe_fi_info->domain_attr->ep_cnt; + else + core_attr.count = attr->count; + rc = fi_av_open(ep->lpe_domain, &core_attr, + &ep->lpe_av, context); + if (rc) + return rc; + } + + return 0; +} + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context) +{ + struct lnx_fabric *fabric; + struct lnx_domain *lnx_domain; + struct lnx_peer_table *peer_tbl; + struct local_prov *entry; + size_t table_sz = LNX_DEF_AV_SIZE; + int rc = 0; + + if (!attr) + return -FI_EINVAL; + + if (attr->name) + 
return -FI_ENOSYS; + + if (attr->type != FI_AV_UNSPEC && + attr->type != FI_AV_TABLE) + return -FI_ENOSYS; + + if (attr->type == FI_AV_UNSPEC) + attr->type = FI_AV_TABLE; + + peer_tbl = calloc(sizeof(*peer_tbl), 1); + if (!peer_tbl) + return -FI_ENOMEM; + + if (attr->count != 0) + table_sz = attr->count; + + peer_tbl->lpt_entries = + calloc(sizeof(struct lnx_peer *) * table_sz, 1); + if (!peer_tbl->lpt_entries) { + rc = -FI_ENOMEM; + goto failed; + } + + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + + rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, + &peer_tbl->lpt_av, context); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "failed to initialize AV: %d\n", rc); + goto failed; + } + + peer_tbl->lpt_max_count = table_sz; + peer_tbl->lpt_domain = lnx_domain; + peer_tbl->lpt_av.av_fid.fid.ops = &lnx_av_fi_ops; + peer_tbl->lpt_av.av_fid.ops = &lnx_av_ops; + + assert(fabric->lnx_peer_tbl == NULL); + + /* need this to handle memory registration vi fi_mr_regattr(). We need + * to be able to access the peer table to determine which endpoint + * we'll be using based on the source/destination address */ + fabric->lnx_peer_tbl = peer_tbl; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_avs(entry, attr, context); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close; + } + } + + *av = &peer_tbl->lpt_av.av_fid; + + return 0; + +close: + ofi_av_close_lightweight(&peer_tbl->lpt_av); +failed: + lnx_free_peer_tbl(peer_tbl); + return rc; +} + + diff --git a/prov/lnx/src/lnx_cq.c b/prov/lnx/src/lnx_cq.c new file mode 100644 index 00000000000..6aebc8f4c5a --- /dev/null +++ b/prov/lnx/src/lnx_cq.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +ssize_t lnx_peer_cq_write(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write(&lnx_cq->lpc_shared_cq->util_cq, context, + flags, len, buf, data, tag); + + return rc; +} + +ssize_t lnx_peer_cq_writeerr(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write_error(&lnx_cq->lpc_shared_cq->util_cq, err_entry); + + return rc; +} + +static int lnx_cleanup_cqs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_cq.lpc_core_cq->fid); + if (rc) + frc = rc; + ep->lpe_cq.lpc_core_cq = NULL; + } + + return frc; +} + +static int lnx_cq_close(struct fid *fid) +{ + int rc; + struct lnx_cq *lnx_cq; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(fid, struct lnx_cq, util_cq.cq_fid); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* close all the open core cqs */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_cqs(entry); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + return rc; + } + } + + rc = ofi_cq_cleanup(&lnx_cq->util_cq); + if (rc) + return rc; + + free(lnx_cq); + return 0; +} + +struct fi_ops_cq_owner lnx_cq_write = { + .size = sizeof(lnx_cq_write), + .write = lnx_peer_cq_write, + .writeerr = lnx_peer_cq_writeerr, +}; + +static struct fi_ops lnx_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static void lnx_cq_progress(struct util_cq *cq) +{ + struct lnx_cq *lnx_cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(cq, struct lnx_cq, util_cq); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* Kick the core provider endpoints to progress */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + fi_cq_read(ep->lpe_cq.lpc_core_cq, NULL, 0); + } +} + +static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct fi_cq_attr *attr) +{ + int rc; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table = + &cq->lnx_domain->ld_fabric->local_prov_table; + + /* tell the core providers to import my CQ */ + attr->flags |= FI_PEER; + + /* create all the core provider completion queues */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + struct fid_cq *core_cq; + struct fi_peer_cq_context cq_ctxt; + + ep->lpe_cq.lpc_shared_cq = cq; + ep->lpe_cq.lpc_cq.owner_ops = &lnx_cq_write; + + cq_ctxt.size = sizeof(cq_ctxt); + cq_ctxt.cq = &ep->lpe_cq.lpc_cq; + + /* pass my CQ 
into the open and get back the core's cq */ + rc = fi_cq_open(ep->lpe_domain, attr, &core_cq, &cq_ctxt); + if (rc) + return rc; + + /* before the fi_cq_open() returns the core provider should + * have called fi_export_fid() and got a pointer to the peer + * CQ which we have allocated for this core provider + */ + + ep->lpe_cq.lpc_core_cq = core_cq; + } + } + + return 0; +} + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct lnx_cq *lnx_cq; + struct lnx_domain *lnx_dom; + int rc; + + lnx_cq = calloc(1, sizeof(*lnx_cq)); + if (!lnx_cq) + return -FI_ENOMEM; + + /* this is going to be a standard CQ from the read side. From the + * write side, it'll use the peer_cq callbacks to write + */ + rc = ofi_cq_init(&lnx_prov, domain, attr, &lnx_cq->util_cq, + &lnx_cq_progress, context); + if (rc) + goto free; + + lnx_dom = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + + lnx_cq->lnx_domain = lnx_dom; + lnx_cq->util_cq.cq_fid.fid.ops = &lnx_cq_fi_ops; + (*cq_fid) = &lnx_cq->util_cq.cq_fid; + + /* open core CQs and tell them to import my CQ */ + rc = lnx_cq_open_core_prov(lnx_cq, attr); + + return rc; + +free: + free(lnx_cq); + return rc; +} diff --git a/prov/lnx/src/lnx_domain.c b/prov/lnx/src/lnx_domain.c new file mode 100644 index 00000000000..1d898319225 --- /dev/null +++ b/prov/lnx/src/lnx_domain.c @@ -0,0 +1,581 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static struct fi_ops_domain lnx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = lnx_av_open, + .cq_open = lnx_cq_open, + .endpoint = lnx_endpoint, + .scalable_ep = lnx_scalable_ep, + .cntr_open = fi_no_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, +}; + +static int lnx_cleanup_domains(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (!ep->lpe_domain) + continue; + rc = fi_close(&ep->lpe_domain->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static int lnx_domain_close(fid_t fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *domain; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + + /* close all the open core domains */ + dlist_foreach_container(&domain->ld_fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_domains(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + } + + ofi_mr_cache_cleanup(&domain->ld_mr_cache); + + rc = ofi_domain_close(&domain->ld_domain); + + free(domain); + + return rc; +} + +static int +lnx_mr_regattrs_all(struct local_prov *prov, const struct fi_mr_attr *attr, + uint64_t flags, struct lnx_mem_desc_prov *desc) +{ + int rc = 0; + struct local_prov_ep *ep; + + desc->prov = prov; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_regattr(ep->lpe_domain, attr, + flags, &desc->core_mr); + + /* TODO: SHM provider returns FI_ENOKEY if requested_key is the + * same as the previous call. Application, like OMPI, might not + * specify the requested key in fi_mr_attr, so for now ignore that + * error. + * We need a better way of handling this. + * if (rc == -FI_ENOKEY) + * rc = 0; + * I made a change in SHM to support FI_MR_PROV_KEY if set by the + * application. 
This tells ofi to generate its own requested_key + * for each fi_mr_regattr call + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_regattr() failed: %d\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + return rc; +} + +static int +lnx_mr_close_all(struct lnx_mem_desc *mem_desc) +{ + int i, rc, frc = 0; + struct fid_mr *mr; + + for (i = 0; i < mem_desc->desc_count; i++) { + mr = mem_desc->desc[i].core_mr; + if (!mr) + continue; + rc = fi_close(&mr->fid); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_close() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +int lnx_mr_close(struct fid *fid) +{ + struct lnx_mr *lnx_mr; + struct ofi_mr *mr; + int rc, frc = 0; + + mr = container_of(fid, struct ofi_mr, mr_fid.fid); + lnx_mr = container_of(mr, struct lnx_mr, mr); + + rc = lnx_mr_close_all(mr->mr_fid.mem_desc); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to complete Memory Deregistration\n"); + frc = rc; + } + + ofi_atomic_dec32(&mr->domain->ref); + + ofi_buf_free(lnx_mr); + + return frc; +} + +static int lnx_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int i, rc, frc = 0; + struct local_prov_ep *ep; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + dlist_foreach_container(&desc->prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_bind(cmr, &ep->lpe_ep->fid, flags); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s lnx_mr_bind() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + } + + return frc; +} + +static int lnx_mr_control(struct fid *fid, int command, void *arg) +{ + int i, rc, frc = 0; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + if (command != FI_ENABLE) + return -FI_ENOSYS; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + rc = fi_mr_enable(cmr); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s lnx_mr_control() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +static struct fi_ops lnx_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_mr_close, + .bind = lnx_mr_bind, + .control = lnx_mr_control, + .ops_open = fi_no_ops_open +}; + +static int +lnx_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) +{ + /* + * If the address is specified then use it to find out which + * domain to register the memory against. LNX can be managing + * multiple underlying core provider endpoints, I need to register the + * memory against the correct one. + * + * Once the domain is determined, I need to set the mr->mem_desc to + * point to a structure which contains my local endpoint I'll end up + * using (which is the same one that I registered the memory against) + * and the associate fid_mr which the core provider set for me. + * + * I return that to the application. 
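+ *
+ * As a rough illustration only (the variable names below are made up
+ * for the example, not part of this patch), the application-side flow
+ * being described is:
+ *
+ *	struct fi_mr_attr mr_attr = {
+ *		.mr_iov = &iov, .iov_count = 1,
+ *		.access = FI_SEND | FI_RECV,
+ *	};
+ *	fi_mr_regattr(lnx_domain, &mr_attr, 0, &mr);
+ *	fi_tsend(lnx_ep, iov.iov_base, iov.iov_len, fi_mr_desc(mr),
+ *		 dest_addr, tag, ctx);
+ *
+ * where the fid_mr returned by LNX carries one core fid_mr per linked
+ * provider in its mem_desc.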
+ *
+ * When the application calls back into the data operations API it'll
+ * pass the mr. I can then pull out a pointer to my local endpoint
+ * which I'll use in the data operation and pass it the correct mr.
+ *
+ * If the address is not provided, then I'll register the memory
+ * buffer against all my core domains, store those and return them to
+ * the user.
+ */
+
+	struct lnx_domain *domain;
+	struct lnx_fabric *fabric;
+	struct lnx_mr *lnx_mr = NULL;
+	struct ofi_mr *mr;
+	struct lnx_mem_desc *mem_desc;
+	struct local_prov *entry;
+	int rc = 0, i = 1;
+	bool shm = false;
+
+	if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0)
+		return -FI_EINVAL;
+
+	domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid);
+	fabric = domain->ld_fabric;
+
+	lnx_mr = ofi_buf_alloc(fabric->mem_reg_bp);
+	if (!lnx_mr) {
+		rc = -FI_ENOMEM;
+		goto fail;
+	}
+
+	mr = &lnx_mr->mr;
+	mem_desc = &lnx_mr->desc;
+
+	mr->mr_fid.fid.fclass = FI_CLASS_MR;
+	mr->mr_fid.fid.context = attr->context;
+	mr->mr_fid.fid.ops = &lnx_mr_fi_ops;
+	mr->mr_fid.mem_desc = mem_desc;
+	mr->domain = &domain->ld_domain;
+	mr->flags = flags;
+
+	/* TODO: What happens if the same piece of memory is registered
+	 * via multiple providers?
+	 * TODO 2: We need a better way to handle memory registration.
+	 * This is simply not very good. We need to have a peer interface
+	 * to memory registration.
+	 */
+	/* register against all domains */
+	dlist_foreach_container(&fabric->local_prov_table,
+				struct local_prov,
+				entry, lpv_entry) {
+		if (!strcmp(entry->lpv_prov_name, "shm"))
+			shm = true;
+		else
+			shm = false;
+		if (i >= LNX_MAX_LOCAL_EPS) {
+			FI_WARN(&lnx_prov, FI_LOG_CORE,
+				"Exceeded number of allowed memory registrations %s\n",
+				entry->lpv_prov_name);
+			rc = -FI_ENOSPC;
+			goto fail;
+		}
+		rc = lnx_mr_regattrs_all(entry, attr, flags,
+					 (shm) ?
&mem_desc->desc[0] : + &mem_desc->desc[i]); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to complete Memory Registration %s\n", + entry->lpv_prov_name); + goto fail; + } + if (!shm) + i++; + } + + mem_desc->desc_count = i; + if (shm) + mr->mr_fid.key = mem_desc->desc[0].core_mr->key; + else + mr->mr_fid.key = mem_desc->desc[1].core_mr->key; + *mr_fid = &mr->mr_fid; + ofi_atomic_inc32(&domain->ld_domain.ref); + + return 0; + +fail: + if (lnx_mr) + ofi_buf_free(lnx_mr); + return rc; +} + +static struct fi_ops lnx_domain_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_domain_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_mr lnx_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = fi_no_mr_reg, + .regv = fi_no_mr_regv, + .regattr = lnx_mr_regattr, +}; + +static int lnx_setup_core_domain(struct local_prov_ep *ep, struct fi_info *info) +{ + struct fi_info *fi, *itr; + + fi = lnx_get_link_by_dom(info->domain_attr->name); + if (!fi) + return -FI_ENODATA; + + for (itr = fi; itr; itr = itr->next) { + if (!strcmp(itr->fabric_attr->name, ep->lpe_fabric_name)) { + ep->lpe_fi_info = fi_dupinfo(itr); + return FI_SUCCESS; + } + } + + ep->lpe_fi_info = NULL; + + return -FI_ENOENT; +} + +static struct fi_ops_srx_owner lnx_srx_ops = { + .size = sizeof(struct fi_ops_srx_owner), + .get_msg = lnx_get_msg, + .get_tag = lnx_get_tag, + .queue_msg = lnx_queue_msg, + .queue_tag = lnx_queue_tag, + .free_entry = lnx_free_entry, + .foreach_unspec_addr = lnx_foreach_unspec_addr, +}; + +static int lnx_open_core_domains(struct local_prov *prov, + void *context, struct lnx_domain *lnx_domain, + struct fi_info *info) +{ + int rc; + struct local_prov_ep *ep; + struct fi_rx_attr attr = {0}; + struct fi_peer_srx_context peer_srx; + struct dlist_entry *tmp; + int srq_support = 1; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + attr.op_flags = FI_PEER; + peer_srx.size = sizeof(peer_srx); + + if (srq_support) + lnx_domain->ld_srx_supported = true; + else + lnx_domain->ld_srx_supported = false; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + /* the fi_info we setup when we created the fabric might not + * necessarily be the correct one. It'll have the same fabric + * information, since the fabric information is common among all + * the domains the provider manages. However at this point we need + * to get the fi_info that the application is requesting */ + rc = lnx_setup_core_domain(ep, info); + if (rc) + return rc; + + if (srq_support) { + /* special case for CXI provider. We need to turn off tag + * matching HW offload if we're going to support shared + * receive queues. + */ + if (strstr(ep->lpe_fabric_name, "cxi")) + setenv("FI_CXI_RX_MATCH_MODE", "software", 1); + } + + rc = fi_domain(ep->lpe_fabric, ep->lpe_fi_info, + &ep->lpe_domain, context); + + if (!rc && srq_support) { + ep->lpe_srx.owner_ops = &lnx_srx_ops; + peer_srx.srx = &ep->lpe_srx; + rc = fi_srx_context(ep->lpe_domain, &attr, NULL, &peer_srx); + } + + /* if one of the constituent endpoints doesn't support shared + * receive context, then fail, as we can't continue with this + * inconsistency + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s does not support shared" + " receive queues. 
Failing\n", ep->lpe_fabric_name); + return rc; + } + } + + return 0; +} + +static int lnx_addr_add_region_noop(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + return FI_SUCCESS; +} + +static void lnx_addr_del_region(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + struct ofi_mr *mr = (struct ofi_mr *)entry->data; + + ofi_hmem_dev_unregister(mr->iface, (uint64_t) mr->hmem_data); +} + +/* + * provider: shm+cxi:lnx + * fabric: ofi_lnx_fabric + * domain: shm+cxi3:ofi_lnx_domain + * version: 120.0 + * type: FI_EP_RDM + * protocol: FI_PROTO_LNX + * + * Parse out the provider name. It should be shm+ + * + * Create a fabric for shm and one for the other provider. + * + * When fi_domain() is called, we get the fi_info for the + * second provider, which we should've returned as part of the + * fi_getinfo() call. + */ +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *lnx_domain; + struct util_domain *lnx_domain_info; + struct lnx_fabric *lnx_fab = container_of(fabric, struct lnx_fabric, + util_fabric.fabric_fid); + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + + /* create a new entry for shm. + * Create its fabric. + * insert fabric in the global table + */ + rc = lnx_setup_core_fabrics(info->domain_attr->name, lnx_fab, context); + if (rc) + goto fail; + + rc = -FI_ENOMEM; + lnx_domain = calloc(sizeof(*lnx_domain), 1); + if (!lnx_domain) + goto fail; + + lnx_domain_info = &lnx_domain->ld_domain; + lnx_domain->ld_fabric = lnx_fab; + + rc = ofi_domain_init(fabric, info, lnx_domain_info, context, + OFI_LOCK_SPINLOCK); + if (rc) + goto fail; + + dlist_foreach_container(&lnx_domain->ld_fabric->local_prov_table, + struct local_prov, entry, lpv_entry) { + rc = lnx_open_core_domains(entry, context, lnx_domain, info); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close_domain; + } + } + + lnx_domain_info->domain_fid.fid.ops = &lnx_domain_fi_ops; + lnx_domain_info->domain_fid.ops = &lnx_domain_ops; + lnx_domain_info->domain_fid.mr = &lnx_mr_ops; + + lnx_domain->ld_mr_cache.add_region = lnx_addr_add_region_noop; + lnx_domain->ld_mr_cache.delete_region = lnx_addr_del_region; + lnx_domain->ld_mr_cache.entry_data_size = sizeof(struct ofi_mr); + rc = ofi_mr_cache_init(&lnx_domain->ld_domain, memory_monitors, + &lnx_domain->ld_mr_cache); + if (rc) + goto close_domain; + + *domain = &lnx_domain_info->domain_fid; + + return 0; + +close_domain: + lnx_domain_close(&(lnx_domain_info->domain_fid.fid)); +fail: + return rc; +} + diff --git a/prov/lnx/src/lnx_ep.c b/prov/lnx/src/lnx_ep.c new file mode 100644 index 00000000000..cd4b83d099f --- /dev/null +++ b/prov/lnx/src/lnx_ep.c @@ -0,0 +1,1181 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +extern struct fi_ops_cm lnx_cm_ops; +extern struct fi_ops_msg lnx_msg_ops; +extern struct fi_ops_tagged lnx_tagged_ops; +extern struct fi_ops_rma lnx_rma_ops; +extern struct fi_ops_atomic lnx_atomic_ops; + +static void lnx_init_ctx(struct fid_ep *ctx, size_t fclass); + +static int lnx_close_ceps(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + + if (ep->lpe_srx.ep_fid.fid.context) + free(ep->lpe_srx.ep_fid.fid.context); + + rc = fi_close(&ep->lpe_ep->fid); + if (rc) + frc = rc; + ofi_bufpool_destroy(ep->lpe_recv_bp); + } + + return frc; +} + +int lnx_ep_close(struct fid *fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_ep *ep; + struct lnx_fabric *fabric; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = ep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + lnx_close_ceps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close endpoint for %s\n", + entry->lpv_prov_name); + } + + ofi_endpoint_close(&ep->le_ep); + free(ep); + + return rc; +} + +static int lnx_enable_core_eps(struct lnx_ep *lep) +{ + int rc; + struct local_prov *entry; + struct local_prov_ep *ep; + int srq_support = 1; + struct lnx_fabric *fabric = lep->le_domain->ld_fabric; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (srq_support) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_srx.ep_fid.fid, 0); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, + "%s doesn't support SRX (%d)\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + rc = fi_enable(ep->lpe_ep); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_control(struct fid *fid, int command, void *arg) +{ + struct lnx_ep *ep; + int rc; + + ep = 
container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + + switch (command) { + case FI_ENABLE: + if (ep->le_fclass == FI_CLASS_EP && + ((ofi_needs_rx(ep->le_ep.caps) && !ep->le_ep.rx_cq) || + (ofi_needs_tx(ep->le_ep.caps) && !ep->le_ep.tx_cq))) + return -FI_ENOCQ; + if (!ep->le_peer_tbl) + return -FI_ENOAV; + rc = lnx_enable_core_eps(ep); + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +int lnx_cq_bind_core_prov(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct util_cq *cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + cq = container_of(bfid, struct util_cq, cq_fid.fid); + fabric = lep->le_domain->ld_fabric; + + rc = ofi_ep_bind_cq(&lep->le_ep, cq, flags); + if (rc) + return rc; + + /* bind the core providers to their respective CQs */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_cq.lpc_core_cq->fid, flags); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_bind_core_prov(struct lnx_fabric *fabric, uint64_t flags) +{ + struct local_prov *entry; + struct local_prov_ep *ep; + int rc; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, &ep->lpe_av->fid, flags); + if (rc) + return rc; + } + } + + return rc; +} + +static int +lnx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc = 0; + struct lnx_ep *ep; + struct lnx_peer_table *peer_tbl; + + switch (fid->fclass) { + case FI_CLASS_EP: /* Standard EP */ + case FI_CLASS_SEP: /* Scalable EP */ + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + + default: + return -FI_EINVAL; + } + + switch (bfid->fclass) { + case FI_CLASS_EQ: + return -FI_ENOSYS; + + case FI_CLASS_CQ: + rc = lnx_cq_bind_core_prov(fid, bfid, flags); + break; + + case FI_CLASS_CNTR: + return -FI_ENOSYS; + + case FI_CLASS_AV: + peer_tbl = container_of(bfid, struct lnx_peer_table, + lpt_av.av_fid.fid); + if (peer_tbl->lpt_domain != ep->le_domain) + return -FI_EINVAL; + ep->le_peer_tbl = peer_tbl; + /* forward the bind to the core provider endpoints */ + rc = lnx_ep_bind_core_prov(ep->le_domain->ld_fabric, flags); + break; + + case FI_CLASS_STX_CTX: /* shared TX context */ + return -FI_ENOSYS; + + case FI_CLASS_SRX_CTX: /* shared RX context */ + return -FI_ENOSYS; + + default: + return -FI_EINVAL; + } + + return rc; +} + +int lnx_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct local_prov *entry; + size_t size = sizeof(struct lnx_addresses); + /* initial location to put the address */ + char ep_addr[FI_NAME_MAX]; + char *tmp = NULL; + struct lnx_addresses *la; + struct lnx_address_prov *lap; + char hostname[FI_NAME_MAX]; + size_t prov_addrlen; + size_t addrlen_list[LNX_MAX_LOCAL_EPS]; + int rc, j = 0; + struct lnx_ep *lnx_ep; + struct lnx_fabric *fabric; + struct local_prov_ep *ep; + + lnx_ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lnx_ep->le_domain->ld_fabric; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
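+ *
+ * For reference, the address blob assembled below is laid out as
+ * follows (sketch only, derived from the structures used in this
+ * function):
+ *
+ *	struct lnx_addresses        la_hostname, la_prov_count
+ *	  struct lnx_address_prov   lap_prov, lap_addr_count, lap_addr_size
+ *	    raw core provider addresses (lap_addr_count entries)
+ *	  struct lnx_address_prov
+ *	    ...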
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + addrlen_list[0] = 0; + + /* calculate the size of the address */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + size += sizeof(struct lnx_address_prov); + prov_addrlen = 0; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_getname(&ep->lpe_ep->fid, (void*)ep_addr, &prov_addrlen); + if (rc == -FI_ETOOSMALL) { + size += prov_addrlen * entry->lpv_ep_count; + addrlen_list[j] = prov_addrlen; + j++; + break; + } else { + return -FI_EINVAL; + } + } + } + + if (!addr || *addrlen < size) { + *addrlen = size; + return -FI_ETOOSMALL; + } + + la = addr; + + lap = (struct lnx_address_prov *)((char*)la + sizeof(*la)); + + j = 0; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + memcpy(lap->lap_prov, entry->lpv_prov_name, FI_NAME_MAX - 1); + lap->lap_addr_count = entry->lpv_ep_count; + lap->lap_addr_size = addrlen_list[j]; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + tmp = (char*)lap + sizeof(*lap); + + rc = fi_getname(&ep->lpe_ep->fid, (void*)tmp, &addrlen_list[j]); + if (rc) + return rc; + + if (lap->lap_addr_size != addrlen_list[j]) + return -FI_EINVAL; + + tmp += addrlen_list[j]; + } + + lap = (struct lnx_address_prov *)tmp; + j++; + } + + la->la_prov_count = j; + memcpy(la->la_hostname, hostname, FI_NAME_MAX - 1); + + return 0; +} + +static ssize_t lnx_ep_cancel(fid_t fid, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + switch (fid->fclass) { + case FI_CLASS_EP: + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + case FI_CLASS_RX_CTX: + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + break; + case FI_CLASS_TX_CTX: + return -FI_ENOENT; + default: + return -FI_EINVAL; + } + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_cancel(&ep->lpe_ep->fid, context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + +static int lnx_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + int rc = 0; + struct lnx_ep *lep; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_setopt(&ep->lpe_ep->fid, level, optname, + optval, optlen); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + + +static int lnx_ep_txc(struct fid_ep *fid, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->tx_ctx_cnt) + continue; + + rc = fi_tx_context(ep->lpe_ep, index, attr, + &ep->lpe_txc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_TX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_tx_ctx); + /* set the callbacks for the transmit context */ + *tx_ep = &ctx->ctx_ep; + + return rc; +} + +static int lnx_ep_rxc(struct fid_ep *fid, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->rx_ctx_cnt) + continue; + + rc = fi_rx_context(ep->lpe_ep, index, attr, + &ep->lpe_rxc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_RX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_rx_ctx); + /* set the callbacks for the receive context */ + *rx_ep = &ctx->ctx_ep; + + return rc; +} + +struct fi_ops_ep lnx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + /* can't get opt, because there is no way to report multiple + * options for the different links */ + .getopt = fi_no_getopt, + .setopt = lnx_ep_setopt, + .tx_ctx = lnx_ep_txc, + .rx_ctx = lnx_ep_rxc, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +struct fi_ops lnx_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ep_close, + .bind = lnx_ep_bind, + .control = lnx_ep_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_cm lnx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = lnx_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, +}; + +static int lnx_open_eps(struct local_prov *prov, struct fi_info *info, + void *context, size_t fclass, struct lnx_ep *lep) +{ + int rc = 0; + struct local_prov_ep *ep; + struct dlist_entry *tmp; + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_srx_context *ctxt; + + ctxt = calloc(1, sizeof(*ctxt)); + if (!ctxt) + return -FI_ENOMEM; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + if (fclass == FI_CLASS_EP) { + rc = fi_endpoint(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } else { + /* update endpoint attributes with whatever is being + * passed from the application + */ + if (ep->lpe_fi_info && info) { + ep->lpe_fi_info->ep_attr->tx_ctx_cnt = + info->ep_attr->tx_ctx_cnt; + ep->lpe_fi_info->ep_attr->rx_ctx_cnt = + info->ep_attr->rx_ctx_cnt; + } + + ep->lpe_txc = calloc(info->ep_attr->tx_ctx_cnt, + sizeof(*ep->lpe_txc)); + ep->lpe_rxc = calloc(info->ep_attr->rx_ctx_cnt, + sizeof(*ep->lpe_rxc)); + if (!ep->lpe_txc || !ep->lpe_rxc) + return -FI_ENOMEM; + + rc = fi_scalable_ep(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } + if (rc) + return rc; + + ctxt->srx_lep = lep; + ctxt->srx_cep = ep; + + ep->lpe_srx.ep_fid.fid.context = ctxt; + ep->lpe_srx.ep_fid.fid.fclass = FI_CLASS_SRX_CTX; + ofi_spin_init(&ep->lpe_bplock); + /* create a buffer pool for the receive requests */ + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &ep->lpe_recv_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return -FI_ENOMEM; + } + } + + return 0; +} + +static void +lnx_ep_nosys_progress(struct util_ep *util_ep) +{ + assert(0); +} + +static inline int +match_tag(uint64_t tag, uint64_t match_tag, uint64_t ignore) +{ + return ((tag | ignore) == (match_tag | ignore)); +} + +static inline bool +lnx_addr_match(fi_addr_t addr1, fi_addr_t addr2) +{ + return (addr1 == addr2); +} + +static inline bool +lnx_search_addr_match(fi_addr_t cep_addr, struct lnx_peer_prov *lpp) +{ + struct lnx_local2peer_map *lpm; + fi_addr_t peer_addr; + int i; + + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, 
+ lpm, entry) { + for (i = 0; i < LNX_MAX_LOCAL_EPS; i++) { + peer_addr = lpm->peer_addrs[i]; + if (peer_addr == FI_ADDR_NOTAVAIL) + break; + if (lnx_addr_match(peer_addr, cep_addr)) + return true; + } + } + + return false; +} + +static int lnx_match_common(uint64_t tag1, uint64_t tag2, uint64_t ignore, + fi_addr_t cep_addr, fi_addr_t lnx_addr, struct lnx_peer *peer, + struct local_prov_ep *cep) +{ + struct lnx_peer_prov *lpp; + struct local_prov *lp; + bool tmatch; + + /* if a request has no address specified it'll match against any + * rx_entry with a matching tag + * or + * if an rx_entry has no address specified, it'll match against any + * request with a matching tag + * + * for non tagged messages tags will be set to TAG_ANY so they will + * always match and decision will be made on address only. + */ + tmatch = match_tag(tag1, tag2, ignore); + if (!tmatch) + return tmatch; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "tag1=%lx tag2=%lx ignore=%lx cep_addr=%lx lnx_addr=%lx tmatch=%d\n", + tag1, tag2, ignore, cep_addr, lnx_addr, tmatch); + + /* if we're requested to receive from any peer, then tag maching is + * enough. None tagged message will match irregardless. + */ + if (lnx_addr == FI_ADDR_UNSPEC) + return tmatch; + + /* if the address is specified, then we should have a peer and + * a receiving core endpoint and a provider parent + */ + assert(peer && cep && cep->lpe_parent); + + lp = cep->lpe_parent; + + /* if this is a shm core provider, then only go through lnx + * shm provider + */ + if (cep->lpe_local) + return lnx_search_addr_match(cep_addr, peer->lp_shm_prov); + + /* check if we already have a peer provider. + * A peer can receive messages from multiple providers, we need to + * find the provider which maps to the provider we're currently + * checking. The map looked up can have multiple addresses which + * we can receive from, so we need to check which one of those is + * the correct match. + * + * Note: we're trying to make this loop as efficient as possible, + * because it's executed on the message matching path, which is + * heavily hit. + * + * The theory is in most use cases: + * - There will be only two providers to check + * - Each provider will have 1 endpoint, and therefore only one map + * - Each peer will only have 1 address. + * + */ + dlist_foreach_container(&peer->lp_provs, + struct lnx_peer_prov, lpp, entry) { + if (lpp->lpp_prov == lp) + return lnx_search_addr_match(cep_addr, lpp); + } + + return false; +} + +static int lnx_match_unexq(struct dlist_entry *item, const void *args) +{ + /* this entry is placed on the SUQ via the lnx_get_tag() path + * and examined in the lnx_process_tag() path */ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + struct lnx_peer *peer = match_attr->lm_peer; + + /* entry refers to the unexpected message received + * entry->rx_entry.tag will be the tag of the message or TAG_UNSPEC + * otherwise + * + * entry->rx_entry.addr will be the address of the peer which sent the + * message or ADDR_UNSPEC if the core provider didn't do a reverse + * lookup. + * + * entry->rx_cep will be set to the core endpoint which received the + * message. + * + * match_attr is filled in by the lnx_process_tag() and contains + * information passed to us by the application + * + * match_attr->lm_peer is the peer looked up via the addr passed by + * the application to LNX. It is NULL if the addr is ADDR_UNSPEC. 
+ * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed by the application to LNX via the receive API. + * + * match_attr->lm_addr is the only significant if it's set to + * FI_ADDR_UNSPEC, otherwise it's not used in matching because it's + * the LNX level address and we need to compare the core level address. + */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + match_attr->lm_ignore, entry->rx_entry.addr, + match_attr->lm_addr, peer, entry->rx_cep); +} + +static int lnx_match_recvq(struct dlist_entry *item, const void *args) +{ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + /* this entry is placed on the recvq via the lnx_process_tag() path + * and examined in the lnx_get_tag() path */ + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + + /* entry refers to the receive request waiting for a message + * entry->rx_entry.tag is the tag passed in by the application. + * + * entry->rx_entry.addr is the address passed in by the application. + * This is the LNX level address. It's only significant if it's set + * to ADDR_UNSPEC. Otherwise, it has already been used to look up the + * peer. + * + * entry->rx_cep is always NULL in this case, as this will only be + * known when the message is received. + * + * entry->rx_peer is the LNX peer looked up if a valid address is + * given by the application, otherwise it's NULL. + * + * match_attr information is filled by the lnx_get_tag() callback and + * contains information passed to us by the core endpoint receiving + * the message. + * + * match_attr->rx_peer is not significant because at the lnx_get_tag() + * call there isn't enough information to find what the peer is. + * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed up by the core endpoint receiving the message. + * + * match_attr->lm_addr is the address of the peer which sent the + * message. Set if the core endpoint has done a reverse lookup, + * otherwise set to ADDR_UNSPEC. + * + * match_attr->lm_cep is the core endpoint which received the message. 
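+ *
+ * In short, for both the receive and unexpected queues a candidate
+ * matches when (summarizing lnx_match_common()):
+ *   - the tags are equal under the ignore mask, and
+ *   - the LNX-level address is FI_ADDR_UNSPEC, or the core-level
+ *     source address maps back to the same LNX peer/provider that
+ *     the receive was posted against (local peers are only checked
+ *     against the shm provider).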
+ */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + entry->rx_ignore, match_attr->lm_addr, + entry->rx_entry.addr, entry->rx_peer, match_attr->lm_cep); +} + +static inline int +lnx_init_queue(struct lnx_queue *q, dlist_func_t *match_func) +{ + int rc; + + rc = ofi_spin_init(&q->lq_qlock); + if (rc) + return rc; + + dlist_init(&q->lq_queue); + + q->lq_match_func = match_func; + + return 0; +} + +static inline int +lnx_init_qpair(struct lnx_qpair *qpair, dlist_func_t *recvq_match_func, + dlist_func_t *unexq_match_func) +{ + int rc = 0; + + rc = lnx_init_queue(&qpair->lqp_recvq, recvq_match_func); + if (rc) + goto out; + rc = lnx_init_queue(&qpair->lqp_unexq, unexq_match_func); + if (rc) + goto out; + +out: + return rc; +} + +static inline int +lnx_init_srq(struct lnx_peer_srq *srq) +{ + int rc; + + rc = lnx_init_qpair(&srq->lps_trecv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + rc = lnx_init_qpair(&srq->lps_recv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + + return rc; +} + +static int lnx_get_ctx(struct local_prov_ep *ep, size_t fclass, + struct fid_ep ***ep_ctx, size_t *size) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + *ep_ctx = ep->lpe_rxc; + *size = ep->lpe_fi_info->ep_attr->rx_ctx_cnt; + break; + case FI_CLASS_TX_CTX: + *ep_ctx = ep->lpe_txc; + *size = ep->lpe_fi_info->ep_attr->tx_ctx_cnt; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static void lnx_close_ep_ctx(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return; + + for (i = 0; i < size; i++) { + rc = fi_close(&ep_ctx[i]->fid); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close ep context %lu with %d\n", + fclass, rc); + } +} + +static int lnx_ctx_close(struct fid *fid) +{ + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + lnx_close_ep_ctx(ep, fid->fclass); + } + + return FI_SUCCESS; +} + +static int lnx_ctx_bind_cq(struct local_prov_ep *ep, size_t fclass, + struct fid *bfid, uint64_t flags) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_ep_bind(ep_ctx[i], bfid, flags); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (bfid->fclass == FI_CLASS_CQ) + /* bind the context to the shared cq */ + rc 
= lnx_ctx_bind_cq(ep, fid->fclass, + &ep->lpe_cq.lpc_core_cq->fid, + flags); + else + return -FI_ENOSYS; + + if (rc) + return rc; + } + } + + return FI_SUCCESS; +} + +static int +lnx_enable_ctx_eps(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_enable(ep_ctx[i]); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_control(struct fid *fid, int command, void *arg) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + switch (command) { + case FI_ENABLE: + if (!lep->le_peer_tbl) + return -FI_ENOAV; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = lnx_enable_ctx_eps(ep, fid->fclass); + if (rc) + return rc; + } + } + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +static struct fi_ops lnx_ctx_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ctx_close, + .bind = lnx_ctx_bind, + .control = lnx_ctx_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_ep lnx_ctx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static void +lnx_init_ctx(struct fid_ep *ctx, size_t fclass) +{ + ctx->fid.fclass = fclass; + ctx->fid.ops = &lnx_ctx_ops; + ctx->ops = &lnx_ctx_ep_ops; + ctx->msg = &lnx_msg_ops; + ctx->tagged = &lnx_tagged_ops; + ctx->rma = &lnx_rma_ops; + ctx->atomic = &lnx_atomic_ops; +} + +static int +lnx_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct lnx_ep **out_ep, void *context, size_t fclass) +{ + int rc; + struct lnx_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + uint64_t mr_mode; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + ep->le_fclass = fclass; + ep->le_ep.ep_fid.fid.fclass = fclass; + + ep->le_ep.ep_fid.fid.ops = &lnx_ep_fi_ops; + ep->le_ep.ep_fid.ops = &lnx_ep_ops; + ep->le_ep.ep_fid.cm = &lnx_cm_ops; + ep->le_ep.ep_fid.msg = &lnx_msg_ops; + ep->le_ep.ep_fid.tagged = &lnx_tagged_ops; + ep->le_ep.ep_fid.rma = &lnx_rma_ops; + ep->le_ep.ep_fid.atomic = &lnx_atomic_ops; + ep->le_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + lnx_init_srq(&ep->le_srq); + + dlist_init(&ep->le_rx_ctx); + dlist_init(&ep->le_tx_ctx); + + fabric = ep->le_domain->ld_fabric; + + /* create all the core provider endpoints */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_eps(entry, info, context, fclass, ep); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to create ep for %s\n", + entry->lpv_prov_name); + goto fail; + } + } + + mr_mode = lnx_util_prov.info->domain_attr->mr_mode; + lnx_util_prov.info->domain_attr->mr_mode = 0; + rc = ofi_endpoint_init(domain, (const struct util_prov *)&lnx_util_prov, + (struct fi_info *)lnx_util_prov.info, &ep->le_ep, + context, 
lnx_ep_nosys_progress); + if (rc) + goto fail; + + lnx_util_prov.info->domain_attr->mr_mode = mr_mode; + *out_ep = ep; + + return 0; + +fail: + free(ep); + return rc; +} + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_SEP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_EP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + + diff --git a/prov/lnx/src/lnx_init.c b/prov/lnx/src/lnx_init.c new file mode 100644 index 00000000000..94c7a7e14cd --- /dev/null +++ b/prov/lnx/src/lnx_init.c @@ -0,0 +1,884 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +#define LNX_PASSTHRU_TX_OP_FLAGS (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE) +#define LNX_PASSTHRU_RX_OP_FLAGS (0ULL) +#define LNX_TX_OP_FLAGS (FI_INJECT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_TRANSMIT_COMPLETE) +#define LNX_RX_OP_FLAGS (FI_COMPLETION) + +ofi_spin_t global_bplock; +struct ofi_bufpool *global_recv_bp = NULL; + +struct util_fabric lnx_fabric_info; + +struct fi_tx_attr lnx_tx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_TX_OP_FLAGS | LNX_TX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .inject_size = SIZE_MAX, + .size = SIZE_MAX, + .iov_limit = LNX_IOV_LIMIT, + .rma_iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_rx_attr lnx_rx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_RX_OP_FLAGS | LNX_RX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .total_buffered_recv = 0, + .size = 1024, + .iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_ep_attr lnx_ep_attr = { + .type = FI_EP_UNSPEC, + .protocol = FI_PROTO_LNX, + .protocol_version = 1, + .max_msg_size = SIZE_MAX, + .msg_prefix_size = SIZE_MAX, + .max_order_raw_size = SIZE_MAX, + .max_order_war_size = SIZE_MAX, + .max_order_waw_size = SIZE_MAX, + .mem_tag_format = FI_TAG_GENERIC, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .auth_key = NULL, + .auth_key_size = 0, +}; + +struct fi_domain_attr lnx_domain_attr = { + .name = "ofi_lnx_domain", + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_RAW, + .mr_key_size = SIZE_MAX, + .cq_data_size = SIZE_MAX, + .cq_cnt = SIZE_MAX, + .ep_cnt = SIZE_MAX, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .max_ep_tx_ctx = SIZE_MAX, + .max_ep_rx_ctx = SIZE_MAX, + .max_ep_stx_ctx = SIZE_MAX, + .max_ep_srx_ctx = SIZE_MAX, + .cntr_cnt = SIZE_MAX, + .mr_iov_limit = SIZE_MAX, + .caps = ~0x0ULL, + .auth_key_size = SIZE_MAX, + .max_err_data = SIZE_MAX, + .mr_cnt = SIZE_MAX, +}; + +struct fi_fabric_attr lnx_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .name = "ofi_lnx_fabric", +}; + +struct fi_info lnx_info = { + .caps = ~0x0ULL, + .tx_attr = &lnx_tx_attr, + .rx_attr = &lnx_rx_attr, + .ep_attr = &lnx_ep_attr, + .domain_attr = &lnx_domain_attr, + .fabric_attr = &lnx_fabric_attr +}; + +static struct fi_ops lnx_fabric_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_fabric lnx_fabric_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = lnx_domain_open, + .passive_ep = fi_no_passive_ep, + .eq_open = fi_no_eq_open, + .wait_open = fi_no_wait_open, + .trywait = fi_no_trywait +}; + +struct fi_provider lnx_prov = { + .name = OFI_LNX, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = lnx_getinfo, + .fabric = lnx_fabric, + .cleanup = lnx_fini +}; + +struct util_prov lnx_util_prov = { + .prov = &lnx_prov, + .info = &lnx_info, + .flags = 0 +}; + +/* + * For the fi_getinfo() -> fi_fabric() -> fi_domain() path, we need to + * keep track of the fi_info in case we need them later on when linking in + * the fi_fabric() function. 
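+ *
+ * Three global lists are used for this (declared below):
+ *   - lnx_fi_info_cache: the per core provider fi_info lists returned
+ *     by fi_getinfo()
+ *   - lnx_links: every candidate link (combination of core providers)
+ *     generated from that cache
+ *   - lnx_links_meta: maps each fi_info handed back to the application
+ *     to the link it was generated from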
+ * + * This cache gets cleared after we use the ones we need, or when the + * library exists, if LNX is never used. + */ +struct dlist_entry lnx_fi_info_cache; +/* this is a list of all possible links */ +struct dlist_entry lnx_links; +struct dlist_entry lnx_links_meta; + +struct lnx_fi_cache_entry { + struct dlist_entry entry; + struct fi_info *fi; +}; + +struct lnx_fi_info_meta { + struct dlist_entry entry; + struct fi_info *lnx_rep; + struct fi_info *lnx_link; +}; + +static int lnx_get_cache_meta(struct dlist_entry *head, int *size) +{ + int num_prov = 0; + struct dlist_entry *e; + + dlist_foreach(head, e) + num_prov++; + + *size = num_prov; + + return FI_SUCCESS; +} + +static void lnx_free_meta(void) +{ + struct lnx_fi_info_meta *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry, tmp) { + dlist_remove(&e->entry); + free(e); + } +} + +static void lnx_free_info_cache(struct dlist_entry *head, bool meta) +{ + struct lnx_fi_cache_entry *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct lnx_fi_cache_entry, e, + entry, tmp) { + fi_freeinfo(e->fi); + dlist_remove(&e->entry); + free(e); + } + + if (meta) + lnx_free_meta(); +} + +static int lnx_cache_info(struct dlist_entry *head, + struct fi_info *info) +{ + struct lnx_fi_cache_entry *e = calloc(1, sizeof(*e)); + + if (!e) + return -FI_ENOMEM; + dlist_init(&e->entry); + e->fi = info; + + dlist_insert_tail(&e->entry, head); + + return 0; +} + +struct fi_info * +lnx_get_link_by_dom(char *domain_name) +{ + struct fi_info *info; + struct lnx_fi_info_meta *e; + + dlist_foreach_container(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry) { + info = e->lnx_rep; + if (info && info->domain_attr) { + if (!strcmp(domain_name, + info->domain_attr->name)) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Found %s\n", + info->fabric_attr->prov_name); + return e->lnx_link; + } + } + } + + return NULL; +} + +static void lnx_insert_tail(struct fi_info *head, struct fi_info *item) +{ + struct fi_info *itr = head; + + while (itr->next) + itr = itr->next; + itr->next = item; +} + +static void lnx_remove_tail(struct fi_info **head) +{ + struct fi_info *itr = *head, *prev = NULL; + + while (itr->next) { + prev = itr; + itr = itr->next; + } + + if (prev) + prev->next = NULL; + else + *head = NULL; + free(itr); +} + +static struct fi_info *lnx_dupinfo_list(struct fi_info *l) +{ + struct fi_info *itr, *new, *prev = NULL, *head = NULL; + + for (itr = l; itr; itr = itr->next) { + new = fi_dupinfo(itr); + if (!new) { + if (head) + fi_freeinfo(head); + return NULL; + } + + if (!head) + head = new; + + if (prev) { + prev->next = new; + prev = new; + } else { + prev = new; + } + } + + return head; +} + +static int gen_links_rec(struct dlist_entry *current, struct dlist_entry *head, + struct dlist_entry *result, struct fi_info *l, + int depth, int target_depth) +{ + int rc; + struct fi_info *itr; + struct fi_info *fi_copy, *dup; + struct lnx_fi_cache_entry *e, *new; + + while(current->next != head) { + e = container_of(current->next, struct lnx_fi_cache_entry, entry); + for (itr = e->fi; itr; itr = itr->next) { + fi_copy = fi_dupinfo(itr); + if (l) { + lnx_insert_tail(l, fi_copy); + } else { + l = fi_copy; + } + if (current->next->next == head && + depth == target_depth) { + dup = lnx_dupinfo_list(l); + if (!dup) + return -FI_ENOMEM; + new = calloc(1, sizeof(*new)); + if (!new) + return -FI_ENOMEM; + new->fi = dup; + dlist_init(&new->entry); + dlist_insert_tail(&new->entry, result); + } + rc = 
gen_links_rec(current->next, head, result, l, + depth+1, target_depth); + lnx_remove_tail(&l); + if (rc) + return rc; + } + current = current->next; + } + + return FI_SUCCESS; +} + +static int gen_links(struct dlist_entry *head, struct dlist_entry *result, + int target_depth) +{ + return gen_links_rec(head, head, result, NULL, 1, target_depth); +} + +static int lnx_form_info(struct fi_info *fi, struct fi_info **out) +{ + int size_prov = 0, size_dom = 0, rc = FI_SUCCESS; + struct lnx_fi_info_meta *meta = NULL; + char *lnx_prov, *lnx_dom, *s; + struct fi_info *itr, *r = NULL; + bool copy = false; + uint64_t min_inject_size = SIZE_MAX; + + for (itr = fi; itr; itr = itr->next) { + size_prov += strlen(itr->fabric_attr->prov_name)+1; + size_dom += strlen(itr->domain_attr->name)+1; + if (itr->tx_attr && itr->tx_attr->inject_size < min_inject_size) + min_inject_size = itr->tx_attr->inject_size; + } + + lnx_dom = calloc(size_dom, sizeof(char)); + lnx_prov = calloc(size_prov, sizeof(char)); + if (!lnx_prov || !lnx_dom) + return -FI_ENOMEM; + + for (itr = fi; itr; itr = itr->next) { + strcat(lnx_prov, itr->fabric_attr->prov_name); + strcat(lnx_dom, itr->domain_attr->name); + if (itr->next) { + strcat(lnx_dom, "+"); + strcat(lnx_prov, "+"); + } + if (!strncmp(itr->fabric_attr->prov_name, "shm", 3)) + continue; + + if (!copy) { + meta = calloc(1, sizeof(*meta)); + r = fi_dupinfo(itr); + if (!r || !meta) { + rc = -FI_ENOMEM; + goto fail; + } + meta->lnx_rep = r; + meta->lnx_link = fi; + if (r->tx_attr) + r->tx_attr->inject_size = min_inject_size; + dlist_init(&meta->entry); + dlist_insert_tail(&meta->entry, &lnx_links_meta); + copy = true; + } + } + + if (!r) { + rc = -FI_ENODATA; + goto fail; + } + + free(r->fabric_attr->prov_name); + free(r->fabric_attr->name); + free(r->domain_attr->name); + + r->fabric_attr->name = NULL; + r->domain_attr->name = NULL; + r->fabric_attr->prov_name = lnx_prov; + + if (asprintf(&s, "%s", lnx_info.fabric_attr->name) < 0) + goto fail; + r->fabric_attr->name = s; + + if (asprintf(&s, "%s:%s", lnx_dom, lnx_info.domain_attr->name) < 0) + goto fail; + r->domain_attr->name = s; + free(lnx_dom); + + *out = r; + return FI_SUCCESS; + +fail: + if (meta) + free(meta); + if (r) + fi_freeinfo(r); + free(lnx_dom); + return rc; +} + +static int lnx_generate_info(struct fi_info **info) +{ + struct fi_info *fi = NULL, *head = NULL, *prev = NULL; + struct lnx_fi_cache_entry *e; + int rc, size; + + /* we need at least 2 providers to link */ + rc = lnx_get_cache_meta(&lnx_fi_info_cache, &size); + if (rc || size < 2) + return -FI_ENODATA; + + rc = gen_links(&lnx_fi_info_cache, &lnx_links, size); + if (rc) + return rc; + + /* + * 1. Iterate over the links and create a linked list of fi_infos + * each fi_info in the list represents one of the links + * 2. Have metadata associated with each fi_info to refer back to + * an entry in the lnx_links cache. + * 3. When the application selects one of these fi_infos, we can + * then find the appropriate link in the cache and be able to + * create the underlying core providers correctly. 
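+	 *
+	 *    As an illustration (names are examples only): linking shm and
+	 *    cxi yields fi_info entries whose prov_name carries the linked
+	 *    providers joined with '+' (e.g. "shm+cxi") and whose
+	 *    domain_attr->name is built by lnx_form_info() as
+	 *    "<linked domains>:<lnx domain>". lnx_get_link_by_dom() later
+	 *    uses that domain name to map the fi_info selected by the
+	 *    application back to the cached link.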
+ */ + dlist_foreach_container(&lnx_links, struct lnx_fi_cache_entry, e, + entry) { + rc = lnx_form_info(e->fi, &fi); + if (rc) + goto err; + + if (prev) { + prev->next = fi; + prev = fi; + } else { + prev = fi; + head = fi; + } + } + + *info = head; + + return FI_SUCCESS; + +err: + if (fi) + fi_freeinfo(fi); + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + return -FI_ENODATA; +} + +int lnx_getinfo_helper(uint32_t version, char *prov, struct fi_info *lnx_hints) +{ + int rc; + char *orig_prov_name = NULL; + struct fi_info *core_info; + uint64_t caps, mr_mode; + bool shm = false; + + caps = lnx_hints->caps; + mr_mode = lnx_hints->domain_attr->mr_mode; + + if (lnx_hints->fabric_attr->prov_name) { + orig_prov_name = lnx_hints->fabric_attr->prov_name; + lnx_hints->fabric_attr->prov_name = NULL; + } + + lnx_hints->fabric_attr->prov_name = prov; + if (!strncmp(prov, "shm", 3)) { + shm = true; + /* make sure we get the correct shm provider */ + lnx_hints->caps &= ~(FI_REMOTE_COMM | FI_LOCAL_COMM); + lnx_hints->caps |= FI_HMEM; + lnx_hints->domain_attr->mr_mode |= (FI_MR_VIRT_ADDR | FI_MR_HMEM + | FI_MR_PROV_KEY); + } + rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_INTERNAL, + lnx_hints, &core_info); + + lnx_hints->fabric_attr->prov_name = orig_prov_name; + if (rc) + return rc; + + if (shm) { + lnx_hints->caps = caps; + lnx_hints->domain_attr->mr_mode = mr_mode; + } + + rc = lnx_cache_info(&lnx_fi_info_cache, core_info); + + return rc; +} + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int rc; + struct fi_info *lnx_hints; + char *linked_provs, *linked_provs_cp, *token, *exclude = NULL; + + rc = fi_param_get_str(&lnx_prov, "prov_links", + &linked_provs); + if (rc) + return rc; + + if (strstr(linked_provs, "lnx")) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Can't specify the lnx provider as part of the link: %s\n", + linked_provs); + return -FI_EINVAL; + } + + linked_provs_cp = strdup(linked_provs); + if (!linked_provs_cp) + return -FI_ENOMEM; + + /* The assumption is that the entire series of + * lnx_getinfo()->lnx_fabric()->lnx_domain()->lnx_endpoint() are + * going to be called before another lnx_getinfo() is called again. + * Based on this assumption, we will free the cache whenever + * lnx_getinfo() is called + */ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + /* If the hints are not provided then we endup with a new block */ + lnx_hints = fi_dupinfo(hints); + if (!lnx_hints) + return -FI_ENOMEM; + + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, lnx_prov.name); + if (rc) + return rc; + + /* get the providers which support peer functionality. These are + * the only ones we can link*/ + lnx_hints->caps |= FI_PEER; + + token = strtok(linked_provs_cp, "+"); + while (token) { + lnx_getinfo_helper(version, token, lnx_hints); + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, token); + if (rc) + goto free_hints; + token = strtok(NULL, "+"); + } + free(linked_provs_cp); + + /* Generate the lnx info which represents all possible combination + * of domains which are to be linked. 
+	 */
+	rc = lnx_generate_info(info);
+
+free_hints:
+	free(exclude);
+	fi_freeinfo(lnx_hints);
+	return rc;
+}
+
+static struct local_prov *
+lnx_get_local_prov(struct dlist_entry *prov_table, char *prov_name)
+{
+	struct local_prov *entry;
+
+	/* find the provider entry matching this name, if one already exists */
+	dlist_foreach_container(prov_table, struct local_prov,
+				entry, lpv_entry) {
+		if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX))
+			return entry;
+	}
+
+	return NULL;
+}
+
+static int
+lnx_add_ep_to_prov(struct local_prov *prov, struct local_prov_ep *ep)
+{
+	dlist_insert_tail(&ep->entry, &prov->lpv_prov_eps);
+	ep->lpe_parent = prov;
+	prov->lpv_ep_count++;
+
+	return FI_SUCCESS;
+}
+
+static int
+lnx_setup_core_prov(struct fi_info *info, struct dlist_entry *prov_table,
+		    struct local_prov **shm_prov, void *context)
+{
+	int rc = -FI_EINVAL;
+	struct local_prov_ep *ep = NULL;
+	struct local_prov *lprov, *new_lprov = NULL;
+
+	ep = calloc(1, sizeof(*ep));
+	if (!ep)
+		return -FI_ENOMEM;
+
+	new_lprov = calloc(1, sizeof(*new_lprov));
+	if (!new_lprov)
+		goto free_entry;
+
+	dlist_init(&new_lprov->lpv_prov_eps);
+
+	rc = fi_fabric(info->fabric_attr, &ep->lpe_fabric, context);
+	if (rc)
+		goto free_all;
+
+	ep->lpe_fi_info = info;
+	strncpy(ep->lpe_fabric_name, info->fabric_attr->name,
+		FI_NAME_MAX - 1);
+
+	lprov = lnx_get_local_prov(prov_table, info->fabric_attr->prov_name);
+	if (!lprov) {
+		lprov = new_lprov;
+		new_lprov = NULL;
+		strncpy(lprov->lpv_prov_name, info->fabric_attr->prov_name,
+			FI_NAME_MAX - 1);
+	} else {
+		free(new_lprov);
+		new_lprov = NULL;
+	}
+
+	/* indicate that this fabric can be used for on-node communication */
+	if (!strncasecmp(lprov->lpv_prov_name, "shm", 3)) {
+		*shm_prov = lprov;
+		ep->lpe_local = true;
+	}
+
+	dlist_init(&ep->entry);
+	rc = lnx_add_ep_to_prov(lprov, ep);
+	if (rc)
+		goto free_all;
+
+	dlist_insert_after(&lprov->lpv_entry, prov_table);
+
+	return 0;
+
+free_all:
+	if (new_lprov)
+		free(new_lprov);
+free_entry:
+	if (ep)
+		free(ep);
+
+	return rc;
+}
+
+int
+lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab,
+		       void *context)
+{
+	int rc;
+	struct fi_info *link, *itr;
+
+	link = lnx_get_link_by_dom(name);
+	if (!link)
+		return -FI_ENODATA;
+
+	for (itr = link; itr; itr = itr->next) {
+		rc = lnx_setup_core_prov(itr, &lnx_fab->local_prov_table,
+					 &lnx_fab->shm_prov, context);
+		if (rc)
+			return rc;
+	}
+
+	return FI_SUCCESS;
+}
+
+int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
+	       void *context)
+{
+	struct ofi_bufpool_attr bp_attrs = {};
+	struct lnx_fabric *lnx_fab;
+	int rc;
+
+	lnx_fab = calloc(1, sizeof(*lnx_fab));
+	if (!lnx_fab)
+		return -FI_ENOMEM;
+
+	bp_attrs.size = sizeof(struct lnx_mr);
+	bp_attrs.alignment = 8;
+	bp_attrs.max_cnt = UINT32_MAX;
+	bp_attrs.chunk_cnt = 64;
+	bp_attrs.flags = OFI_BUFPOOL_NO_TRACK;
+	rc = ofi_bufpool_create_attr(&bp_attrs, &lnx_fab->mem_reg_bp);
+	if (rc) {
+		FI_WARN(&lnx_prov, FI_LOG_FABRIC,
+			"Failed to create memory registration buffer pool\n");
+		free(lnx_fab);
+		return -FI_ENOMEM;
+	}
+
+	/* initialize the provider table */
+	dlist_init(&lnx_fab->local_prov_table);
+
+	rc = ofi_fabric_init(&lnx_prov, lnx_info.fabric_attr,
+			     lnx_info.fabric_attr,
+			     &lnx_fab->util_fabric, context);
+	if (rc)
+		goto fail;
+
+	lnx_fab->util_fabric.fabric_fid.fid.ops = &lnx_fabric_fi_ops;
+	lnx_fab->util_fabric.fabric_fid.ops = &lnx_fabric_ops;
+	*fabric = &lnx_fab->util_fabric.fabric_fid;
+
+	return 0;
+
+fail:
+	ofi_bufpool_destroy(lnx_fab->mem_reg_bp);
+	free(lnx_fab);
+	return rc;
+}
+
+void lnx_fini(void)
+{
+	lnx_free_info_cache(&lnx_fi_info_cache, false);
+	
lnx_free_info_cache(&lnx_links, true); + ofi_bufpool_destroy(global_recv_bp); +} + +static int lnx_free_ep(struct local_prov *prov, struct local_prov_ep *ep) +{ + int rc; + + if (!prov || !ep) + return FI_SUCCESS; + + rc = fi_close(&ep->lpe_fabric->fid); + fi_freeinfo(ep->lpe_fi_info); + free(ep); + prov->lpv_ep_count--; + + if (prov->lpv_ep_count == 0) + dlist_remove(&prov->lpv_entry); + + return rc; +} + +static int lnx_free_eps(struct local_prov *prov) +{ + int rc, frc = 0; + struct dlist_entry *tmp; + struct local_prov_ep *ep; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + dlist_remove(&ep->entry); + rc = lnx_free_ep(prov, ep); + if (rc) + frc = rc; + } + + return frc; +} + +int lnx_fabric_close(struct fid *fid) +{ + int rc = 0; + struct util_fabric *fabric; + struct lnx_fabric *lnx_fab; + struct local_prov *entry; + struct dlist_entry *tmp; + + fabric = container_of(fid, struct util_fabric, fabric_fid.fid); + lnx_fab = container_of(fabric, struct lnx_fabric, util_fabric); + + /* close all the open core fabrics */ + dlist_foreach_container_safe(&lnx_fab->local_prov_table, + struct local_prov, entry, lpv_entry, tmp) { + dlist_remove(&entry->lpv_entry); + rc = lnx_free_eps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close provider %s\n", + entry->lpv_prov_name); + + free(entry); + } + + /* free mr registration pool */ + ofi_bufpool_destroy(lnx_fab->mem_reg_bp); + + rc = ofi_fabric_close(fabric); + + return rc; +} + +void ofi_link_fini(void) +{ + lnx_prov.cleanup(); +} + +LNX_INI +{ + struct ofi_bufpool_attr bp_attrs = {}; + int ret; + + fi_param_define(&lnx_prov, "prov_links", FI_PARAM_STRING, + "Specify which providers LNX will link together. Format: " + "++...+. EX: shm+cxi"); + + fi_param_define(&lnx_prov, "disable_shm", FI_PARAM_BOOL, + "Turn off SHM support. Defaults to 0"); + + fi_param_define(&lnx_prov, "use_srq", FI_PARAM_BOOL, + "Turns shared receive queue support on and off. By default it is on. " + "When SRQ is turned on some Hardware offload capability will not " + "work. EX: Hardware Tag matching"); + + dlist_init(&lnx_fi_info_cache); + dlist_init(&lnx_links); + dlist_init(&lnx_links_meta); + + if (!global_recv_bp) { + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + ret = ofi_bufpool_create_attr(&bp_attrs, &global_recv_bp); + if (ret) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return NULL; + } + ofi_spin_init(&global_bplock); + } + + return &lnx_prov; +} diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c new file mode 100644 index 00000000000..3750e27f2a6 --- /dev/null +++ b/prov/lnx/src/lnx_ops.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "ofi_lock.h" +#include "rdma/fi_ext.h" +#include "ofi_iov.h" +#include "lnx.h" + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + return -FI_ENOSYS; +} + +int lnx_queue_msg(struct fi_peer_rx_entry *entry) +{ + return -FI_ENOSYS; +} + +void lnx_free_entry(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *) entry; + ofi_spin_t *bplock; + + if (rx_entry->rx_global) + bplock = &global_bplock; + else + bplock = &rx_entry->rx_cep->lpe_bplock; + + ofi_spin_lock(bplock); + ofi_buf_free(rx_entry); + ofi_spin_unlock(bplock); +} + +static struct lnx_ep *lnx_get_lep(struct fid_ep *ep, struct lnx_ctx **ctx) +{ + struct lnx_ep *lep; + + if (ctx) + *ctx = NULL; + + switch (ep->fid.fclass) { + case FI_CLASS_RX_CTX: + case FI_CLASS_TX_CTX: + *ctx = container_of(ep, struct lnx_ctx, ctx_ep.fid); + lep = (*ctx)->ctx_parent; + break; + case FI_CLASS_EP: + case FI_CLASS_SEP: + lep = container_of(ep, struct lnx_ep, le_ep.ep_fid.fid); + break; + default: + lep = NULL; + } + + return lep; +} + +static struct fid_ep *lnx_get_core_ep(struct local_prov_ep *cep, int idx, + size_t fclass) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + return cep->lpe_rxc[idx]; + case FI_CLASS_TX_CTX: + return cep->lpe_txc[idx]; + case FI_CLASS_EP: + case FI_CLASS_SEP: + return cep->lpe_ep; + default: + return NULL; + } + + return NULL; +} + +static void +lnx_init_rx_entry(struct lnx_rx_entry *entry, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + memcpy(&entry->rx_iov, iov, sizeof(*iov) * count); + if (desc) + memcpy(entry->rx_desc, desc, sizeof(*desc) * count); + + entry->rx_entry.iov = entry->rx_iov; + entry->rx_entry.desc = entry->rx_desc; + entry->rx_entry.count = count; + entry->rx_entry.addr = addr; + entry->rx_entry.context = context; + entry->rx_entry.tag = tag; + entry->rx_entry.flags = flags; + entry->rx_ignore = ignore; +} + +static struct lnx_rx_entry * +get_rx_entry(struct local_prov_ep *cep, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + struct lnx_rx_entry *rx_entry = NULL; + ofi_spin_t *bplock; + struct ofi_bufpool *bp; + + /* if lp is NULL, then we don't know where the message is going to + * come from, so allocate the rx_entry from a global pool + */ + if (!cep) { + bp = global_recv_bp; + bplock = &global_bplock; + } else { + bp = cep->lpe_recv_bp; + bplock = &cep->lpe_bplock; + } + + 
ofi_spin_lock(bplock); + rx_entry = (struct lnx_rx_entry *)ofi_buf_alloc(bp); + ofi_spin_unlock(bplock); + if (rx_entry) { + memset(rx_entry, 0, sizeof(*rx_entry)); + if (!cep) + rx_entry->rx_global = true; + rx_entry->rx_cep = cep; + lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, + ignore, context, flags); + } + + return rx_entry; +} + +static inline struct lnx_rx_entry * +lnx_remove_first_match(struct lnx_queue *q, struct lnx_match_attr *match) +{ + struct lnx_rx_entry *rx_entry; + + ofi_spin_lock(&q->lq_qlock); + rx_entry = (struct lnx_rx_entry *) dlist_remove_first_match( + &q->lq_queue, q->lq_match_func, match); + ofi_spin_unlock(&q->lq_qlock); + + return rx_entry; +} + +static inline void +lnx_insert_rx_entry(struct lnx_queue *q, struct lnx_rx_entry *entry) +{ + ofi_spin_lock(&q->lq_qlock); + dlist_insert_tail((struct dlist_entry *)(&entry->rx_entry), + &q->lq_queue); + ofi_spin_unlock(&q->lq_qlock); +} + +int lnx_queue_tag(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *)entry; + struct lnx_peer_srq *lnx_srq = (struct lnx_peer_srq*)entry->owner_context; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + entry->addr, entry->tag); + + lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_unexq, rx_entry); + + return 0; +} + +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + struct lnx_match_attr match_attr; + struct lnx_peer_srq *lnx_srq; + struct local_prov_ep *cep; + struct lnx_ep *lep; + struct lnx_rx_entry *rx_entry; + fi_addr_t addr = match->addr; + struct lnx_srx_context *srx_ctxt; + uint64_t tag = match->tag; + int rc = 0; + + /* get the endpoint */ + cep = container_of(srx, struct local_prov_ep, lpe_srx); + srx_ctxt = cep->lpe_srx.ep_fid.fid.context; + cep = srx_ctxt->srx_cep; + lep = srx_ctxt->srx_lep; + lnx_srq = &lep->le_srq; + + /* The fi_addr_t is a generic address returned by the provider. It's usually + * just an index or id in their AV table. When I get it here, I could have + * duplicates if multiple providers are using the same scheme to + * insert in the AV table. I need to be able to identify the provider + * in this function so I'm able to correctly match this message to + * a possible rx entry on my receive queue. That's why we need to make + * sure we use the core endpoint as part of the matching key. + */ + memset(&match_attr, 0, sizeof(match_attr)); + + match_attr.lm_addr = addr; + match_attr.lm_ignore = 0; + match_attr.lm_tag = tag; + match_attr.lm_cep = cep; + + /* 1. Find a matching request to the message received. + * 2. Return the receive request. + * 3. If there are no matching requests, then create a new one + * and return it to the core provider. The core provider will turn + * around and tell us to queue it. Return -FI_ENOENT. 
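+	 *
+	 * Returning -FI_ENOENT marks the message as unexpected; the core
+	 * provider then calls lnx_queue_tag() to park this rx_entry on the
+	 * unexpected queue, where a later lnx_trecv*() call can match it
+	 * through lnx_process_recv() and kick start_tag()/start_msg().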
+ */ + rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_recvq, + &match_attr); + if (rx_entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + addr, tag); + + goto assign; + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 not found\n", + addr, tag); + + rx_entry = get_rx_entry(cep, NULL, NULL, 0, addr, tag, 0, NULL, + lnx_ep_rx_flags(lep)); + if (!rx_entry) { + rc = -FI_ENOMEM; + goto out; + } + + rx_entry->rx_match_info = *match; + rx_entry->rx_entry.owner_context = lnx_srq; + rx_entry->rx_entry.msg_size = match->msg_size; + + rc = -FI_ENOENT; + +assign: + rx_entry->rx_entry.msg_size = MIN(rx_entry->rx_entry.msg_size, + match->msg_size); + *entry = &rx_entry->rx_entry; + +out: + return rc; +} + +/* + * if lp is NULL, then we're attempting to receive from any peer so + * matching the tag is the only thing that matters. + * + * if lp != NULL, then we're attempting to receive from a particular + * peer. This peer can have multiple endpoints serviced by different core + * providers. + * + * Therefore when we check the unexpected queue, we need to check + * if we received any messages from any of the peer's addresses. If we + * find one, then we kick the core provider associated with that + * address to receive the message. + * + * If nothing is found on the unexpected messages, then add a receive + * request on the SRQ; happens in the lnx_process_recv() + */ +static int lnx_process_recv(struct lnx_ep *lep, struct iovec *iov, void **desc, + fi_addr_t addr, size_t count, struct lnx_peer *lp, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged) +{ + struct lnx_peer_srq *lnx_srq = &lep->le_srq; + struct local_prov_ep *cep; + struct lnx_rx_entry *rx_entry; + struct lnx_match_attr match_attr; + int rc = 0; + + match_attr.lm_addr = addr; + match_attr.lm_ignore = ignore; + match_attr.lm_tag = tag; + match_attr.lm_cep = NULL; + match_attr.lm_peer = lp; + + /* if support is turned off, don't go down the SRQ path */ + if (!lep->le_domain->ld_srx_supported) + return -FI_ENOSYS; + + rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_unexq, + &match_attr); + if (!rx_entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx not found\n", + addr, tag, ignore, iov->iov_base, iov->iov_len); + + goto nomatch; + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx found\n", + addr, tag, ignore, iov->iov_base, iov->iov_len); + + cep = rx_entry->rx_cep; + + /* match is found in the unexpected queue. call into the core + * provider to complete this message + */ + lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, ignore, + context, lnx_ep_rx_flags(lep)); + rx_entry->rx_entry.msg_size = MIN(ofi_total_iov_len(iov, count), + rx_entry->rx_entry.msg_size); + if (tagged) + rc = cep->lpe_srx.peer_ops->start_tag(&rx_entry->rx_entry); + else + rc = cep->lpe_srx.peer_ops->start_msg(&rx_entry->rx_entry); + + if (rc == -FI_EINPROGRESS) { + /* this is telling me that more messages can match the same + * rx_entry. 
So keep it on the queue + */ + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = %lx start_tag() in progress\n", + addr, tag, ignore); + + goto insert_recvq; + } else if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "start tag failed with %d\n", rc); + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = %lx start_tag() success\n", + addr, tag, ignore); + + return 0; + +nomatch: + /* nothing on the unexpected queue, then allocate one and put it on + * the receive queue + */ + rx_entry = get_rx_entry(NULL, iov, desc, count, addr, tag, ignore, + context, lnx_ep_rx_flags(lep)); + rx_entry->rx_entry.msg_size = ofi_total_iov_len(iov, count); + if (!rx_entry) { + rc = -FI_ENOMEM; + goto out; + } + rx_entry->rx_peer = lp; + +insert_recvq: + lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_recvq, rx_entry); + +out: + return rc; +} + +ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct iovec iov = {.iov_base = buf, .iov_len = len}; + struct lnx_peer *lp; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lnx_get_core_desc(desc, &mem_desc); + + /* addr is an index into the peer table. + * This gets us to a peer. Each peer can be reachable on + * multiple endpoints. Each endpoint has its own fi_addr_t which is + * core provider specific. + */ + lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc); + if (rc) + goto out; + } + + rc = lnx_process_recv(lep, &iov, &mem_desc, src_addr, 1, lp, tag, ignore, + context, 0, true); + if (rc == -FI_ENOSYS) + goto do_recv; + else if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, "lnx_process_recv failed with %d\n", rc); + + goto out; + +do_recv: + if (lp) + rc = fi_trecv(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, ignore, context); + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct lnx_peer *lp; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + lnx_get_core_desc(*desc, &mem_desc); + + lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep, + &core_addr, iov, count, &mre, &mem_desc); + if (rc) + goto out; + } + + rc = lnx_process_recv(lep, (struct iovec *)iov, &mem_desc, src_addr, + 1, lp, tag, ignore, context, 0, true); + if (rc == -FI_ENOSYS) + goto do_recv; + + goto out; + +do_recv: + if (lp) + rc = fi_trecvv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, ignore, context); + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = 
FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct lnx_peer *lp; + struct fi_msg_tagged core_msg; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lp = lnx_get_peer(peer_tbl->lpt_entries, msg->addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, *msg->desc, + &cep, &core_addr, msg->msg_iov, + msg->iov_count, &mre, &mem_desc); + if (rc) + goto out; + } + lnx_get_core_desc(*msg->desc, &mem_desc); + + rc = lnx_process_recv(lep, (struct iovec *)msg->msg_iov, &mem_desc, + msg->addr, msg->iov_count, lp, msg->tag, msg->ignore, + msg->context, flags, true); + if (rc == -FI_ENOSYS) + goto do_recv; + + goto out; + +do_recv: + if (lp) { + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + rc = fi_trecvmsg(cep->lpe_ep, &core_msg, flags); + } + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*) buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + void *mem_desc; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, (desc) ? *desc : NULL, &cep, + &core_addr, iov, count, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_addr, tag); + + rc = fi_tsendv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct fi_msg_tagged core_msg; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[msg->addr], + lep->le_domain, + (msg->desc) ? 
*msg->desc : NULL, &cep, + &core_addr, msg->msg_iov, + msg->iov_count, &mre, &mem_desc, NULL); + if (rc) + return rc; + + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_msg.addr, core_msg.tag); + + rc = fi_tsendmsg(cep->lpe_ep, &core_msg, flags); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinject(cep->lpe_ep, buf, len, core_addr, tag); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tsenddata(cep->lpe_ep, buf, len, mem_desc, + data, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinjectdata(cep->lpe_ep, buf, len, data, core_addr, tag); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +static inline ssize_t +lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = 
lnx_select_send_pathway(peer_tbl->lpt_entries[src_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma read from %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_read(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma write to %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_write(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_write(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_atomic(core_ep, buf, count, mem_desc, + core_addr, addr, key, datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_readwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, 
+ "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_fetch_atomic(core_ep, buf, count, desc, + result, mem_desc, core_addr, addr, key, + datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_compwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_compare_atomic(core_ep, buf, count, desc, + compare, compare_desc, result, mem_desc, + core_addr, addr, key, datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + +out: + return rc; +} + +struct fi_ops_tagged lnx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = lnx_trecv, + .recvv = lnx_trecvv, + .recvmsg = lnx_trecvmsg, + .send = lnx_tsend, + .sendv = lnx_tsendv, + .sendmsg = lnx_tsendmsg, + .inject = lnx_tinject, + .senddata = lnx_tsenddata, + .injectdata = lnx_tinjectdata, +}; + +struct fi_ops_msg lnx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_rma lnx_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = lnx_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = lnx_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = fi_no_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; + +struct fi_ops_atomic lnx_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = lnx_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = lnx_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = lnx_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + + diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 634af1e5e82..ffe1bb87b5f 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -93,7 +93,8 @@ char *ofi_strdup_tail(const char *str) } */ -char *ofi_strdup_append(const char *head, const char *tail) +static char *ofi_strdup_append_internal(const char *head, const char *tail, + char 
delim) { char *str; size_t len; @@ -101,10 +102,20 @@ char *ofi_strdup_append(const char *head, const char *tail) len = strlen(head) + strlen(tail) + 2; str = malloc(len); if (str) - sprintf(str, "%s%c%s", head, OFI_NAME_DELIM, tail); + sprintf(str, "%s%c%s", head, delim, tail); return str; } +char *ofi_strdup_link_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_LNX_DELIM); +} + +char *ofi_strdup_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_DELIM); +} + int ofi_exclude_prov_name(char **prov_name_list, const char *util_prov_name) { char *exclude, *name, *temp; diff --git a/src/fabric.c b/src/fabric.c index b1a735638bb..13b529ea95c 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -262,6 +262,11 @@ static int ofi_is_hook_prov(const struct fi_provider *provider) return ofi_prov_ctx(provider)->type == OFI_PROV_HOOK; } +static int ofi_is_lnx_prov(const struct fi_provider *provider) +{ + return ofi_prov_ctx(provider)->type == OFI_PROV_LNX; +} + int ofi_apply_filter(struct ofi_filter *filter, const char *name) { if (!filter->names) @@ -500,6 +505,8 @@ static void ofi_set_prov_type(struct fi_provider *provider) ofi_prov_ctx(provider)->type = OFI_PROV_UTIL; else if (ofi_has_offload_prefix(provider->name)) ofi_prov_ctx(provider)->type = OFI_PROV_OFFLOAD; + else if (ofi_is_lnx(provider->name)) + ofi_prov_ctx(provider)->type = OFI_PROV_LNX; else ofi_prov_ctx(provider)->type = OFI_PROV_CORE; } @@ -988,6 +995,7 @@ void fi_ini(void) ofi_register_provider(SOCKETS_INIT, NULL); ofi_register_provider(TCP_INIT, NULL); + ofi_register_provider(LNX_INIT, NULL); ofi_register_provider(HOOK_PERF_INIT, NULL); ofi_register_provider(HOOK_TRACE_INIT, NULL); ofi_register_provider(HOOK_PROFILE_INIT, NULL); @@ -1207,8 +1215,12 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr, core_name = attr->prov_name; if (core_name) { - assert(ofi_is_util_prov(prov)); - attr->prov_name = ofi_strdup_append(core_name, prov->name); + if (ofi_is_lnx_prov(prov)) { + attr->prov_name = ofi_strdup_link_append(core_name, prov->name); + } else { + assert(ofi_is_util_prov(prov)); + attr->prov_name = ofi_strdup_append(core_name, prov->name); + } free(core_name); } else { attr->prov_name = strdup(prov->name); @@ -1557,7 +1569,9 @@ int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr, fi_ini(); - top_name = strrchr(attr->prov_name, OFI_NAME_DELIM); + ret = ofi_is_linked(attr->prov_name); + top_name = strrchr(attr->prov_name, + ret ? OFI_NAME_LNX_DELIM : OFI_NAME_DELIM); if (top_name) top_name++; else diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 910dfd1214b..420f0cca2f6 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -259,6 +259,7 @@ static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) CASEENUMSTRN(FI_PROTO_SM2, len); CASEENUMSTRN(FI_PROTO_CXI_RNR, len); CASEENUMSTRN(FI_PROTO_LPP, len); + CASEENUMSTRN(FI_PROTO_LNX, len); default: ofi_strncatf(buf, len, "Unknown"); break;
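
The provider-name bookkeeping changed in util_attr.c and fabric.c above can
also be seen in isolation. The stand-alone snippet below is an illustration,
not part of the patch; the example strings are assumed. It mimics what
ofi_strdup_link_append() produces for a linked provider and how fi_fabric()
now recovers the top-level provider name using the ':' delimiter instead of
the usual ';':

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *core_name = "shm+cxi";	/* as built by lnx_form_info() */
	const char *prov_name = "lnx";
	char full[64], *top_name;

	/* equivalent of ofi_strdup_link_append(core_name, prov_name) */
	snprintf(full, sizeof(full), "%s%c%s", core_name, ':', prov_name);

	/* equivalent of the strrchr() lookup in fi_fabric() */
	top_name = strrchr(full, ':');
	top_name = top_name ? top_name + 1 : full;

	/* prints: prov_name="shm+cxi:lnx" top_name="lnx" */
	printf("prov_name=\"%s\" top_name=\"%s\"\n", full, top_name);
	return 0;
}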
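For completeness, here is a minimal, illustrative sketch (not part of the
patch) of how an application is expected to consume the fi_info entries that
lnx_getinfo() generates: set FI_LNX_PROV_LINKS, request the lnx provider, and
open one of the returned links. The "shm+cxi" link string, the API version
constant, and the domain name mentioned in the comment are assumptions taken
from the commit message and lnx_form_info(); error handling is abbreviated.

#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>

int open_lnx_link(struct fid_fabric **fab, struct fid_domain **dom,
		  struct fid_ep **ep)
{
	struct fi_info *hints, *info;
	int rc;

	/* pick the providers to link; same format as the commit message */
	setenv("FI_LNX_PROV_LINKS", "shm+cxi", 1);

	hints = fi_allocinfo();
	if (!hints)
		return -FI_ENOMEM;

	hints->fabric_attr->prov_name = strdup("lnx");
	hints->ep_attr->type = FI_EP_RDM;
	hints->caps = FI_TAGGED;

	/* lnx_getinfo() returns one fi_info per possible link */
	rc = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info);
	if (rc)
		goto free_hints;

	/* take the first link; an application could instead walk info->next
	 * and choose based on domain_attr->name, which has the
	 * "<linked domains>:<lnx domain>" form built in lnx_form_info()
	 */
	rc = fi_fabric(info->fabric_attr, fab, NULL);
	if (!rc)
		rc = fi_domain(*fab, info, dom, NULL);
	if (!rc)
		rc = fi_endpoint(*dom, info, ep, NULL);

	fi_freeinfo(info);
free_hints:
	fi_freeinfo(hints);
	return rc;
}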