From 66d3f762597af0071d614674af5c6e81390a9b58 Mon Sep 17 00:00:00 2001 From: Jiakun Yan Date: Fri, 20 Sep 2024 12:13:25 -0500 Subject: [PATCH] improve gid auto select --- lci/api/lci.h | 6 +++++ lci/backend/ibv/lcisi_ibv_detail.c | 43 ++++++++++++++++++++---------- lci/backend/ibv/server_ibv.c | 3 ++- lci/runtime/env.c | 3 +++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/lci/api/lci.h b/lci/api/lci.h index 434096a2..b927fbf6 100644 --- a/lci/api/lci.h +++ b/lci/api/lci.h @@ -582,6 +582,12 @@ extern bool LCI_IBV_ENABLE_TD; */ extern int LCI_IBV_GID_IDX; +/** + * @ingroup LCI_COMM + * @brief Enable gid index auto selection for both ib and RoCE. + */ +extern int LCI_IBV_FORCE_GID_AUTO_SELECT; + /** * @ingroup LCI_COMM * @brief Whether to enable the progress specific network endpoint. diff --git a/lci/backend/ibv/lcisi_ibv_detail.c b/lci/backend/ibv/lcisi_ibv_detail.c index a05274fc..23e25645 100644 --- a/lci/backend/ibv/lcisi_ibv_detail.c +++ b/lci/backend/ibv/lcisi_ibv_detail.c @@ -137,9 +137,10 @@ bool select_best_device_port(struct ibv_device** dev_list, int num_devices, } typedef enum roce_version_t { - ROCE_V1, ROCE_V2, - ROCE_VER_UNKNOWN + ROCE_V1, + ROCE_VER_UNKNOWN, + ROCE_VER_MAX, } roce_version_t; roce_version_t query_gid_roce_version(LCISI_server_t* server, @@ -157,8 +158,13 @@ roce_version_t query_gid_roce_version(LCISI_server_t* server, dev_name, server->dev_port, gid_index); if (ret > 0) { if (!strncmp(buf, "IB/RoCE v1", 10)) { + LCI_Log(LCI_LOG_DEBUG, "ibv", + "dev %s port %d index %d uses IB/Roce v1\n", dev_name, + server->dev_port, gid_index); return ROCE_V1; } else if (!strncmp(buf, "RoCE v2", 7)) { + LCI_Log(LCI_LOG_DEBUG, "ibv", "dev %s port %d index %d uses Roce v2\n", + dev_name, server->dev_port, gid_index); return ROCE_V2; } } @@ -200,26 +206,35 @@ bool test_roce_gid_index(LCISI_server_t* server, uint8_t gid_index) int select_best_gid_for_roce(LCISI_server_t* server) { - static const roce_version_t roce_prio[] = { - ROCE_V2, - ROCE_V1, - ROCE_VER_UNKNOWN, + static const int roce_prio[] = { + [ROCE_V2] = 0, + [ROCE_V1] = 1, + [ROCE_VER_UNKNOWN] = 2, }; int gid_tbl_len = server->port_attr.gid_tbl_len; + int best_priority = 100; + int best_gid_idx = -1; LCI_Log(LCI_LOG_DEBUG, "ibv", "RoCE gid auto selection among %d gids\n", gid_tbl_len); - for (int prio_idx = 0; prio_idx < sizeof(roce_prio); prio_idx++) { - for (int i = 0; i < gid_tbl_len; i++) { - roce_version_t version = query_gid_roce_version(server, i); + for (int i = 0; i < gid_tbl_len; ++i) { + roce_version_t version = query_gid_roce_version(server, i); + int priority = roce_prio[version]; - if ((roce_prio[prio_idx] == version) && test_roce_gid_index(server, i)) { - LCI_Log(LCI_LOG_INFO, "ibv", "RoCE gid auto selection: use %d %d\n", i, - version); - return i; - } + if (priority == 0 && test_roce_gid_index(server, i)) { + best_gid_idx = i; + best_priority = priority; + break; + } else if (priority < best_priority && test_roce_gid_index(server, i)) { + best_gid_idx = i; + best_priority = priority; } } + if (best_gid_idx >= 0) { + LCI_Log(LCI_LOG_INFO, "ibv", "RoCE gid auto selection: use gid %d\n", + best_gid_idx); + return best_gid_idx; + } const int default_gid = 0; LCI_Log(LCI_LOG_INFO, "ibv", diff --git a/lci/backend/ibv/server_ibv.c b/lci/backend/ibv/server_ibv.c index f6323f16..4150f8c7 100644 --- a/lci/backend/ibv/server_ibv.c +++ b/lci/backend/ibv/server_ibv.c @@ -172,7 +172,8 @@ void LCISD_server_init(LCIS_server_t* s) // query the gid server->gid_idx = LCI_IBV_GID_IDX; if (server->gid_idx < 0 && - server->port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { + (LCI_IBV_FORCE_GID_AUTO_SELECT || + server->port_attr.link_layer == IBV_LINK_LAYER_ETHERNET)) { // User did not explicitly specify the gid to use and we are using RoCE server->gid_idx = select_best_gid_for_roce(server); } diff --git a/lci/runtime/env.c b/lci/runtime/env.c index cb93e7fd..4de6506a 100644 --- a/lci/runtime/env.c +++ b/lci/runtime/env.c @@ -26,6 +26,7 @@ LCI_API int LCI_SEND_SLOW_DOWN_USEC; LCI_API int LCI_RECV_SLOW_DOWN_USEC; LCI_API bool LCI_IBV_ENABLE_TD; LCI_API int LCI_IBV_GID_IDX; +LCI_API int LCI_IBV_FORCE_GID_AUTO_SELECT; LCI_API bool LCI_ENABLE_PRG_NET_ENDPOINT; LCI_API LCI_rdv_protocol_t LCI_RDV_PROTOCOL; LCI_API bool LCI_OFI_CXI_TRY_NO_HACK; @@ -87,6 +88,8 @@ void LCII_env_init(int num_proc, int rank) LCI_IBV_ENABLE_TD = LCIU_getenv_or("LCI_IBV_ENABLE_TD", LCI_IBV_ENABLE_TD_DEFAULT); LCI_IBV_GID_IDX = LCIU_getenv_or("LCI_IBV_GID_IDX", -1); + LCI_IBV_FORCE_GID_AUTO_SELECT = + LCIU_getenv_or("LCI_IBV_FORCE_GID_AUTO_SELECT", 0); LCI_ENABLE_PRG_NET_ENDPOINT = LCIU_getenv_or( "LCI_ENABLE_PRG_NET_ENDPOINT", LCI_ENABLE_PRG_NET_ENDPOINT_DEFAULT); LCI_MEDIUM_SIZE = LCI_PACKET_SIZE - sizeof(struct LCII_packet_context);