Skip to content

Commit

Permalink
force ofi cxi to disable LCI_ENABLE_PRG_NET_ENDPOINT
Browse files Browse the repository at this point in the history
  • Loading branch information
JiakunYan committed Jun 21, 2024
1 parent 917e4ac commit 687cf6e
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 64 deletions.
7 changes: 0 additions & 7 deletions lci/api/lci.h
Original file line number Diff line number Diff line change
Expand Up @@ -592,13 +592,6 @@ typedef enum {
} LCI_rdv_protocol_t;
extern LCI_rdv_protocol_t LCI_RDV_PROTOCOL;

/**
* @ingroup LCI_COMM
* @brief For the libfabric cxi provider, try turning off the mr_bind
* workaround (hack) to see whether cxi has fixed the double mr_bind error.
*/
extern bool LCI_OFI_CXI_TRY_NO_HACK;

/**
* @ingroup LCI_COMM
* @brief For the UCX backend, use try_lock to wrap the ucx function calls.
Expand Down
24 changes: 4 additions & 20 deletions lci/backend/ofi/server_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ void LCISD_server_init(LCIS_server_t* s)
LCI_Assert(LCI_USE_DREG == 0,
"The registration cache should be turned off "
"for libfabric cxi backend. Use `export LCI_USE_DREG=0`.\n");
LCI_Assert(LCI_ENABLE_PRG_NET_ENDPOINT == 0,
"The progress-specific network endpoint "
"for libfabric cxi backend. Use `export "
"LCI_ENABLE_PRG_NET_ENDPOINT=0`.\n");
if (LCI_RDV_PROTOCOL != LCI_RDV_WRITE) {
LCI_RDV_PROTOCOL = LCI_RDV_WRITE;
LCI_Warn(
Expand All @@ -116,15 +120,11 @@ void LCISD_server_init(LCIS_server_t* s)

// Create domain.
FI_SAFECALL(fi_domain(server->fabric, server->info, &server->domain, NULL));

server->endpoint_count = 0;
}

void LCISD_server_fina(LCIS_server_t s)
{
LCISI_server_t* server = (LCISI_server_t*)s;
LCI_Assert(server->endpoint_count == 0, "Endpoint count is not zero (%d)\n",
server->endpoint_count);
FI_SAFECALL(fi_close((struct fid*)&server->domain->fid));
FI_SAFECALL(fi_close((struct fid*)&server->fabric->fid));
fi_freeinfo(server->info);
Expand All @@ -138,19 +138,7 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
LCISI_endpoint_t* endpoint_p = LCIU_malloc(sizeof(LCISI_endpoint_t));
*endpoint_pp = (LCIS_endpoint_t)endpoint_p;
endpoint_p->server = (LCISI_server_t*)server_pp;
endpoint_p->server->endpoints[endpoint_p->server->endpoint_count++] =
endpoint_p;
endpoint_p->is_single_threaded = single_threaded;
if (!LCI_OFI_CXI_TRY_NO_HACK &&
strcmp(endpoint_p->server->info->fabric_attr->prov_name, "cxi") == 0 &&
endpoint_p->server->info->domain_attr->mr_mode & FI_MR_ENDPOINT &&
endpoint_p->server->endpoint_count > 1) {
// We are using more than one endpoint per server, but the cxi provider
// can only bind mr to one endpoint. We have to guess here.
endpoint_p->server->cxi_mr_bind_hack = true;
} else {
endpoint_p->server->cxi_mr_bind_hack = false;
}
// Create end-point;
endpoint_p->server->info->tx_attr->size = LCI_SERVER_MAX_SENDS;
endpoint_p->server->info->rx_attr->size = LCI_SERVER_MAX_RECVS;
Expand Down Expand Up @@ -223,10 +211,6 @@ void LCISD_endpoint_fina(LCIS_endpoint_t endpoint_pp)
LCT_pmi_barrier();
LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
LCIU_free(endpoint_p->peer_addrs);
int my_idx = --endpoint_p->server->endpoint_count;
LCI_Assert(endpoint_p->server->endpoints[my_idx] == endpoint_p,
"This is not me!\n");
endpoint_p->server->endpoints[my_idx] = NULL;
FI_SAFECALL(fi_close((struct fid*)&endpoint_p->ep->fid));
FI_SAFECALL(fi_close((struct fid*)&endpoint_p->cq->fid));
FI_SAFECALL(fi_close((struct fid*)&endpoint_p->av->fid));
Expand Down
38 changes: 1 addition & 37 deletions lci/backend/ofi/server_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ typedef struct __attribute__((aligned(LCI_CACHE_LINE))) LCISI_server_t {
struct fi_info* info;
struct fid_fabric* fabric;
struct fid_domain* domain;
struct LCISI_endpoint_t* endpoints[LCI_SERVER_MAX_ENDPOINTS];
int endpoint_count;
bool cxi_mr_bind_hack;
} LCISI_server_t;

typedef struct __attribute__((aligned(LCI_CACHE_LINE))) LCISI_endpoint_t {
Expand Down Expand Up @@ -72,16 +69,7 @@ static inline void* LCISI_real_server_reg(LCIS_endpoint_t endpoint_pp,
FI_READ | FI_WRITE | FI_REMOTE_WRITE, 0, rdma_key, 0,
&mr, 0));
if (server->info->domain_attr->mr_mode & FI_MR_ENDPOINT) {
LCI_DBG_Assert(server->endpoint_count >= 1, "No endpoints available!\n");
if (server->cxi_mr_bind_hack) {
// A temporary fix for the cxi provider, currently cxi cannot bind a
// memory region to more than one endpoint.
FI_SAFECALL(fi_mr_bind(mr, &endpoint_p->ep->fid, 0));
} else {
for (int i = 0; i < server->endpoint_count; ++i) {
FI_SAFECALL(fi_mr_bind(mr, &server->endpoints[i]->ep->fid, 0));
}
}
FI_SAFECALL(fi_mr_bind(mr, &endpoint_p->ep->fid, 0));
FI_SAFECALL(fi_mr_enable(mr));
}
return (void*)mr;
Expand Down Expand Up @@ -240,12 +228,6 @@ static inline LCI_error_t LCISD_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
LCIS_rkey_t rkey)
{
LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
LCI_Assert(
!endpoint_p->server->cxi_mr_bind_hack ||
endpoint_p == endpoint_p->server
->endpoints[endpoint_p->server->endpoint_count - 1],
"We are using cxi mr_bind hacking mode but unexpected endpoint is "
"performing remote put. Try `export LCI_ENABLE_PRG_NET_ENDPOINT=0`.\n");
uintptr_t addr;
if (endpoint_p->server->info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ||
endpoint_p->server->info->domain_attr->mr_mode & FI_MR_BASIC) {
Expand Down Expand Up @@ -292,12 +274,6 @@ static inline LCI_error_t LCISD_post_put(LCIS_endpoint_t endpoint_pp, int rank,
LCIS_rkey_t rkey, void* ctx)
{
LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
LCI_Assert(
!endpoint_p->server->cxi_mr_bind_hack ||
endpoint_p == endpoint_p->server
->endpoints[endpoint_p->server->endpoint_count - 1],
"We are using cxi mr_bind hacking mode but an unexpected endpoint is "
"performing remote put. Try `export LCI_ENABLE_PRG_NET_ENDPOINT=0`.\n");
uintptr_t addr;
if (endpoint_p->server->info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ||
endpoint_p->server->info->domain_attr->mr_mode & FI_MR_BASIC) {
Expand Down Expand Up @@ -345,12 +321,6 @@ static inline LCI_error_t LCISD_post_putImms(LCIS_endpoint_t endpoint_pp,
LCIS_rkey_t rkey, uint32_t meta)
{
LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
LCI_Assert(
!endpoint_p->server->cxi_mr_bind_hack ||
endpoint_p == endpoint_p->server
->endpoints[endpoint_p->server->endpoint_count - 1],
"We are using cxi mr_bind hacking mode but an unexpected endpoint is "
"performing remote put. Try `export LCI_ENABLE_PRG_NET_ENDPOINT=0`.\n");
uintptr_t addr;
if (endpoint_p->server->info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ||
endpoint_p->server->info->domain_attr->mr_mode & FI_MR_BASIC) {
Expand Down Expand Up @@ -381,12 +351,6 @@ static inline LCI_error_t LCISD_post_putImm(LCIS_endpoint_t endpoint_pp,
void* ctx)
{
LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
LCI_Assert(
!endpoint_p->server->cxi_mr_bind_hack ||
endpoint_p == endpoint_p->server
->endpoints[endpoint_p->server->endpoint_count - 1],
"We are using cxi mr_bind hacking mode but an unexpected endpoint is "
"performing remote put. Try `export LCI_ENABLE_PRG_NET_ENDPOINT=0`.\n");
uintptr_t addr;
if (endpoint_p->server->info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ||
endpoint_p->server->info->domain_attr->mr_mode & FI_MR_BASIC) {
Expand Down

0 comments on commit 687cf6e

Please sign in to comment.