From 3b9cd40f9e5ecb277e67ad797dce31c0757ddb76 Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Tue, 6 Dec 2022 15:21:56 -0800 Subject: [PATCH] mpl/ze: add an option to do GPU copy with and without round robin Specify whether command queues are used in a round-robin fashion or not. --- src/mpi/init/mpir_init.c | 12 +++++++++++ src/mpl/include/mpl_gpu.h | 1 + src/mpl/src/gpu/mpl_gpu_ze.c | 40 ++++++++++++++++++++++++------------ 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/mpi/init/mpir_init.c b/src/mpi/init/mpir_init.c index d1105662750..d3a361606f2 100644 --- a/src/mpi/init/mpir_init.c +++ b/src/mpi/init/mpir_init.c @@ -55,6 +55,17 @@ description : >- If true, mpl/ze will use immediate command list for copying + - name : MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES + category : GPU + type : boolean + default : false + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + If true, mpl/ze will use command queues in a round-robin fashion. + If false, only command queues of index 0 will be used. 
+ - name : MPIR_CVAR_NO_COLLECTIVE_FINALIZE category : COLLECTIVE type : boolean @@ -225,6 +236,7 @@ int MPII_Init_thread(int *argc, char ***argv, int user_required, int *provided, MPL_gpu_info.specialized_cache = specialized_cache; MPL_gpu_info.use_immediate_cmdlist = MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST; + MPL_gpu_info.roundrobin_cmdq = MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES; int mpl_errno = MPL_gpu_init(debug_summary); MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init"); diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index b7be4543f09..8d3096dd206 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -70,6 +70,7 @@ typedef struct { /* Input */ int debug_summary; bool use_immediate_cmdlist; + bool roundrobin_cmdq; /* Output */ bool enable_ipc; MPL_gpu_ipc_handle_type_t ipc_handle_type; diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c index 9b3ee04d2bc..ca49961dcab 100644 --- a/src/mpl/src/gpu/mpl_gpu_ze.c +++ b/src/mpl/src/gpu/mpl_gpu_ze.c @@ -699,7 +699,7 @@ static int mmapFunction(int nfds, int *fds, size_t size, void **ptr) if (*ptr == (void *) -1) { mpl_err = MPL_ERR_GPU_INTERNAL; perror("mmap device to host"); - printf("gdr_handle_open failed fd: %d\n", fds[0]); + printf("mmap failed fd: %d size: %ld\n", fds[0], size); goto fn_fail; } } else { @@ -2058,13 +2058,20 @@ static inline int get_immediate_cmdlist(int *dev, MPL_gpu_copy_direction_t dir, MPL_ze_engine_entry_t *engine_state = device_state->engines + engine; if (dir == MPL_GPU_COPY_DIRECTION_NONE) { - index = engine_state->curQueue; - /* move to next queue */ - engine_state->curQueue++; - if (engine_state->curQueue == engine_state->numQueues) - engine_state->curQueue = 0; + if (MPL_gpu_info.roundrobin_cmdq) { + index = engine_state->curQueue; + /* move to next queue */ + engine_state->curQueue++; + if (engine_state->curQueue == engine_state->numQueues) + engine_state->curQueue = 0; + } else { + index 
= 0; + } } else { - index = dir % engine_state->numQueues; + if (MPL_gpu_info.roundrobin_cmdq) + index = dir % engine_state->numQueues; + else + index = 0; } if (!engine_state->cmdlists[index]) { @@ -2168,13 +2175,20 @@ static int MPL_gpu_imemcpy_normal(void *dest_ptr, void *src_ptr, size_t size, in ZE_ERR_CHECK(ret); int q_index; if (dir == MPL_GPU_COPY_DIRECTION_NONE) { - q_index = engine_state->curQueue; - /* move to next queue */ - engine_state->curQueue++; - if (engine_state->curQueue == engine_state->numQueues) - engine_state->curQueue = 0; + if (MPL_gpu_info.roundrobin_cmdq) { + q_index = engine_state->curQueue; + /* move to next queue */ + engine_state->curQueue++; + if (engine_state->curQueue == engine_state->numQueues) + engine_state->curQueue = 0; + } else { + q_index = 0; + } } else { - q_index = dir % engine_state->numQueues; + if (MPL_gpu_info.roundrobin_cmdq) + q_index = dir % engine_state->numQueues; + else + q_index = 0; } assert(engine_state->cmdQueues); ze_command_queue_handle_t cmdq = engine_state->cmdQueues[q_index];