Skip to content

Commit

Permalink
Merge pull request pmodels#6587 from abrooks98/misc_copy_opt
Browse files Browse the repository at this point in the history
mpl/gpu/ze: misc copy optimizations (PR #9)

Approved-by: Hui Zhou
  • Loading branch information
hzhou authored Jul 23, 2023
2 parents c3db64a + 3b9cd40 commit 84a4193
Show file tree
Hide file tree
Showing 11 changed files with 630 additions and 118 deletions.
2 changes: 1 addition & 1 deletion src/include/mpir_misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ int MPIR_Localcopy_stream(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype
int MPIR_Localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
MPL_pointer_attr_t * sendattr, void *recvbuf, MPI_Aint recvcount,
MPI_Datatype recvtype, MPL_pointer_attr_t * recvattr,
MPL_gpu_engine_type_t enginetype, bool commit);
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t enginetype, bool commit);

/* Contiguous datatype calculates buffer address with `(char *) buf + dt_true_lb`.
* However, dt_true_lb is treated as ptrdiff_t (signed), and when buf is MPI_BOTTOM
Expand Down
23 changes: 23 additions & 0 deletions src/mpi/init/mpir_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,27 @@
in order to allow the process to continue (e.g., in gdb, "set
hold=0").
- name : MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
category : GPU
type : boolean
default : false
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If true, mpl/ze will use immediate command lists for copying.
- name : MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES
category : GPU
type : boolean
default : false
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If true, mpl/ze will use command queues in a round-robin fashion.
If false, only command queues of index 0 will be used.
- name : MPIR_CVAR_NO_COLLECTIVE_FINALIZE
category : COLLECTIVE
type : boolean
Expand Down Expand Up @@ -214,6 +235,8 @@ int MPII_Init_thread(int *argc, char ***argv, int user_required, int *provided,
(MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE == MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE_specialized);

MPL_gpu_info.specialized_cache = specialized_cache;
MPL_gpu_info.use_immediate_cmdlist = MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST;
MPL_gpu_info.roundrobin_cmdq = MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES;

int mpl_errno = MPL_gpu_init(debug_summary);
MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");
Expand Down
10 changes: 5 additions & 5 deletions src/mpi/misc/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ static int do_localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se
static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
MPL_pointer_attr_t * send_attr, void *recvbuf, MPI_Aint recvcount,
MPI_Datatype recvtype, MPL_pointer_attr_t * recv_attr,
MPL_gpu_engine_type_t enginetype, bool commit,
MPIR_Typerep_req * typerep_req)
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t enginetype,
bool commit, MPIR_Typerep_req * typerep_req)
{
int mpi_errno = MPI_SUCCESS;
int mpl_errno = MPL_SUCCESS;
Expand Down Expand Up @@ -269,7 +269,7 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp

mpl_errno = MPL_gpu_imemcpy(MPIR_get_contig_ptr(recvbuf, recvtype_true_lb),
MPIR_get_contig_ptr(sendbuf, sendtype_true_lb), copy_sz,
dev_id, enginetype, &gpu_req, commit);
dev_id, dir, enginetype, &gpu_req, commit);
MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**mpl_gpu_imemcpy");

Expand Down Expand Up @@ -365,15 +365,15 @@ int MPIR_Localcopy_stream(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype
int MPIR_Localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
MPL_pointer_attr_t * sendattr, void *recvbuf, MPI_Aint recvcount,
MPI_Datatype recvtype, MPL_pointer_attr_t * recvattr,
MPL_gpu_engine_type_t enginetype, bool commit)
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t enginetype, bool commit)
{
int mpi_errno = MPI_SUCCESS;

MPIR_FUNC_ENTER;

#ifdef MPL_HAVE_GPU
mpi_errno = do_localcopy_gpu(sendbuf, sendcount, sendtype, sendattr, recvbuf, recvcount,
recvtype, recvattr, enginetype, commit, NULL);
recvtype, recvattr, dir, enginetype, commit, NULL);
MPIR_ERR_CHECK(mpi_errno);
#else
mpi_errno =
Expand Down
17 changes: 11 additions & 6 deletions src/mpid/ch4/shm/ipc/src/ipc_fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,37 +93,42 @@ static int MPIDI_IPC_mpi_ze_fd_setup(void)

#if defined(MPL_HAVE_ZE)
int num_fds, i, r, mpl_err = MPL_SUCCESS;
int *fds;
int *fds, *bdfs;

/* Get the number of ze devices */
mpl_err = MPL_ze_init_device_fds(&num_fds, NULL);
mpl_err = MPL_ze_init_device_fds(&num_fds, NULL, NULL);
MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**mpl_ze_init_device_fds");

fds = (int *) MPL_malloc(num_fds * sizeof(int), MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP(!fds, mpi_errno, MPI_ERR_OTHER, "**nomem");

bdfs = (int *) MPL_malloc(num_fds * 4 * sizeof(int), MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP(!bdfs, mpi_errno, MPI_ERR_OTHER, "**nomem");

if (MPIR_Process.local_rank == 0) {
/* Setup the device fds */
mpl_err = MPL_ze_init_device_fds(&num_fds, fds);
mpl_err = MPL_ze_init_device_fds(&num_fds, fds, bdfs);
MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**mpl_ze_init_device_fds");

/* Send the fds to all other local processes */
for (r = 1; r < MPIR_Process.local_size; r++) {
for (i = 0; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_send(r, fds[0], bdfs, 4 * num_fds * sizeof(int));
for (i = 1; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_send(r, fds[i], NULL, 0);
}
}
} else {
/* Receive the fds from local process 0 */
for (i = 0; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_recv(0, fds, bdfs, 4 * num_fds * sizeof(int), 0);
for (i = 1; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_recv(0, fds + i, NULL, 0, 0);
}
}

/* Save the fds in MPL */
MPL_ze_set_fds(num_fds, fds);
MPL_ze_set_fds(num_fds, fds, bdfs);

fn_exit:
return mpi_errno;
Expand Down
3 changes: 2 additions & 1 deletion src/mpid/ch4/shm/ipc/src/ipc_p2p.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr,
mpi_errno =
MPIR_Localcopy_gpu(src_buf, src_data_sz, MPI_BYTE, NULL,
MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count),
MPIDIG_REQUEST(rreq, datatype), &attr, engine, true);
MPIDIG_REQUEST(rreq, datatype), &attr,
MPL_GPU_COPY_DIRECTION_NONE, engine, true);
MPIR_ERR_CHECK(mpi_errno);
} else {
/* TODO: get sender datatype and call MPIR_Typerep_op with mapped_device set to dev_id */
Expand Down
16 changes: 15 additions & 1 deletion src/mpl/include/mpl_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,26 @@ typedef enum {
MPL_GPU_ENGINE_TYPE_COMPUTE = 0,
MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH,
MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY,
MPL_GPU_ENGINE_TYPE_LAST,
} MPL_gpu_engine_type_t;

#define MPL_GPU_ENGINE_NUM_TYPES 3

/* Copy-direction selector, passed as the `dir` argument of MPL_gpu_imemcpy
 * (and forwarded by MPIR_Localcopy_gpu). Enumerators are numbered
 * consecutively starting at 0.
 * NOTE(review): D2H / H2D / D2D presumably abbreviate device-to-host,
 * host-to-device and device-to-device; the exact meaning of
 * INCOMING/OUTGOING is not visible here -- confirm against the ze backend. */
typedef enum {
MPL_GPU_COPY_D2H = 0,
MPL_GPU_COPY_H2D,
MPL_GPU_COPY_D2D_INCOMING,
MPL_GPU_COPY_D2D_OUTGOING,
/* Has value 4; this is what the IPC receive path in ipc_p2p.h passes. */
MPL_GPU_COPY_DIRECTION_NONE,
} MPL_gpu_copy_direction_t;

#define MPL_GPU_COPY_DIRECTION_TYPES 4

typedef struct {
/* Input */
int debug_summary;
bool use_immediate_cmdlist;
bool roundrobin_cmdq;
/* Output */
bool enable_ipc;
MPL_gpu_ipc_handle_type_t ipc_handle_type;
Expand Down Expand Up @@ -129,7 +142,8 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
MPL_pointer_attr_t * dest_attr, size_t size);

int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
MPL_gpu_engine_type_t engine_type, MPL_gpu_request * req, bool commit);
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t engine_type,
MPL_gpu_request * req, bool commit);
int MPL_gpu_test(MPL_gpu_request * req, int *completed);

typedef void (*MPL_gpu_hostfn) (void *data);
Expand Down
11 changes: 8 additions & 3 deletions src/mpl/include/mpl_gpu_ze.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,13 @@ typedef struct MPL_cmdlist_pool {
struct MPL_cmdlist_pool *next, *prev;
} MPL_cmdlist_pool_t;

typedef struct {
typedef struct MPL_ze_event {
ze_event_handle_t event;
struct MPL_ze_event *next, *prev;
} MPL_gpu_event;

/* Request object for the ZE backend; this is the type taken (by pointer) as
 * the `req` argument of MPL_gpu_imemcpy and MPL_gpu_test. */
typedef struct {
/* Points to an MPL_gpu_event, which wraps a ze_event_handle_t plus list links. */
MPL_gpu_event *gpu_event;
/* Points to an MPL_cmdlist_pool_t entry.
 * NOTE(review): presumably the command list the copy was appended to -- confirm. */
MPL_cmdlist_pool_t *cmdList;
} MPL_gpu_request;

Expand All @@ -52,8 +57,8 @@ typedef volatile int MPL_gpu_event_t;
#define MPL_GPU_DEV_AFFINITY_ENV "ZE_AFFINITY_MASK"

/* ZE specific function */
int MPL_ze_init_device_fds(int *num_fds, int *device_fds);
void MPL_ze_set_fds(int num_fds, int *fds);
int MPL_ze_init_device_fds(int *num_fds, int *device_fds, int *bdfs);
void MPL_ze_set_fds(int num_fds, int *fds, int *bdfs);
void MPL_ze_ipc_remove_cache_handle(void *dptr);
int MPL_ze_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, int local_dev_id,
int use_shared_fd, MPL_gpu_ipc_mem_handle_t * ipc_handle);
Expand Down
3 changes: 2 additions & 1 deletion src/mpl/src/gpu/mpl_gpu_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,8 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
}

int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
MPL_gpu_engine_type_t engine_type, MPL_gpu_request * req, bool commit)
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t engine_type,
MPL_gpu_request * req, bool commit)
{
return MPL_ERR_GPU_INTERNAL;
}
Expand Down
3 changes: 2 additions & 1 deletion src/mpl/src/gpu/mpl_gpu_fallback.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
}

int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
MPL_gpu_engine_type_t engine_type, MPL_gpu_request * req, bool commit)
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t engine_type,
MPL_gpu_request * req, bool commit)
{
return MPL_ERR_GPU_INTERNAL;
}
Expand Down
3 changes: 2 additions & 1 deletion src/mpl/src/gpu/mpl_gpu_hip.c
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,8 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
}

int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
MPL_gpu_engine_type_t engine_type, MPL_gpu_request * req, bool commit)
MPL_gpu_copy_direction_t dir, MPL_gpu_engine_type_t engine_type,
MPL_gpu_request * req, bool commit)
{
return MPL_ERR_GPU_INTERNAL;
}
Expand Down
Loading

0 comments on commit 84a4193

Please sign in to comment.