Skip to content

Commit

Permalink
mpl/ze: use BDF to identify if two level-zero devices are the same de…
Browse files Browse the repository at this point in the history
…vice

For drmfd implementation of shareable IPC handle, track same physical
devices using their (domain, bus, device, function).
  • Loading branch information
zhenggb72 authored and abrooks98 committed Jul 20, 2023
1 parent 23462de commit fcd7c8e
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 27 deletions.
17 changes: 11 additions & 6 deletions src/mpid/ch4/shm/ipc/src/ipc_fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,37 +93,42 @@ static int MPIDI_IPC_mpi_ze_fd_setup(void)

#if defined(MPL_HAVE_ZE)
int num_fds, i, r, mpl_err = MPL_SUCCESS;
int *fds;
int *fds, *bdfs;

/* Get the number of ze devices */
mpl_err = MPL_ze_init_device_fds(&num_fds, NULL);
mpl_err = MPL_ze_init_device_fds(&num_fds, NULL, NULL);
MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**mpl_ze_init_device_fds");

fds = (int *) MPL_malloc(num_fds * sizeof(int), MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP(!fds, mpi_errno, MPI_ERR_OTHER, "**nomem");

bdfs = (int *) MPL_malloc(num_fds * 4 * sizeof(int), MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP(!bdfs, mpi_errno, MPI_ERR_OTHER, "**nomem");

if (MPIR_Process.local_rank == 0) {
/* Setup the device fds */
mpl_err = MPL_ze_init_device_fds(&num_fds, fds);
mpl_err = MPL_ze_init_device_fds(&num_fds, fds, bdfs);
MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**mpl_ze_init_device_fds");

/* Send the fds to all other local processes */
for (r = 1; r < MPIR_Process.local_size; r++) {
for (i = 0; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_send(r, fds[0], bdfs, 4 * num_fds * sizeof(int));
for (i = 1; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_send(r, fds[i], NULL, 0);
}
}
} else {
/* Receive the fds from local process 0 */
for (i = 0; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_recv(0, fds, bdfs, 4 * num_fds * sizeof(int), 0);
for (i = 1; i < num_fds; i++) {
MPIDI_IPC_mpi_fd_recv(0, fds + i, NULL, 0, 0);
}
}

/* Save the fds in MPL */
MPL_ze_set_fds(num_fds, fds);
MPL_ze_set_fds(num_fds, fds, bdfs);

fn_exit:
return mpi_errno;
Expand Down
4 changes: 2 additions & 2 deletions src/mpl/include/mpl_gpu_ze.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ typedef volatile int MPL_gpu_event_t;
#define MPL_GPU_DEV_AFFINITY_ENV "ZE_AFFINITY_MASK"

/* ZE specific function */
int MPL_ze_init_device_fds(int *num_fds, int *device_fds);
void MPL_ze_set_fds(int num_fds, int *fds);
int MPL_ze_init_device_fds(int *num_fds, int *device_fds, int *bdfs);
void MPL_ze_set_fds(int num_fds, int *fds, int *bdfs);
void MPL_ze_ipc_remove_cache_handle(void *dptr);
int MPL_ze_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, int local_dev_id,
int use_shared_fd, MPL_gpu_ipc_mem_handle_t * ipc_handle);
Expand Down
207 changes: 188 additions & 19 deletions src/mpl/src/gpu/mpl_gpu_ze.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ typedef struct {
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
ze_pci_address_ext_t pci;
int pci_avail;
int sys_device_index;
#endif
} MPL_ze_device_entry_t;

Expand Down Expand Up @@ -103,8 +104,13 @@ static int *subdevice_map = NULL;
static uint32_t *subdevice_count = NULL;

/* For drmfd */
static int shared_device_fd_count = 0;
static int *shared_device_fds = NULL;
/* One physical GPU as tracked for drmfd: the device file descriptor plus the
 * PCI address (domain, bus, device, function) used to tell devices apart. */
typedef struct _physical_device_state {
    int fd;                     /* file descriptor of the device node */
    int domain;                 /* PCI domain */
    int bus;                    /* PCI bus */
    int device;                 /* PCI device */
    int function;               /* PCI function */
} physical_device_state;

/* Number of entries in physical_device_states; assigned from num_fds in
 * MPL_ze_set_fds(). */
static int physical_device_count = 0;
/* Table of physical devices (fd + PCI address), allocated in MPL_ze_set_fds()
 * and released in MPL_gpu_finalize(). It stays NULL until MPL_ze_set_fds() runs;
 * the IPC handle create/map/destroy paths test it against NULL before taking
 * the GEM-handle route. */
static physical_device_state *physical_device_states = NULL;

typedef struct {
const void *ptr;
Expand Down Expand Up @@ -193,6 +199,9 @@ static int gpu_mem_hook_init(void);
static int remove_ipc_handle_entry(MPL_ze_mapped_buffer_entry_t * cache_entry, int dev_id);
static int MPL_event_pool_add_new_pool(void);
static void MPL_event_pool_destroy(void);
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
static int search_physical_devices(ze_pci_address_ext_t pci);
#endif

/* For zeMemFree callbacks */
static gpu_free_hook_s *free_hook_chain = NULL;
Expand Down Expand Up @@ -568,6 +577,16 @@ int MPL_gpu_get_root_device(int dev_id)
return subdevice_map[dev_id];
}

/* Translate a logical dev_id into the index used for physical_device_states.
 * When PCI properties are compiled in and the device has been matched to a
 * physical device (sys_device_index != -1), that index is returned; in every
 * other case the subdevice_map entry for dev_id is returned. */
static int get_physical_device(int dev_id)
{
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
    const int phys_idx = device_states[dev_id].sys_device_index;

    if (phys_idx != -1) {
        return phys_idx;
    }
#endif
    return subdevice_map[dev_id];
}

/* Get dev_id from device handle */
MPL_STATIC_INLINE_PREFIX int device_to_dev_id(MPL_gpu_device_handle_t device)
{
Expand Down Expand Up @@ -975,6 +994,7 @@ static int gpu_ze_init_driver(void)
}
}
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
device_state->sys_device_index = -1;
ze_pci_ext_properties_t pci_property = {
.stype = ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES,
.pNext = NULL,
Expand Down Expand Up @@ -1212,11 +1232,11 @@ int MPL_gpu_finalize(void)
MPL_free(entry);
}

for (i = 0; i < shared_device_fd_count; ++i) {
close(shared_device_fds[i]);
for (i = 0; i < physical_device_count; ++i) {
close(physical_device_states[i].fd);
}

MPL_free(shared_device_fds);
MPL_free(physical_device_states);

gpu_free_hook_s *prev;
while (free_hook_chain) {
Expand Down Expand Up @@ -1460,7 +1480,7 @@ int MPL_gpu_ipc_handle_destroy(const void *ptr, MPL_pointer_attr_t * gpu_attr)
int dev_id;
uint64_t mem_id;

if (shared_device_fds != NULL) {
if (physical_device_states != NULL) {
MPL_ze_gem_hash_entry_t *entry = NULL;
HASH_FIND_PTR(gem_hash, &ptr, entry);
if (entry == NULL) {
Expand All @@ -1469,15 +1489,16 @@ int MPL_gpu_ipc_handle_destroy(const void *ptr, MPL_pointer_attr_t * gpu_attr)
}

HASH_DEL(gem_hash, entry);
MPL_free(entry);

/* close GEM handle */
for (int i = 0; i < entry->nhandles; i++) {
status = close_handle(shared_device_fds[entry->dev_id], entry->handles[i]);
status = close_handle(physical_device_states[entry->dev_id].fd, entry->handles[i]);
if (status) {
goto fn_fail;
}
}

MPL_free(entry);
}

if (likely(MPL_gpu_info.specialized_cache)) {
Expand Down Expand Up @@ -1803,7 +1824,8 @@ int MPL_gpu_query_is_same_dev(int global_dev1, int global_dev2)
return 0;

#ifdef ZE_PCI_PROPERTIES_EXT_NAME
if (MPL_gpu_get_root_device(local_dev1) == MPL_gpu_get_root_device(local_dev2))
if (get_physical_device(local_dev1) != -1 && get_physical_device(local_dev2) != -1 &&
get_physical_device(local_dev1) == get_physical_device(local_dev2))
return 1;

device_state1 = device_states + local_dev1;
Expand Down Expand Up @@ -2357,7 +2379,27 @@ ze_result_t ZE_APICALL zeMemFree(ze_context_handle_t hContext, void *dptr)

/* ZE-Specific Functions */

int MPL_ze_init_device_fds(int *num_fds, int *device_fds)
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
/* Extract the PCI address from a DRM by-path render node name.
 *
 * pcipath is expected to contain "DDDD:BB:DD.F-render" (for example
 * ".../pci-0000:03:00.0-render"); the four hexadecimal fields directly in
 * front of the "-render" suffix are parsed and stored through the output
 * pointers:
 *   domain - PCI domain
 *   b      - PCI bus
 *   d      - PCI device
 *   f      - PCI function
 *
 * The input is validated with assert() only: a path without the suffix, or
 * with fewer than 12 characters in front of it, aborts in debug builds. */
static void queryBDF(char *pcipath, int *domain, int *b, int *d, int *f)
{
    /* strlen("DDDD:BB:DD.F"): the text that must precede the suffix */
    enum { BDF_STR_LEN = 12 };
    const char *device_suffix = "-render";
    char *endptr = NULL;

    char *rdsPos = strstr(pcipath, device_suffix);
    assert(rdsPos);
    /* Stepping back BDF_STR_LEN characters must not leave the buffer;
     * forming a pointer before pcipath would be undefined behavior. */
    assert(rdsPos - pcipath >= BDF_STR_LEN);

    char *bdfstr = rdsPos - BDF_STR_LEN;

    /* Each strtol stops at the separator (':' or '.'); skip it and continue */
    *domain = (int) strtol(bdfstr, &endptr, 16);
    bdfstr = endptr + 1;
    *b = (int) strtol(bdfstr, &endptr, 16);
    bdfstr = endptr + 1;
    *d = (int) strtol(bdfstr, &endptr, 16);
    bdfstr = endptr + 1;
    *f = (int) strtol(bdfstr, &endptr, 16);
}

static int get_bdfs(int nfds, int *bdfs)
{
const char *device_directory = "/dev/dri/by-path";
const char *device_suffix = "-render";
Expand All @@ -2374,6 +2416,97 @@ int MPL_ze_init_device_fds(int *num_fds, int *device_fds)
goto fn_fail;
}

/* Search for all devices in the device directory */
while ((ent = readdir(dir)) != NULL) {
char dev_name[128];

if (ent->d_name[0] == '.') {
continue;
}

/* They must contain the device suffix */
if (strstr(ent->d_name, device_suffix) == NULL) {
continue;
}

strcpy(dev_name, device_directory);
strcat(dev_name, "/");
strcat(dev_name, ent->d_name);

int domain, b, d, f;
queryBDF(dev_name, &domain, &b, &d, &f);
bdfs[n++] = domain;
bdfs[n++] = b;
bdfs[n++] = d;
bdfs[n++] = f;
}

fn_exit:
return MPL_SUCCESS;
fn_fail:
return MPL_ERR_GPU_INTERNAL;
}

/* Look up the physical device whose (domain, bus, device, function) equals
 * the given PCI address. Returns its index in physical_device_states, or -1
 * when no entry matches. */
static int search_physical_devices(ze_pci_address_ext_t pci)
{
    int idx = 0;

    while (idx < physical_device_count) {
        const physical_device_state *cand = &physical_device_states[idx];

        if (cand->domain == pci.domain &&
            cand->bus == pci.bus && cand->device == pci.device && cand->function == pci.function) {
            return idx;
        }
        idx++;
    }

    return -1;
}
#endif

/* qsort() comparator for BDF tuples.
 *
 * Each argument points at four consecutive ints: domain, bus, device,
 * function. Tuples are ordered lexicographically in that field order.
 *
 * Returns a negative value, zero, or a positive value when the first tuple
 * sorts before, equal to, or after the second. The fields are compared
 * directly rather than subtracted, so the result cannot overflow for any
 * pair of int values. */
static int compareBDFFn(const void *_a, const void *_b)
{
    const int *a = (const int *) _a;
    const int *b = (const int *) _b;

    for (int i = 0; i < 4; i++) {
        if (a[i] != b[i])
            return (a[i] > b[i]) - (a[i] < b[i]);
    }

    return 0;
}

/* Sort, in place, an array of n BDF tuples (four ints per tuple) using
 * compareBDFFn as the ordering. */
static void sort_bdfs(int n, int *bdfs)
{
    const size_t tuple_bytes = sizeof(int) * 4;

    qsort(bdfs, n, tuple_bytes, compareBDFFn);
}

int MPL_ze_init_device_fds(int *num_fds, int *device_fds, int *bdfs)
{
const char *device_directory = "/dev/dri/by-path";
const char *device_suffix = "-render";
struct dirent *ent = NULL;
int n = 0;

#ifndef MPL_ENABLE_DRMFD
printf("Error> drmfd is not supported!");
goto fn_fail;
#endif

#ifdef ZE_PCI_PROPERTIES_EXT_NAME
if (device_fds) {
get_bdfs(*num_fds, bdfs);
/* sort bdf */
sort_bdfs(*num_fds, bdfs);
}
#endif

DIR *dir = opendir(device_directory);
if (dir == NULL) {
goto fn_fail;
}

/* Search for all devices in the device directory */
while ((ent = readdir(dir)) != NULL) {
char dev_name[128];
Expand All @@ -2393,24 +2526,59 @@ int MPL_ze_init_device_fds(int *num_fds, int *device_fds)

/* Open the device */
if (device_fds) {
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
int domain, b, d, f;
queryBDF(dev_name, &domain, &b, &d, &f);
/* find in bdfs */
n = -1;
for (int i = 0; i < *num_fds; i++) {
if (domain == bdfs[i * 4] && b == bdfs[i * 4 + 1] && d == bdfs[i * 4 + 2] &&
f == bdfs[i * 4 + 3])
n = i;
}
assert(n != -1);
#endif
device_fds[n] = open(dev_name, O_RDWR);
}

n++;
}

*num_fds = n;
if (device_fds == NULL)
*num_fds = n;

fn_exit:
return MPL_SUCCESS;
fn_fail:
return MPL_ERR_GPU_INTERNAL;
}

void MPL_ze_set_fds(int num_fds, int *fds)
void MPL_ze_set_fds(int num_fds, int *fds, int *bdfs)
{
shared_device_fds = fds;
shared_device_fd_count = num_fds;
physical_device_count = num_fds;
physical_device_states =
(physical_device_state *) MPL_malloc(num_fds * sizeof(physical_device_state),
MPL_MEM_OTHER);
for (int i = 0; i < num_fds; i++) {
physical_device_states[i].fd = fds[i];
#ifdef ZE_PCI_PROPERTIES_EXT_NAME
physical_device_states[i].domain = bdfs[4 * i];
physical_device_states[i].bus = bdfs[4 * i + 1];
physical_device_states[i].device = bdfs[4 * i + 2];
physical_device_states[i].function = bdfs[4 * i + 3];
#endif
}
MPL_free(fds);
MPL_free(bdfs);

#ifdef ZE_PCI_PROPERTIES_EXT_NAME
/* update sys_device_index for each logical device */
for (int d = 0; d < local_ze_device_count; d++) {
MPL_ze_device_entry_t *device_state = device_states + d;
device_state->sys_device_index = search_physical_devices(device_state->pci);
assert(device_state->sys_device_index != -1);
}
#endif
}

void MPL_ze_ipc_remove_cache_handle(void *dptr)
Expand Down Expand Up @@ -2481,13 +2649,14 @@ int MPL_ze_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, in
ZE_ERR_CHECK(ret);

h.nfds = nfds;
if (shared_device_fds != NULL) {
if (physical_device_states != NULL) {
if (use_shared_fd) {
int shared_dev_id = MPL_gpu_get_root_device(local_dev_id);
int shared_dev_id = get_physical_device(local_dev_id);
for (int i = 0; i < nfds; i++) {
/* convert dma_buf fd to GEM handle */
memcpy(&fds[i], &ze_ipc_handle[i], sizeof(int));
status = fd_to_handle(shared_device_fds[shared_dev_id], fds[i], &handles[i]);
status =
fd_to_handle(physical_device_states[shared_dev_id].fd, fds[i], &handles[i]);
if (status) {
goto fn_fail;
}
Expand Down Expand Up @@ -2558,10 +2727,10 @@ int MPL_ze_ipc_handle_map(MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle, int is_shar
h = mpl_ipc_handle->data;
nfds = h.nfds;

if (shared_device_fds != NULL) {
if (physical_device_states != NULL) {
/* convert GEM handle to fd */
for (int i = 0; i < nfds; i++) {
status = handle_to_fd(shared_device_fds[h.dev_id], h.fds[i], &fds[i]);
status = handle_to_fd(physical_device_states[h.dev_id].fd, h.fds[i], &fds[i]);
if (status) {
goto fn_fail;
}
Expand Down

0 comments on commit fcd7c8e

Please sign in to comment.