core/verbs: Add native API support for DMA-buf regions #9343

Merged · 3 commits · Sep 21, 2023
39 changes: 37 additions & 2 deletions fabtests/component/dmabuf-rdma/fi-mr-reg-xe.c
@@ -71,7 +71,7 @@ static struct fid_domain *domain;
static struct fid_ep *ep;
static struct fid_av *av;
static struct fid_cq *cq;
-static struct fid_mr *mr;
+static struct fid_mr *mr, *dmabuf_mr;

static void *buf;
static size_t buf_size = 65536;
@@ -171,10 +171,43 @@ void reg_mr(void)
return;
}

void reg_dmabuf_mr(void)
{
struct fi_mr_dmabuf dmabuf = {
.fd = xe_get_buf_fd(buf),
.offset = 0,
.len = buf_size,
};
struct fi_mr_attr mr_attr = {
.dmabuf = &dmabuf,
.access = FI_REMOTE_READ | FI_REMOTE_WRITE,
.requested_key = 2,
};

CHECK_ERROR(fi_mr_regattr(domain, &mr_attr, FI_MR_DMABUF, &dmabuf_mr));

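/* Providers with the FI_MR_ENDPOINT mr_mode require the MR to be bound
 * to the endpoint and explicitly enabled before it can be used.
 */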
if (fi->domain_attr->mr_mode & FI_MR_ENDPOINT) {
CHECK_ERROR(fi_mr_bind(dmabuf_mr, &ep->fid, 0));
CHECK_ERROR(fi_mr_enable(dmabuf_mr));
}

printf("mr %p, buf %p, rkey 0x%lx, len %zd\n",
dmabuf_mr, buf, fi_mr_key(dmabuf_mr), buf_size);

err_out:
return;
}

void dereg_mr(void)
{
-fi_close((fid_t)mr);
+if (mr)
+fi_close(&mr->fid);
}

void dereg_dmabuf_mr(void)
{
if (dmabuf_mr)
fi_close(&dmabuf_mr->fid);
}

static void finalize_ofi(void)
@@ -262,8 +295,10 @@ int main(int argc, char *argv[])
init_buf();
init_ofi();
reg_mr();
reg_dmabuf_mr();

dereg_mr();
dereg_dmabuf_mr();
finalize_ofi();
free_buf();

1 change: 1 addition & 0 deletions include/rdma/fabric.h
@@ -166,6 +166,7 @@ typedef struct fid *fid_t;
#define FI_MATCH_COMPLETE (1ULL << 31)

#define FI_PEER_TRANSFER (1ULL << 36)
#define FI_MR_DMABUF (1ULL << 40)
#define FI_AV_USER_ID (1ULL << 41)
#define FI_PEER (1ULL << 43)
#define FI_XPU_TRIGGER (1ULL << 44)
11 changes: 10 additions & 1 deletion include/rdma/fi_domain.h
@@ -135,8 +135,17 @@ static inline int fi_hmem_ze_device(int driver_index, int device_index)
return driver_index << 16 | device_index;
}

struct fi_mr_dmabuf {
int fd;
uint64_t offset;
size_t len;
};

struct fi_mr_attr {
-const struct iovec *mr_iov;
+union {
+const struct iovec *mr_iov;
+const struct fi_mr_dmabuf *dmabuf;
+};
size_t iov_count;
uint64_t access;
uint64_t offset;
57 changes: 51 additions & 6 deletions man/fi_mr.3.md
@@ -235,6 +235,12 @@ The following apply to memory registration.
of this bit typically implies that peers must exchange addressing data
prior to initiating any RMA or atomic operation.

For memory regions that are registered using FI_MR_DMABUF, the starting
'virtual address' of the DMA-buf region is equal to the offset field
of struct fi_mr_dmabuf that was specified through the registration call.
That is, a DMA-buf region starts at 'virtual address' 0, with offset
being used as the starting address of the registration.
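
To make this addressing rule concrete, here is a hedged sketch (not part of
the patch) of a peer targeting such a region with an RMA write; the endpoint,
local buffer and descriptor, peer address, and remote key are assumed to come
from the usual setup:

```c
#include <rdma/fabric.h>
#include <rdma/fi_rma.h>

/* Sketch: the remote MR was registered with fi_mr_dmabuf.offset ==
 * dmabuf_offset, so valid target addresses run from dmabuf_offset to
 * dmabuf_offset + len, independent of any process virtual address.
 */
static ssize_t write_to_dmabuf_mr(struct fid_ep *ep, const void *local_buf,
				  size_t len, void *local_desc, fi_addr_t peer,
				  uint64_t dmabuf_offset, uint64_t rkey)
{
	return fi_write(ep, local_buf, len, local_desc, peer,
			dmabuf_offset /* target 'virtual address' */,
			rkey, NULL);
}
```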

*FI_MR_ALLOCATED*
: When set, all registered memory regions must be backed by physical
memory pages at the time the registration call is made.
@@ -507,7 +513,10 @@ into calls as function parameters.

```c
struct fi_mr_attr {
-const struct iovec *mr_iov;
+union {
+const struct iovec *mr_iov;
+const struct fi_mr_dmabuf *dmabuf;
+};
size_t iov_count;
uint64_t access;
uint64_t offset;
@@ -528,8 +537,34 @@
```
## mr_iov

-This is an IO vector of addresses that will represent a single memory
-region. The number of entries in the iovec is specified by iov_count.
+This is an IO vector of virtual addresses and their lengths that represent
+a single memory region. The number of entries in the iovec is specified
+by iov_count.

## dmabuf

This structure references a DMA-buf backed device memory region. This field
is only usable if the application has successfully requested support for
FI_HMEM and the FI_MR_DMABUF flag is passed into the memory registration
call. DMA-buf regions are file-based references to device memory. Such
regions are identified through the struct fi_mr_dmabuf.

```c
struct fi_mr_dmabuf {
int fd;
uint64_t offset;
size_t len;
};
```
The fd is the file descriptor associated with the DMA-buf region. The
offset is the offset into the region where the memory registration should
begin, and len is the size of the region to register, starting at that
offset. The selection of dmabuf over the mr_iov field is controlled by
specifying the FI_MR_DMABUF flag.

DMA-buf registrations are used to share device memory between a given
device and the fabric NIC. They do not require that the device memory
be mmap'ed into the virtual address space of the calling process.
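
Putting the fields together, a minimal registration sketch, assuming a
domain opened with FI_HMEM support and a hypothetical dmabuf_fd exported by
the device runtime (this mirrors the fabtest change above rather than
defining anything new):

```c
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Register bytes [offset, offset + len) of an exported DMA-buf. */
static int reg_dmabuf_region(struct fid_domain *domain, int dmabuf_fd,
			     uint64_t offset, size_t len, struct fid_mr **mr)
{
	struct fi_mr_dmabuf dmabuf = {
		.fd = dmabuf_fd,
		.offset = offset,
		.len = len,
	};
	struct fi_mr_attr attr = {
		.dmabuf = &dmabuf,	/* union member selected by the flag */
		.access = FI_REMOTE_READ | FI_REMOTE_WRITE,
	};

	return fi_mr_regattr(domain, &attr, FI_MR_DMABUF, mr);
}
```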

## iov_count

@@ -631,7 +666,8 @@ requested the FI_HMEM capability.

*FI_HMEM_SYSTEM*
: Uses standard operating system calls and libraries, such as malloc,
-calloc, realloc, mmap, and free.
+calloc, realloc, mmap, and free. When iface is set to FI_HMEM_SYSTEM,
+the device field (described below) is ignored.

*FI_HMEM_CUDA*
: Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost,
@@ -652,7 +688,8 @@ requested the FI_HMEM capability.

## device
Reserved 64 bits for device identifier if using non-standard HMEM interface.
-This field is ignore unless the iface field is valid.
+This field is ignored unless the iface field is valid. When iface is set,
+the meaning of the device field is determined by the value specified
+through iface.

*cuda*
: For FI_HMEM_CUDA, this is equivalent to CUdevice (int).
@@ -731,14 +768,22 @@ The following flag may be specified to any memory registration call.
device is specified by the fi_mr_attr fields iface and device. This refers
to memory regions that were allocated using a device API AllocDevice call
(as opposed to using the host allocation or unified/shared memory allocation).
This flag is only usable for domains opened with FI_HMEM capability support.

*FI_HMEM_HOST_ALLOC*
: This flag indicates that the memory is owned by the host only. Whether it
can be accessed by the device is implementation dependent. The fi_mr_attr
field iface is still used to identify the device API, but the field device
is ignored. This refers to memory regions that were allocated using a device
API AllocHost call (as opposed to using malloc-like host allocation,
-unified/shared memory allocation, or AllocDevice).
+unified/shared memory allocation, or AllocDevice). This flag is only usable
+for domains opened with FI_HMEM capability support.

*FI_MR_DMABUF*
: This flag indicates that the memory region to be registered is a DMA-buf
backed region. When set, the region is specified through the dmabuf field of
the fi_mr_attr structure. This flag is only usable for domains opened with
FI_HMEM capability support.

Review comment (Contributor):
Should this also be a capability bit so that the application can check if
this is supported by the provider?

Reply (Member, Author):
Maybe, and I had the same thought. Can the current FI_HMEM work, which would
require providers that support FI_HMEM to also support dmabuf registration?
I'd like to avoid capability bit overload when possible. But I'm not sure if
FI_HMEM is sufficient.

I was thinking that a provider could call mmap() to map the region into the
VA space, then treat the dmabuf the same as a VA registered region. The
provider may need to discover what hmem API was used to allocate the region.
Is there an issue if the provider were forced down this path? IOW, dmabuf
support becomes an optimization that the provider could leverage; otherwise
it needs to fall back to whatever its standard HMEM support is.
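
To make the fallback idea in this exchange concrete, here is a hedged
application-side sketch (not something the patch implements): attempt the
DMA-buf registration first and, if the provider reports no support, mmap()
the fd — assuming the exporter allows mmap and offset is page aligned — and
register the mapping as an ordinary iovec. A real implementation may also
need to set attr->iface and attr->device so the provider knows the memory
type:

```c
#include <sys/mman.h>
#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

static int reg_dmabuf_or_fallback(struct fid_domain *domain,
				  struct fi_mr_attr *attr, /* attr->dmabuf set */
				  struct fid_mr **mr)
{
	int ret = fi_mr_regattr(domain, attr, FI_MR_DMABUF, mr);
	if (ret != -FI_ENOSYS && ret != -FI_EOPNOTSUPP)
		return ret;

	/* Fallback: map the dmabuf into this process's VA space and
	 * register the mapping like any other virtually addressed region.
	 */
	void *va = mmap(NULL, attr->dmabuf->len, PROT_READ | PROT_WRITE,
			MAP_SHARED, attr->dmabuf->fd,
			(off_t) attr->dmabuf->offset);
	if (va == MAP_FAILED)
		return -FI_EIO;

	struct iovec iov = { .iov_base = va, .iov_len = attr->dmabuf->len };
	attr->mr_iov = &iov;	/* overwrites the dmabuf union member */
	attr->iov_count = 1;
	return fi_mr_regattr(domain, attr, 0, mr);
}
```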

# MEMORY DOMAINS

71 changes: 55 additions & 16 deletions prov/verbs/src/verbs_mr.c
@@ -58,9 +58,8 @@ static struct fi_ops vrb_mr_fi_ops = {
};

#if VERBS_HAVE_DMABUF_MR
-static inline
-struct ibv_mr *vrb_mr_ibv_reg_dmabuf_mr(struct ibv_pd *pd, const void *buf,
-size_t len, int vrb_access)
+static struct ibv_mr *vrb_reg_ze_dmabuf(struct ibv_pd *pd, const void *buf,
+size_t len, int vrb_access)
{
void *handle;
void *base;
@@ -115,10 +114,10 @@ struct ibv_mr *vrb_mr_ibv_reg_dmabuf_mr(struct ibv_pd *pd, const void *buf,
}
#endif

-static inline
-int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *buf,
-size_t len, void *context, enum fi_hmem_iface iface,
-uint64_t device)
+static int
+vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *buf,
+size_t len, void *context, enum fi_hmem_iface iface,
+uint64_t device, uint64_t flags)
{
if (!ofi_hmem_is_initialized(iface)) {
FI_WARN(&vrb_prov, FI_LOG_MR,
@@ -137,9 +136,13 @@ int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *buf,
vrb_access |= VRB_ACCESS_ON_DEMAND;

#if VERBS_HAVE_DMABUF_MR
-if (iface == FI_HMEM_ZE && vrb_gl_data.dmabuf_support)
-md->mr = vrb_mr_ibv_reg_dmabuf_mr(md->domain->pd, buf, len,
-vrb_access);
+if (flags & FI_MR_DMABUF)
+md->mr = ibv_reg_dmabuf_mr(md->domain->pd, (uintptr_t) buf,
+len, (uintptr_t) buf,
+(int) device, vrb_access);
+else if (iface == FI_HMEM_ZE && vrb_gl_data.dmabuf_support)
+md->mr = vrb_reg_ze_dmabuf(md->domain->pd, buf, len,
+vrb_access);
else
#endif
md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len,
@@ -208,7 +211,7 @@ vrb_mr_nocache_reg(struct vrb_domain *domain, const void *buf, size_t len,
enum fi_hmem_iface iface, uint64_t device)
{
struct vrb_mem_desc *md;
-int ret;
+int vrb_access, ret;

md = calloc(1, sizeof(*md));
if (OFI_UNLIKELY(!md))
@@ -217,9 +220,14 @@
md->domain = domain;
md->mr_fid.fid.ops = &vrb_mr_fi_ops;

-ret = vrb_mr_reg_common(md, vrb_mr_ofi2ibv_access(access, md->domain),
-buf, len, context, iface, device);
-if (OFI_UNLIKELY(ret))
+vrb_access = vrb_mr_ofi2ibv_access(access, md->domain);
+if (flags & FI_MR_DMABUF)
+ret = vrb_mr_reg_common(md, vrb_access, (void *) (uintptr_t) offset,
+len, context, iface, device, flags);
+else
+ret = vrb_mr_reg_common(md, vrb_access, buf, len, context,
+iface, device, flags);
+if (ret)
goto err;

*mr = &md->mr_fid;
@@ -229,6 +237,34 @@
return ret;
}

#if VERBS_HAVE_DMABUF_MR

static int
vrb_reg_dmabuf(struct vrb_domain *domain, const struct fi_mr_attr *attr,
uint64_t flags, struct fid_mr **mr)
{
int ret;

if (!vrb_gl_data.dmabuf_support)
return -FI_ENOSYS;

/* Skip trying to cache the MR. We don't have a mechanism
* to monitor or verify if the region is invalidated.
*/
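/* Note: for FI_MR_DMABUF, the dmabuf offset is carried in the buf/offset
 * parameter and the dmabuf fd in the device parameter; vrb_mr_reg_common()
 * passes them through to ibv_reg_dmabuf_mr().
 */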
ret = vrb_mr_nocache_reg(domain, NULL, attr->dmabuf->len, attr->access,
attr->dmabuf->offset, attr->requested_key,
flags, mr, attr->context, attr->iface,
(uint64_t) attr->dmabuf->fd);

return ret;
}

#else /* VERBS_HAVE_DMABUF_MR */

#define vrb_reg_dmabuf(domain, attr, flags, mr) -FI_ENOSYS

#endif

static int vrb_mr_cache_close(fid_t fid)
{
struct vrb_mem_desc *md =
@@ -246,7 +282,7 @@ static struct fi_ops vrb_mr_cache_fi_ops = {
};

int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
struct ofi_mr_entry *entry)
{
struct vrb_mem_desc *md = (struct vrb_mem_desc *) entry->data;

@@ -258,7 +294,7 @@ int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC |
IBV_ACCESS_REMOTE_READ, entry->info.iov.iov_base,
entry->info.iov.iov_len, NULL, entry->info.iface,
-entry->info.device);
+entry->info.device, 0);
}

void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache,
@@ -377,6 +413,9 @@ static int vrb_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
domain->util_domain.info_domain_caps, attr,
&cur_abi_attr);

if (flags & FI_MR_DMABUF)
return vrb_reg_dmabuf(domain, &cur_abi_attr, flags, mr);

if ((flags & FI_HMEM_HOST_ALLOC) && (cur_abi_attr.iface == FI_HMEM_ZE))
cur_abi_attr.device.ze = -1;
