Skip to content

Commit

Permalink
Merge pull request #59 from gpudirect/new_memops
Browse files Browse the repository at this point in the history
support for extended memops
  • Loading branch information
e-ago authored May 30, 2018
2 parents a737a02 + 4e430de commit 205432d
Show file tree
Hide file tree
Showing 22 changed files with 3,482 additions and 858 deletions.
11 changes: 5 additions & 6 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ AM_CPPFLAGS += -D__STDC_FORMAT_MACROS

#AM_LDFLAGS = -L$(CUDA_PATH)/lib64
LIBGDSTOOLS = @LIBGDSTOOLS@
LIBNVTX = @LIBNVTX@

lib_LTLIBRARIES = src/libgdsync.la

Expand All @@ -34,21 +35,19 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop
noinst_PROGRAMS = tests/rstest

tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp
tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi -lcuda -lcudart
tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart

tests_rstest_SOURCES = tests/rstest.cpp
tests_rstest_LDADD =

#tests_gds_poll_lat_CFLAGS = -DUSE_PROF -DUSE_PERF -I/ivylogin/home/drossetti/work/p4/cuda_a/sw/dev/gpu_drv/cuda_a/drivers/gpgpu/cuda/inc
#tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu tests/perfutil.c tests/perf.c
tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi -lcuda -lcudart
tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart

tests_gds_sanity_SOURCES = tests/gds_sanity.c tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi -lcuda -lcudart
tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart

tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lcuda -lcudart
tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart


SUFFIXES= .cu
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ This prototype has been tested on RHEL 6.x and Ubuntu 16.04
## Build

The Git repository does not include the generated autotools files. The first time, the directory
must be configured by running autogen.sh
must be configured by running:
```shell
$ autoreconf -if
```

As an example, the build.sh script is provided. You should modify it
according to the desired destination paths as well as the location
Expand Down
7 changes: 0 additions & 7 deletions autogen.sh

This file was deleted.

44 changes: 36 additions & 8 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,55 @@

[ ! -d config ] && mkdir -p config

[ ! -e configure ] && ./autogen.sh
[ ! -e configure ] && autoreconf -fv -i

[ ! -d build ] && mkdir build

cd build
echo "PREFIX=$PREFIX"
echo "CUDADRV=$CUDADRV"
echo "CUDATK=$CUDATK"
echo "CUDA=$CUDA"
echo "MPI_HOME=$MPI_HOME"

if [ ! -e Makefile ]; then
echo "configuring..."
WITHCUDADRV=
EXTRA=
if [ "x$CUDADRV" != "x" ]; then
WITHCUDADRV="--with-cuda-driver=${CUDADRV}"
EXTRA+=" --with-cuda-driver=${CUDADRV}"
fi
# Select the CUDA toolkit location: CUDATK takes precedence over CUDA.
if [ "x$CUDATK" != "x" ]; then
    EXTRA+=" --with-cuda-toolkit=$CUDATK"
elif [ "x$CUDA" != "x" ]; then
    EXTRA+=" --with-cuda-toolkit=$CUDA"
else
    # A bare `exit` would return the status of the preceding echo (0),
    # hiding the failure from callers; exit with a non-zero status instead.
    echo "ERROR: CUDA toolkit path not passed"
    exit 1
fi
if [ "x$OFED" != "x" ]; then
echo "picking OFED libibverbs from $OFED"
EXTRA+=" --with-libibverbs=$OFED"
else
echo "WARNING: assuming IB Verbs is installed in /usr"
EXTRA+=" --with-libibverbs=/usr"
fi

if [ "x$GDRCOPY" != "x" ]; then
EXTRA+=" --with-gdrcopy=$GDRCOPY"
else
echo "WARNING: assuming GDRcopy is installed in /usr"
EXTRA+=" --with-gdrcopy=/usr"
fi

EXTRA+=" --enable-test"
EXTRA+=" --enable-extended-memops"
#EXTRA+=" --enable-nvtx"
#EXTRA="$EXTRA --with-gdstools=$PREFIX"

../configure \
--prefix=$PREFIX \
--with-libibverbs=$PREFIX \
$WITHCUDADRV \
--with-cuda-toolkit=$CUDA \
--with-gdrcopy=$PREFIX \
--with-mpi=$MPI_HOME \
--enable-test
$EXTRA

fi

Expand Down
48 changes: 40 additions & 8 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ AM_CONDITIONAL(TEST_ENABLE, test x$enable_test = xyes)
AC_ARG_ENABLE(
[extended-memops],
[AC_HELP_STRING([--enable-extended-memops],
[Enable support for CUDA 9.0 MemOps (default=no)])],
[Enable support for CUDA 10.0 MemOps (default=no)])],
[enable_ext_memops=$enableval],
[enable_ext_memops=no])
AM_CONDITIONAL(EXT_MEMOPS, test x$enable_ext_memops = xyes)
Expand Down Expand Up @@ -106,12 +106,30 @@ AC_ARG_WITH(cuda-driver,
)

dnl Specify GPU Arch
AC_ARG_ENABLE(gpu-arch,
AC_HELP_STRING([--enable-gpu-arch=arch], [ Set GPU arch: sm_20, sm_21, sm_30, sm_35, sm_50, sm_52 (default: sm_35)]),
[ gpu_arch=${enableval} ],
[ gpu_arch="sm_35" ]
AC_ARG_WITH(
[gpu-arch],
AC_HELP_STRING([--with-gpu-arch=arch],
[ Set GPU arch: sm_30, sm_35, sm_50, sm_52, sm_60, sm_70 (default: sm_35)]),
[ gpu_arch=${withval} ],
[ gpu_arch="sm_35" ]
)

dnl Optional NVTX support, off unless --enable-nvtx is given.
dnl When enabled, -DUSE_NVTX is appended to CPPFLAGS and LIBNVTX is set to
dnl -lnvToolsExt; when disabled, LIBNVTX is left empty.
AC_ARG_ENABLE(
[nvtx],
[AC_HELP_STRING([--enable-nvtx],
[Use NVTX profiling extensions (default=no)])],
[enable_nvtx=$enableval],
[enable_nvtx=no])
if test x$enable_nvtx = x || test x$enable_nvtx = xno; then
want_nvtx=no
LIBNVTX=
else
want_nvtx=yes
CPPFLAGS="$CPPFLAGS -DUSE_NVTX"
LIBNVTX=-lnvToolsExt
AC_MSG_NOTICE([Enabling use of NVTX])
dnl NOTE(review): AC_SUBST is written inside the else branch only; it is
dnl conventionally placed at top level. Confirm that the disabled case
dnl still substitutes an empty LIBNVTX as intended.
AC_SUBST(LIBNVTX)
fi

dnl Checks for programs
AC_PROG_CC
Expand Down Expand Up @@ -169,11 +187,25 @@ dnl Checks for CUDA >= 8.0
AC_CHECK_LIB(cuda, cuStreamBatchMemOp, [],
AC_MSG_ERROR([cuStreamBatchMemOp() not found. libgdsync requires CUDA 8.0 or later.]))

dnl Checks for CUDA >= 9.0
AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_VALUE_64], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_MEM_OP_WAIT_VALUE_64], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_WAIT_VALUE_NOR], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR], [], [], [[#include <cuda.h>]])

dnl Checks for CUDA >= 9.2
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES], [], [], [[#include <cuda.h>]])

if test x$enable_ext_memops = xyes; then
AC_CHECK_DECLS([CU_STREAM_MEM_OP_INLINE_COPY], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_MEMORY], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_MEM_OP_MEMORY_BARRIER], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_VALUE_64], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_BATCH_MEMOP_RELAXED_ORDERING], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WRITE_MEMORY], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEMORY_BARRIER], [], [], [[#include <cuda.h>]])
AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_MAXIMUM_STREAM_WRITE_MEMORY_SIZE], [], [], [[#include <cuda.h>]])
fi

AC_CONFIG_FILES([Makefile libgdsync.spec])
Expand Down
64 changes: 58 additions & 6 deletions include/gdsync/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#endif

#define GDS_API_MAJOR_VERSION 2U
#define GDS_API_MINOR_VERSION 1U
#define GDS_API_MINOR_VERSION 2U
#define GDS_API_VERSION ((GDS_API_MAJOR_VERSION << 16) | GDS_API_MINOR_VERSION)
#define GDS_API_VERSION_COMPATIBLE(v) \
( ((((v) & 0xffff0000U) >> 16) == GDS_API_MAJOR_VERSION) && \
Expand Down Expand Up @@ -120,6 +120,7 @@ typedef enum gds_memory_type {
GDS_MEMORY_MASK = 0x7
} gds_memory_type_t;

/*
 * Note: the flag enums from here on are OR-ed together with a
 * gds_memory_type_t value, so their bits must not overlap with
 * GDS_MEMORY_MASK (0x7).
 */
typedef enum gds_wait_flags {
        GDS_WAIT_POST_FLUSH = (1 << 3),
} gds_wait_flags_t;
Expand All @@ -128,14 +129,15 @@ typedef enum gds_write_flags {
GDS_WRITE_PRE_BARRIER = 1<<4,
} gds_write_flags_t;

typedef enum gds_immcopy_flags {
GDS_IMMCOPY_POST_TAIL_FLUSH = 1<<4,
} gds_immcopy_flags_t;
/** Flag bits accepted in the flags field of gds_write_memory_t. */
typedef enum gds_write_memory_flags {
        GDS_WRITE_MEMORY_POST_BARRIER_SYS = (1 << 4), /**< add a trailing memory barrier to the memory write operation */
} gds_write_memory_flags_t;

/** Memory-barrier flag bits; each value occupies its own bit (bits 4 to 7). */
typedef enum gds_membar_flags {
        GDS_MEMBAR_FLUSH_REMOTE = (1 << 4),
        GDS_MEMBAR_DEFAULT      = (1 << 5),
        GDS_MEMBAR_SYS          = (1 << 6),
        GDS_MEMBAR_MLX5         = (1 << 7),
} gds_membar_flags_t;

enum {
Expand Down Expand Up @@ -244,7 +246,32 @@ int gds_prepare_write_value32(gds_write_value32_t *desc, uint32_t *ptr, uint32_t



typedef enum gds_tag { GDS_TAG_SEND, GDS_TAG_WAIT, GDS_TAG_WAIT_VALUE32, GDS_TAG_WRITE_VALUE32 } gds_tag_t;
/**
 * Descriptor for a staged copy operation.
 *
 * The src buffer can be reused after the API call.
 */
typedef struct gds_write_memory {
        uint8_t *dest;       /**< destination of the copy */
        const uint8_t *src;  /**< source of the copy */
        size_t count;        /**< presumably the number of bytes to copy - TODO confirm */
        int flags;           /**< takes gds_memory_type_t | gds_write_memory_flags_t */
} gds_write_memory_t;

/**
 * \brief Prepare a write-memory descriptor.
 *
 * NOTE(review): the implementation is not visible from this header;
 * presumably desc is filled in from the remaining arguments - confirm.
 *
 * \param desc  - descriptor to prepare
 * \param dest  - destination of the copy
 * \param src   - source of the copy
 * \param count - presumably the size of the copy in bytes - confirm
 * \param flags - gds_memory_type_t | gds_write_memory_flags_t
 */
int gds_prepare_write_memory(gds_write_memory_t *desc, uint8_t *dest, const uint8_t *src, size_t count, int flags);



/** Values for the tag field of gds_descriptor_t, which selects the active union member. */
typedef enum gds_tag {
        GDS_TAG_SEND = 0,
        GDS_TAG_WAIT,
        GDS_TAG_WAIT_VALUE32,
        GDS_TAG_WRITE_VALUE32,
        GDS_TAG_WRITE_MEMORY
} gds_tag_t;

typedef struct gds_descriptor {
gds_tag_t tag; /**< selector for union below */
Expand All @@ -253,14 +280,39 @@ typedef struct gds_descriptor {
gds_wait_request_t *wait;
gds_wait_value32_t wait32;
gds_write_value32_t write32;
gds_write_memory_t writemem;
};
} gds_descriptor_t;

/**
 * \brief Post descriptors for peer QPs synchronized to the specified CUDA stream
 *
 * \param stream  - CUDA stream the descriptors are synchronized to
 * \param n_descs - presumably the number of entries in descs - confirm
 * \param descs   - descriptors to post
 * \param flags   - must be 0
 *
 * \return
 * 0 on success, or a standard errno error code
 */
int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs, int flags);

/**
 * \brief CPU-synchronous post descriptors for peer QPs
 *
 * \param n_descs - presumably the number of entries in descs - confirm
 * \param descs   - descriptors to post
 * \param flags   - must be 0
 *
 * \return
 * 0 on success, or a standard errno error code
 *
 * Notes:
 * - This API might have higher overhead than issuing multiple ibv_post_send.
 * - It is provided for convenience only.
 * - It might fail if trying to access CUDA device memory pointers
 */
int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags);


/*
* Local variables:
* c-indent-level: 8
Expand Down
11 changes: 0 additions & 11 deletions include/gdsync/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,4 @@ typedef struct gds_mem_desc {
int gds_alloc_mapped_memory(gds_mem_desc_t *desc, size_t size, int flags);
int gds_free_mapped_memory(gds_mem_desc_t *desc);

// flags is combination of gds_memory_type and gds_poll_flags
int gds_stream_post_poll_dword(CUstream stream, uint32_t *ptr, uint32_t magic, gds_wait_cond_flag_t cond_flag, int flags);
int gds_stream_post_poke_dword(CUstream stream, uint32_t *ptr, uint32_t value, int flags);
int gds_stream_post_inline_copy(CUstream stream, void *ptr, void *src, size_t nbytes, int flags);
int gds_stream_post_polls_and_pokes(CUstream stream,
size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[],
size_t n_pokes, uint32_t *poke_ptrs[], uint32_t poke_values[], int poke_flags[]);
int gds_stream_post_polls_and_immediate_copies(CUstream stream,
size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[],
size_t n_imms, void *imm_ptrs[], void *imm_datas[], size_t imm_bytes[], int imm_flags[]);

GDS_END_DECLS
Loading

0 comments on commit 205432d

Please sign in to comment.