diff --git a/Makefile.am b/Makefile.am index 16b048b..8cbd26d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -8,6 +8,7 @@ AM_CPPFLAGS += -D__STDC_FORMAT_MACROS #AM_LDFLAGS = -L$(CUDA_PATH)/lib64 LIBGDSTOOLS = @LIBGDSTOOLS@ +LIBNVTX = @LIBNVTX@ lib_LTLIBRARIES = src/libgdsync.la @@ -34,21 +35,19 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop noinst_PROGRAMS = tests/rstest tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp -tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi -lcuda -lcudart +tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart tests_rstest_SOURCES = tests/rstest.cpp tests_rstest_LDADD = -#tests_gds_poll_lat_CFLAGS = -DUSE_PROF -DUSE_PERF -I/ivylogin/home/drossetti/work/p4/cuda_a/sw/dev/gpu_drv/cuda_a/drivers/gpgpu/cuda/inc -#tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu tests/perfutil.c tests/perf.c tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi -lcuda -lcudart +tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart tests_gds_sanity_SOURCES = tests/gds_sanity.c tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi -lcuda -lcudart +tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lcuda -lcudart +tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart SUFFIXES= .cu diff --git a/README.md b/README.md index 0811917..037fee8 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,10 @@ This prototype has been tested on RHEL 6.x and Ubuntu 16.04 ## Build Git repository does not include autotools files. The first time the directory -must be configured by running autogen.sh +must be configured by running: +```shell +$ autoreconf -if +``` As an example, the build.sh script is provided. You should modify it according to the desired destination paths as well as the location diff --git a/autogen.sh b/autogen.sh deleted file mode 100755 index b0ad85e..0000000 --- a/autogen.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -exE - -aclocal -I config -libtoolize --force --copy -autoheader -automake --foreign --add-missing --copy -autoconf diff --git a/build.sh b/build.sh index 8e320a0..8371b7d 100755 --- a/build.sh +++ b/build.sh @@ -2,27 +2,55 @@ [ ! -d config ] && mkdir -p config -[ ! -e configure ] && ./autogen.sh +[ ! -e configure ] && autoreconf -fv -i [ ! -d build ] && mkdir build cd build +echo "PREFIX=$PREFIX" echo "CUDADRV=$CUDADRV" +echo "CUDATK=$CUDATK" +echo "CUDA=$CUDA" +echo "MPI_HOME=$MPI_HOME" + if [ ! -e Makefile ]; then echo "configuring..." 
- WITHCUDADRV= + EXTRA= if [ "x$CUDADRV" != "x" ]; then - WITHCUDADRV="--with-cuda-driver=${CUDADRV}" + EXTRA+=" --with-cuda-driver=${CUDADRV}" + fi + if [ "x$CUDATK" != "x" ]; then + EXTRA+=" --with-cuda-toolkit=$CUDATK" + elif [ "x$CUDA" != "x" ]; then + EXTRA+=" --with-cuda-toolkit=$CUDA" + else + echo "ERROR: CUDA toolkit path not passed" + exit + fi + if [ "x$OFED" != "x" ]; then + echo "picking OFED libibverbs from $OFED" + EXTRA+=" --with-libibverbs=$OFED" + else + echo "WARNING: assuming IB Verbs is installed in /usr" + EXTRA+=" --with-libibverbs=/usr" fi + if [ "x$GDRCOPY" != "x" ]; then + EXTRA+=" --with-gdrcopy=$GDRCOPY" + else + echo "WARNING: assuming GDRcopy is installed in /usr" + EXTRA+=" --with-gdrcopy=/usr" + fi + + EXTRA+=" --enable-test" + EXTRA+=" --enable-extended-memops" + #EXTRA+=" --enable-nvtx" + #EXTRA="$EXTRA --with-gdstools=$PREFIX" + ../configure \ --prefix=$PREFIX \ - --with-libibverbs=$PREFIX \ - $WITHCUDADRV \ - --with-cuda-toolkit=$CUDA \ - --with-gdrcopy=$PREFIX \ --with-mpi=$MPI_HOME \ - --enable-test + $EXTRA fi diff --git a/configure.ac b/configure.ac index 24fc885..f4482d8 100644 --- a/configure.ac +++ b/configure.ac @@ -27,7 +27,7 @@ AM_CONDITIONAL(TEST_ENABLE, test x$enable_test = xyes) AC_ARG_ENABLE( [extended-memops], [AC_HELP_STRING([--enable-extended-memops], - [Enable support for CUDA 9.0 MemOps (default=no)])], + [Enable support for CUDA 10.0 MemOps (default=no)])], [enable_ext_memops=$enableval], [enable_ext_memops=no]) AM_CONDITIONAL(EXT_MEMOPS, test x$enable_ext_memops = xyes) @@ -106,12 +106,30 @@ AC_ARG_WITH(cuda-driver, ) dnl Specify GPU Arch -AC_ARG_ENABLE(gpu-arch, - AC_HELP_STRING([--enable-gpu-arch=arch], [ Set GPU arch: sm_20, sm_21, sm_30, sm_35, sm_50, sm_52 (default: sm_35)]), - [ gpu_arch=${enableval} ], - [ gpu_arch="sm_35" ] +AC_ARG_WITH( + [gpu-arch], + AC_HELP_STRING([--with-gpu-arch=arch], + [ Set GPU arch: sm_30, sm_35, sm_50, sm_52, sm_60, sm_70 (default: sm_35)]), + [ gpu_arch=${withval} ], + [ gpu_arch="sm_35" ] ) +AC_ARG_ENABLE( + [nvtx], + [AC_HELP_STRING([--enable-nvtx], + [Use NVTX profiling extensions (default=no)])], + [enable_nvtx=$enableval], + [enable_nvtx=no]) +if test x$enable_nvtx = x || test x$enable_nvtx = xno; then + want_nvtx=no + LIBNVTX= +else + want_nvtx=yes + CPPFLAGS="$CPPFLAGS -DUSE_NVTX" + LIBNVTX=-lnvToolsExt + AC_MSG_NOTICE([Enabling use of NVTX]) + AC_SUBST(LIBNVTX) +fi dnl Checks for programs AC_PROG_CC @@ -169,11 +187,25 @@ dnl Checks for CUDA >= 8.0 AC_CHECK_LIB(cuda, cuStreamBatchMemOp, [], AC_MSG_ERROR([cuStreamBatchMemOp() not found. 
libgdsync requires CUDA 8.0 or later.])) +dnl Checks for CUDA >= 9.0 +AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_VALUE_64], [], [], [[#include <cuda.h>]]) +AC_CHECK_DECLS([CU_STREAM_MEM_OP_WAIT_VALUE_64], [], [], [[#include <cuda.h>]]) +AC_CHECK_DECLS([CU_STREAM_WAIT_VALUE_NOR], [], [], [[#include <cuda.h>]]) +AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS], [], [], [[#include <cuda.h>]]) +AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR], [], [], [[#include <cuda.h>]]) + +dnl Checks for CUDA >= 9.2 +AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS], [], [], [[#include <cuda.h>]]) +AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES], [], [], [[#include <cuda.h>]]) + if test x$enable_ext_memops = xyes; then - AC_CHECK_DECLS([CU_STREAM_MEM_OP_INLINE_COPY], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_MEMORY], [], [], [[#include <cuda.h>]]) AC_CHECK_DECLS([CU_STREAM_MEM_OP_MEMORY_BARRIER], [], [], [[#include <cuda.h>]]) - AC_CHECK_DECLS([CU_STREAM_MEM_OP_WRITE_VALUE_64], [], [], [[#include <cuda.h>]]) - AC_CHECK_DECLS([CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_BATCH_MEMOP_RELAXED_ORDERING], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WRITE_MEMORY], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEMORY_BARRIER], [], [], [[#include <cuda.h>]]) + AC_CHECK_DECLS([CU_DEVICE_ATTRIBUTE_MAXIMUM_STREAM_WRITE_MEMORY_SIZE], [], [], [[#include <cuda.h>]]) fi AC_CONFIG_FILES([Makefile libgdsync.spec]) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index b174517..0c24a25 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -33,7 +33,7 @@ #endif #define GDS_API_MAJOR_VERSION 2U -#define GDS_API_MINOR_VERSION 1U +#define GDS_API_MINOR_VERSION 2U #define GDS_API_VERSION ((GDS_API_MAJOR_VERSION << 16) | GDS_API_MINOR_VERSION) #define GDS_API_VERSION_COMPATIBLE(v) \ ( ((((v) & 0xffff0000U) >> 16) == GDS_API_MAJOR_VERSION) && \ @@ -120,6 +120,7 @@ typedef enum gds_memory_type { GDS_MEMORY_MASK = 0x7 } gds_memory_type_t; +// Note: those flags below must not overlap with gds_memory_type_t typedef enum gds_wait_flags { GDS_WAIT_POST_FLUSH = 1<<3, } gds_wait_flags_t; @@ -128,14 +129,15 @@ typedef enum gds_write_flags { GDS_WRITE_PRE_BARRIER = 1<<4, } gds_write_flags_t; -typedef enum gds_immcopy_flags { - GDS_IMMCOPY_POST_TAIL_FLUSH = 1<<4, -} gds_immcopy_flags_t; +typedef enum gds_write_memory_flags { + GDS_WRITE_MEMORY_POST_BARRIER_SYS = 1<<4, /*< add a trailing memory barrier to the memory write operation */ +} gds_write_memory_flags_t; typedef enum gds_membar_flags { GDS_MEMBAR_FLUSH_REMOTE = 1<<4, GDS_MEMBAR_DEFAULT = 1<<5, GDS_MEMBAR_SYS = 1<<6, + GDS_MEMBAR_MLX5 = 1<<7, } gds_membar_flags_t; enum { @@ -244,7 +246,32 @@ int gds_prepare_write_value32(gds_write_value32_t *desc, uint32_t *ptr, uint32_t -typedef enum gds_tag { GDS_TAG_SEND, GDS_TAG_WAIT, GDS_TAG_WAIT_VALUE32, GDS_TAG_WRITE_VALUE32 } gds_tag_t; +/** + * Represents a staged copy operation + * the src buffer can be reused after the API call + */ + +typedef struct gds_write_memory { + uint8_t *dest; + const uint8_t *src; + size_t count; + int flags; // takes gds_memory_type_t | gds_write_memory_flags_t +} gds_write_memory_t; + +/** + * flags: gds_memory_type_t | gds_write_memory_flags_t + */ +int gds_prepare_write_memory(gds_write_memory_t *desc, uint8_t *dest, const uint8_t *src, size_t count, int flags); + + + +typedef enum gds_tag { +
GDS_TAG_SEND, + GDS_TAG_WAIT, + GDS_TAG_WAIT_VALUE32, + GDS_TAG_WRITE_VALUE32, + GDS_TAG_WRITE_MEMORY +} gds_tag_t; typedef struct gds_descriptor { gds_tag_t tag; /**< selector for union below */ @@ -253,14 +280,39 @@ typedef struct gds_descriptor { gds_wait_request_t *wait; gds_wait_value32_t wait32; gds_write_value32_t write32; + gds_write_memory_t writemem; }; } gds_descriptor_t; /** - * flags: must be 0 + * \brief: post descriptors for peer QPs synchronized to the specified CUDA stream + * + * \param flags - must be 0 + * + * \return + * 0 on success or one standard errno error + * */ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs, int flags); +/** + * \brief: CPU-synchronous post descriptors for peer QPs + * + * + * \param flags - must be 0 + * + * \return + * 0 on success or one standard errno error + * + * + * Notes: + * - This API might have higher overhead than issuing multiple ibv_post_send. + * - It is provided for convenience only. + * - It might fail if trying to access CUDA device memory pointers + */ +int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags); + + /* * Local variables: * c-indent-level: 8 diff --git a/include/gdsync/tools.h b/include/gdsync/tools.h index 7ecbb89..bc3b13d 100644 --- a/include/gdsync/tools.h +++ b/include/gdsync/tools.h @@ -46,15 +46,4 @@ typedef struct gds_mem_desc { int gds_alloc_mapped_memory(gds_mem_desc_t *desc, size_t size, int flags); int gds_free_mapped_memory(gds_mem_desc_t *desc); -// flags is combination of gds_memory_type and gds_poll_flags -int gds_stream_post_poll_dword(CUstream stream, uint32_t *ptr, uint32_t magic, gds_wait_cond_flag_t cond_flag, int flags); -int gds_stream_post_poke_dword(CUstream stream, uint32_t *ptr, uint32_t value, int flags); -int gds_stream_post_inline_copy(CUstream stream, void *ptr, void *src, size_t nbytes, int flags); -int gds_stream_post_polls_and_pokes(CUstream stream, - size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[], - size_t n_pokes, uint32_t *poke_ptrs[], uint32_t poke_values[], int poke_flags[]); -int gds_stream_post_polls_and_immediate_copies(CUstream stream, - size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[], - size_t n_imms, void *imm_ptrs[], void *imm_datas[], size_t imm_bytes[], int imm_flags[]); - GDS_END_DECLS diff --git a/src/apis.cpp b/src/apis.cpp index 666f96d..551b5bf 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -48,7 +48,9 @@ #include "objs.hpp" #include "utils.hpp" #include "memmgr.hpp" -//#include "mem.hpp" +#include "utils.hpp" +#include "archutils.h" +#include "mlnxutils.h" //----------------------------------------------------------------------------- @@ -471,63 +473,26 @@ int gds_prepare_write_value32(gds_write_value32_t *desc, uint32_t *ptr, uint32_t //----------------------------------------------------------------------------- -int gds_stream_post_poll_dword(CUstream stream, uint32_t *ptr, uint32_t magic, gds_wait_cond_flag_t cond_flags, int flags) -{ - int retcode = 0; - gds_op_list_t param; - retcode = gds_fill_poll(param, ptr, magic, cond_flags, flags); - if (retcode) { - gds_err("error in fill_poll\n"); - goto out; - } - retcode = gds_stream_batch_ops(stream, param, 0); - if (retcode) { - gds_err("error in batch_ops\n"); - goto out; - } -out: - return retcode; -} - -//----------------------------------------------------------------------------- - -int gds_stream_post_poke_dword(CUstream stream, 
uint32_t *ptr, uint32_t value, int flags) -{ - int retcode = 0; - gds_op_list_t param; - retcode = gds_fill_poke(param, ptr, value, flags); - if (retcode) { - gds_err("error in fill_poke\n"); - goto out; - } - retcode = gds_stream_batch_ops(stream, param, 0); - if (retcode) { - gds_err("error in batch_ops\n"); - goto out; - } -out: - return retcode; -} - -//----------------------------------------------------------------------------- - -int gds_stream_post_inline_copy(CUstream stream, void *ptr, void *src, size_t nbytes, int flags) +int gds_prepare_write_memory(gds_write_memory_t *desc, uint8_t *dest, const uint8_t *src, size_t count, int flags) { - int retcode = 0; - gds_op_list_t param; - - retcode = gds_fill_inlcpy(param, ptr, src, nbytes, flags); - if (retcode) { - gds_err("error in fill_poke\n"); + int ret = 0; + assert(desc); + if (!is_valid(memtype_from_flags(flags))) { + gds_err("invalid memory type in flags\n"); + ret = EINVAL; goto out; } - retcode = gds_stream_batch_ops(stream, param, 0); - if (retcode) { - gds_err("error in batch_ops\n"); + if (flags & ~(GDS_WRITE_MEMORY_POST_BARRIER_SYS|GDS_MEMORY_MASK)) { + gds_err("invalid flags\n"); + ret = EINVAL; goto out; } + desc->dest = dest; + desc->src = src; + desc->count = count; + desc->flags = flags; out: - return retcode; + return ret; } //----------------------------------------------------------------------------- @@ -570,6 +535,7 @@ static int get_wait_info(size_t n_descs, gds_descriptor_t *descs, size_t &n_wait case GDS_TAG_SEND: case GDS_TAG_WAIT_VALUE32: case GDS_TAG_WRITE_VALUE32: + case GDS_TAG_WRITE_MEMORY: break; default: gds_err("invalid tag\n"); @@ -579,9 +545,10 @@ static int get_wait_info(size_t n_descs, gds_descriptor_t *descs, size_t &n_wait return ret; } -static size_t calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs) +static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem_ops) { - size_t n_mem_ops = 0; + int ret = 0; + n_mem_ops = 0; size_t i; for(i = 0; i < n_descs; ++i) { gds_descriptor_t *desc = descs + i; @@ -594,13 +561,15 @@ static size_t calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs) break; case GDS_TAG_WAIT_VALUE32: case GDS_TAG_WRITE_VALUE32: + case GDS_TAG_WRITE_MEMORY: n_mem_ops += 2; // ditto break; default: gds_err("invalid tag\n"); + ret = EINVAL; } } - return n_mem_ops; + return ret; } int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs, int flags) @@ -613,9 +582,21 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ size_t n_waits = 0; size_t last_wait = 0; bool move_flush = false; + gds_peer *peer = NULL; + gds_op_list_t params; + - n_mem_ops = calc_n_mem_ops(n_descs, descs); - get_wait_info(n_descs, descs, n_waits, last_wait); + ret = calc_n_mem_ops(n_descs, descs, n_mem_ops); + if (ret) { + gds_err("error %d in calc_n_mem_ops\n", ret); + goto out; + } + + ret = get_wait_info(n_descs, descs, n_waits, last_wait); + if (ret) { + gds_err("error %d in get_wait_info\n", ret); + goto out; + } gds_dbg("n_descs=%zu n_waits=%zu n_mem_ops=%zu\n", n_descs, n_waits, n_mem_ops); @@ -626,13 +607,11 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ } // alternatively, remove flush for wait is next op is a wait too - gds_peer *peer = peer_from_stream(stream); + peer = peer_from_stream(stream); if (!peer) { return EINVAL; } - gds_op_list_t params; - for(i = 0; i < n_descs; ++i) { gds_descriptor_t *desc = descs + i; switch(desc->tag) { @@ -649,8 +628,10 @@ int 
gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ case GDS_TAG_WAIT: { gds_wait_request_t *wreq = desc->wait; int flags = 0; - if (move_flush && i != last_wait) + if (move_flush && i != last_wait) { + gds_dbg("discarding FLUSH!\n"); flags = GDS_POST_OPS_DISCARD_WAIT_FLUSH; + } retcode = gds_post_ops(peer, wreq->peek.entries, wreq->peek.storage, params, flags); if (retcode) { gds_err("error %d in gds_post_ops\n", retcode); @@ -675,16 +656,25 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ goto out; } break; + case GDS_TAG_WRITE_MEMORY: + retcode = gds_fill_inlcpy(params, desc->writemem.dest, desc->writemem.src, desc->writemem.count, desc->writemem.flags); + if (retcode) { + gds_err("error %d in gds_fill_inlcpy\n", retcode); + ret = retcode; + goto out; + } + break; default: - gds_err("invalid tag for %d entry\n", i); + gds_err("invalid tag for %zu entry\n", i); ret = EINVAL; goto out; break; } } - retcode = gds_stream_batch_ops(stream, params, 0); + retcode = gds_stream_batch_ops(peer, stream, params, 0); if (retcode) { - gds_err("error in batch_ops\n"); + gds_err("error %d in gds_stream_batch_ops\n", retcode); + ret = retcode; goto out; } @@ -694,6 +684,147 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ //----------------------------------------------------------------------------- +int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) +{ + size_t i; + int ret = 0; + int retcode = 0; + for(i = 0; i < n_descs; ++i) { + gds_descriptor_t *desc = descs + i; + switch(desc->tag) { + case GDS_TAG_SEND: { + gds_dbg("desc[%zu] SEND\n", i); + gds_send_request_t *sreq = desc->send; + retcode = gds_post_ops_on_cpu(sreq->commit.entries, sreq->commit.storage, flags); + if (retcode) { + gds_err("error %d in gds_post_ops_on_cpu\n", retcode); + ret = retcode; + goto out; + } + break; + } + case GDS_TAG_WAIT: { + gds_dbg("desc[%zu] WAIT\n", i); + gds_wait_request_t *wreq = desc->wait; + retcode = gds_post_ops_on_cpu(wreq->peek.entries, wreq->peek.storage, flags); + if (retcode) { + gds_err("error %d in gds_post_ops_on_cpu\n", retcode); + ret = retcode; + goto out; + } + break; + } + case GDS_TAG_WAIT_VALUE32: { + gds_dbg("desc[%zu] WAIT_VALUE32\n", i); + uint32_t *ptr = desc->wait32.ptr; + uint32_t value = desc->wait32.value; + bool flush = false; + if (desc->wait32.flags & GDS_WAIT_POST_FLUSH) { + gds_err("GDS_WAIT_POST_FLUSH flag is not supported yet\n"); + flush = true; + } + gds_memory_type_t mem_type = (gds_memory_type_t)(desc->wait32.flags & GDS_MEMORY_MASK); + switch(mem_type) { + case GDS_MEMORY_GPU: + // dereferencing ptr may fail if ptr points to CUDA device memory + case GDS_MEMORY_HOST: + case GDS_MEMORY_IO: + break; + default: + gds_err("invalid memory type 0x%02x in WAIT_VALUE32\n", mem_type); + ret = EINVAL; + goto out; + break; + } + bool done = false; + do { + uint32_t data = gds_atomic_get(ptr); + switch(desc->wait32.cond_flags) { + case GDS_WAIT_COND_GEQ: + done = ((int32_t)data - (int32_t)value >= 0); + break; + case GDS_WAIT_COND_EQ: + done = (data == value); + break; + case GDS_WAIT_COND_AND: + done = (data & value); + break; + case GDS_WAIT_COND_NOR: + done = ~(data | value); + break; + default: + gds_err("invalid condition flags 0x%02x in WAIT_VALUE32\n", desc->wait32.cond_flags); + goto out; + break; + } + if (done) + break; + // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric + arch_cpu_relax(); + } while(true); + break; + } + case 
GDS_TAG_WRITE_VALUE32: { + gds_dbg("desc[%zu] WRITE_VALUE32\n", i); + uint32_t *ptr = desc->write32.ptr; + uint32_t value = desc->write32.value; + gds_memory_type_t mem_type = (gds_memory_type_t)(desc->write32.flags & GDS_MEMORY_MASK); + switch(mem_type) { + case GDS_MEMORY_GPU: + // dereferencing ptr may fail if ptr points to CUDA device memory + case GDS_MEMORY_HOST: + case GDS_MEMORY_IO: + break; + default: + gds_err("invalid memory type 0x%02x in WRITE_VALUE32\n", mem_type); + ret = EINVAL; + goto out; + break; + } + bool barrier = (desc->write32.flags & GDS_WRITE_PRE_BARRIER); + if (barrier) + wmb(); + gds_atomic_set(ptr, value); + break; + } + case GDS_TAG_WRITE_MEMORY: { + void *dest = desc->writemem.dest; + const void *src = desc->writemem.src; + size_t nbytes = desc->writemem.count; + bool barrier = (desc->writemem.flags & GDS_WRITE_MEMORY_POST_BARRIER_SYS); + gds_memory_type_t mem_type = memtype_from_flags(desc->writemem.flags); + gds_dbg("desc[%zu] WRITE_MEMORY dest=%p src=%p len=%zu memtype=%02x\n", i, dest, src, nbytes, mem_type); + switch(mem_type) { + case GDS_MEMORY_GPU: + case GDS_MEMORY_HOST: + memcpy(dest, src, nbytes); + break; + case GDS_MEMORY_IO: + assert(nbytes % sizeof(uint64_t)); + assert(((unsigned long)dest & 0x7) == 0); + gds_bf_copy((uint64_t*)dest, (uint64_t*)src, nbytes); + break; + default: + assert(!"invalid mem type"); + break; + } + if (barrier) + wmb(); + break; + } + default: + gds_err("invalid tag for %zu entry\n", i); + ret = EINVAL; + goto out; + break; + } + } +out: + return ret; +} + +//----------------------------------------------------------------------------- + /* * Local variables: * c-indent-level: 8 diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 4639ae8..32c9569 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -45,6 +45,12 @@ //----------------------------------------------------------------------------- +void gds_assert(const char *cond, const char *file, unsigned line, const char *function) +{ + gds_err("assertion '%s' failed in %s at %s:%d\n", cond, function, file, line); + abort(); +} + int gds_dbg_enabled() { static int gds_dbg_is_enabled = -1; @@ -81,49 +87,34 @@ int gds_flusher_enabled() //----------------------------------------------------------------------------- // detect Async APIs -#if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_VALUE_64 -#warning "enabling write_64 extensions" -#define GDS_HAS_WRITE64 1 -#else -#define GDS_HAS_WRITE64 0 -#endif - -#if HAVE_DECL_CU_STREAM_MEM_OP_INLINE_COPY -#warning "enabling inline_copy extensions" +#if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_MEMORY +#warning "enabling WRITE_MEMORY extension" #define GDS_HAS_INLINE_COPY 1 #else #define GDS_HAS_INLINE_COPY 0 #endif -#if HAVE_DECL_CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK -#warning "enabling consistency extensions" -#define GDS_HAS_WEAK_API 1 +#if HAVE_DECL_CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING #else -#define GDS_HAS_WEAK_API 0 +#define CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING 0x1 #endif #if HAVE_DECL_CU_STREAM_MEM_OP_MEMORY_BARRIER -#warning "enabling memory barrier extensions" +#warning "enabling memory barrier extension" #define GDS_HAS_MEMBAR 1 #else #define GDS_HAS_MEMBAR 0 #endif -// TODO: use corret value +// TODO: use correct value // TODO: make it dependent upon the particular GPU const size_t GDS_GPU_MAX_INLINE_SIZE = 256; //----------------------------------------------------------------------------- -// Note: inlcpy has precedence -//bool gds_has_inlcpy = GDS_HAS_INLINE_COPY; -//bool gds_has_write64 = GDS_HAS_WRITE64; -//bool 
gds_has_weak_consistency = GDS_HAS_WEAK_API; -//bool gds_has_membar = GDS_HAS_MEMBAR; - -//----------------------------------------------------------------------------- +// Note: these are default overrides, i.e. allow to disable/enable the features +// in case the GPU supports them -// BUG: this feature is GPU device dependent static bool gds_enable_write64() { static int gds_disable_write64 = -1; @@ -135,25 +126,38 @@ static bool gds_enable_write64() gds_disable_write64 = 0; gds_dbg("GDS_DISABLE_WRITE64=%d\n", gds_disable_write64); } - // BUG: need to query device property for write64 capability - //return GDS_HAS_WRITE64 && !gds_disable_write64; - return false; + return !gds_disable_write64; +} + +static bool gds_enable_wait_nor() +{ + static int gds_disable_wait_nor = -1; + if (-1 == gds_disable_wait_nor) { + const char *env = getenv("GDS_DISABLE_WAIT_NOR"); + if (env) + gds_disable_wait_nor = !!atoi(env); + else + gds_disable_wait_nor = 1; // WAR for issue #68 + gds_dbg("GDS_DISABLE_WAIT_NOR=%d\n", gds_disable_wait_nor); + } + return !gds_disable_wait_nor; } static bool gds_enable_inlcpy() { static int gds_disable_inlcpy = -1; if (-1 == gds_disable_inlcpy) { - const char *env = getenv("GDS_DISABLE_INLINECOPY"); + const char *env = getenv("GDS_DISABLE_WRITEMEMORY"); if (env) gds_disable_inlcpy = !!atoi(env); else gds_disable_inlcpy = 0; - gds_dbg("GDS_DISABLE_INLINECOPY=%d\n", gds_disable_inlcpy); + gds_dbg("GDS_DISABLE_WRITEMEMORY=%d\n", gds_disable_inlcpy); } - return GDS_HAS_INLINE_COPY && !gds_disable_inlcpy; + return !gds_disable_inlcpy; } +// simulate 64-bits writes with inlcpy static bool gds_simulate_write64() { static int gds_simulate_write64 = -1; @@ -166,12 +170,12 @@ static bool gds_simulate_write64() gds_dbg("GDS_SIMULATE_WRITE64=%d\n", gds_simulate_write64); if (gds_simulate_write64 && gds_enable_inlcpy()) { - gds_warn("INLINECOPY has priority over SIMULATE_WRITE64, using the former\n"); + gds_warn("WRITEMEMORY has priority over SIMULATE_WRITE64, using the former\n"); gds_simulate_write64 = 0; } } - // simulate 64-bits writes with inlcpy - return GDS_HAS_INLINE_COPY && gds_simulate_write64; + + return gds_simulate_write64; } static bool gds_enable_membar() @@ -185,21 +189,23 @@ static bool gds_enable_membar() gds_disable_membar = 0; gds_dbg("GDS_DISABLE_MEMBAR=%d\n", gds_disable_membar); } - return GDS_HAS_MEMBAR && !gds_disable_membar; + return !gds_disable_membar; } static bool gds_enable_weak_consistency() { static int gds_disable_weak_consistency = -1; if (-1 == gds_disable_weak_consistency) { - const char *env = getenv("GDS_DISABLE_WEAK_CONSISTENCY"); - if (env) - gds_disable_weak_consistency = !!atoi(env); - else - gds_disable_weak_consistency = 1; // disabled by default - gds_dbg("GDS_DISABLE_WEAK_CONSISTENCY=%d\n", gds_disable_weak_consistency); + const char *env = getenv("GDS_DISABLE_WEAK_CONSISTENCY"); + if (env) + gds_disable_weak_consistency = !!atoi(env); + else + gds_disable_weak_consistency = 1; // disabled by default + gds_dbg("GDS_DISABLE_WEAK_CONSISTENCY=%d\n", gds_disable_weak_consistency); } - return GDS_HAS_WEAK_API && !gds_disable_weak_consistency; + gds_dbg("gds_disable_weak_consistency=%d\n", + gds_disable_weak_consistency); + return !gds_disable_weak_consistency; } //----------------------------------------------------------------------------- @@ -244,20 +250,22 @@ void gds_dump_param(CUstreamBatchMemOpParams *param) break; #if GDS_HAS_INLINE_COPY - case CU_STREAM_MEM_OP_INLINE_COPY: + case CU_STREAM_MEM_OP_WRITE_MEMORY: gds_info("INLINECOPY 
addr:%p alias:%p src:%p len=%zu flags:%08x\n", - (void*)param->inlineCopy.address, - (void*)param->inlineCopy.alias, - (void*)param->inlineCopy.srcData, - param->inlineCopy.byteCount, - param->inlineCopy.flags); + (void*)param->writeMemory.address, + (void*)param->writeMemory.alias, + (void*)param->writeMemory.src, + param->writeMemory.byteCount, + param->writeMemory.flags); break; #endif #if GDS_HAS_MEMBAR case CU_STREAM_MEM_OP_MEMORY_BARRIER: - gds_info("MEMORY_BARRIER flags:%08x\n", - param->memoryBarrier.flags); + gds_info("MEMORY_BARRIER scope:%02x set_before=%02x set_after=%02x\n", + param->memoryBarrier.scope, + param->memoryBarrier.set_before, + param->memoryBarrier.set_after); break; #endif default: @@ -291,25 +299,33 @@ int gds_fill_membar(gds_op_list_t &ops, int flags) param.operation, param.flushRemoteWrites.flags); } else { + param.operation = CU_STREAM_MEM_OP_MEMORY_BARRIER; + if (flags & GDS_MEMBAR_MLX5) { + param.memoryBarrier.set_before = CU_STREAM_MEMORY_BARRIER_OP_WRITE_32 | CU_STREAM_MEMORY_BARRIER_OP_WRITE_64; + } else { + param.memoryBarrier.set_before = CU_STREAM_MEMORY_BARRIER_OP_ALL; + } + param.memoryBarrier.set_after = CU_STREAM_MEMORY_BARRIER_OP_ALL; if (flags & GDS_MEMBAR_DEFAULT) { - param.operation = CU_STREAM_MEM_OP_MEMORY_BARRIER; - param.memoryBarrier.flags = CU_STREAM_MEMORY_BARRIER_DEFAULT; + param.memoryBarrier.scope = CU_STREAM_MEMORY_BARRIER_SCOPE_GPU; } else if (flags & GDS_MEMBAR_SYS) { - param.operation = CU_STREAM_MEM_OP_MEMORY_BARRIER; - param.memoryBarrier.flags = CU_STREAM_MEMORY_BARRIER_SYS; + param.memoryBarrier.scope = CU_STREAM_MEMORY_BARRIER_SCOPE_SYS; } else { gds_err("error, unsupported membar\n"); retcode = EINVAL; goto out; } - gds_dbg("op=%d membar flags=%08x\n", + gds_dbg("op=%d membar scope:%02x set_before=%02x set_after=%02x\n", param.operation, - param.memoryBarrier.flags); + param.memoryBarrier.scope, + param.memoryBarrier.set_before, + param.memoryBarrier.set_after); + } ops.push_back(param); out: #else - gds_err("error, inline copy is unsupported\n"); + gds_err("unsupported feature\n"); retcode = EINVAL; #endif return retcode; @@ -317,7 +333,7 @@ int gds_fill_membar(gds_op_list_t &ops, int flags) //----------------------------------------------------------------------------- -static int gds_fill_inlcpy(gds_op_list_t &ops, CUdeviceptr addr, void *data, size_t n_bytes, int flags) +static int gds_fill_inlcpy(gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags) { int retcode = 0; #if GDS_HAS_INLINE_COPY @@ -328,32 +344,32 @@ static int gds_fill_inlcpy(gds_op_list_t &ops, CUdeviceptr addr, void *data, siz assert(n_bytes > 0); // TODO: // verify address requirements of inline_copy - //assert((((unsigned long)addr) & 0x3) == 0); - bool need_barrier = (flags & GDS_IMMCOPY_POST_TAIL_FLUSH ) ? true : false; + bool need_barrier = (flags & GDS_WRITE_MEMORY_POST_BARRIER_SYS) ? 
true : false; - param.operation = CU_STREAM_MEM_OP_INLINE_COPY; - param.inlineCopy.byteCount = n_bytes; - param.inlineCopy.srcData = data; - param.inlineCopy.address = dev_ptr; - param.inlineCopy.flags = CU_STREAM_INLINE_COPY_NO_MEMORY_BARRIER; + param.operation = CU_STREAM_MEM_OP_WRITE_MEMORY; + param.writeMemory.byteCount = n_bytes; + param.writeMemory.src = const_cast(data); + param.writeMemory.address = dev_ptr; if (need_barrier) - param.inlineCopy.flags = 0; + param.writeMemory.flags = CU_STREAM_WRITE_MEMORY_FENCE_SYS; + else + param.writeMemory.flags = CU_STREAM_WRITE_MEMORY_NO_MEMORY_BARRIER; gds_dbg("op=%d addr=%p src=%p size=%zd flags=%08x\n", param.operation, - (void*)param.inlineCopy.address, - param.inlineCopy.srcData, - param.inlineCopy.byteCount, - param.inlineCopy.flags); + (void*)param.writeMemory.address, + param.writeMemory.src, + param.writeMemory.byteCount, + param.writeMemory.flags); ops.push_back(param); #else - gds_err("error, inline copy is unsupported\n"); + gds_err("unsupported feature\n"); retcode = EINVAL; #endif return retcode; } -int gds_fill_inlcpy(gds_op_list_t &ops, void *ptr, void *data, size_t n_bytes, int flags) +int gds_fill_inlcpy(gds_op_list_t &ops, void *ptr, const void *data, size_t n_bytes, int flags) { int retcode = 0; CUdeviceptr dev_ptr = 0; @@ -373,8 +389,8 @@ int gds_fill_inlcpy(gds_op_list_t &ops, void *ptr, void *data, size_t n_bytes, i static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) { #if GDS_HAS_INLINE_COPY - assert(param.operation == CU_STREAM_MEM_OP_INLINE_COPY); - param.inlineCopy.flags &= ~CU_STREAM_INLINE_COPY_NO_MEMORY_BARRIER; + assert(param->operation == CU_STREAM_MEM_OP_WRITE_MEMORY); + param->writeMemory.flags &= ~CU_STREAM_WRITE_MEMORY_NO_MEMORY_BARRIER; #endif } @@ -562,19 +578,20 @@ int gds_fill_poll(gds_op_list_t &ops, uint32_t *ptr, uint32_t magic, int cond_fl //----------------------------------------------------------------------------- -int gds_stream_batch_ops(CUstream stream, gds_op_list_t &ops, int flags) +int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t &ops, int flags) { CUresult result = CUDA_SUCCESS; int retcode = 0; unsigned int cuflags = 0; -#if GDS_HAS_WEAK_API - cuflags |= gds_enable_weak_consistency() ? 
CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK : 0; -#endif size_t nops = ops.size(); - gds_dbg("nops=%d flags=%08x\n", nops, cuflags); - if (nops > 256) { - gds_warn("batch size might be too big, stream=%p nops=%d flags=%08x\n", stream, nops, flags); + if (gds_enable_weak_consistency() && peer->has_weak) + cuflags |= CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING; + + gds_dbg("nops=%zu flags=%08x\n", nops, cuflags); + + if (nops > peer->max_batch_size) { + gds_warn("batch size might be too big, stream=%p nops=%zu flags=%08x\n", stream, nops, flags); //return EINVAL; } @@ -584,13 +601,13 @@ int gds_stream_batch_ops(CUstream stream, gds_op_list_t &ops, int flags) cuGetErrorString(result, &err_str); gds_err("got CUDA result %d (%s) while submitting batch operations:\n", result, err_str); retcode = gds_curesult_to_errno(result); - gds_err("nops=%d flags=%08x\n", nops, cuflags); + gds_err("retcode=%d nops=%zu flags=%08x, dumping memops:\n", retcode, nops, cuflags); gds_dump_params(ops); goto out; } if (gds_enable_dump_memops()) { - gds_info("nops=%d flags=%08x\n", nops, cuflags); + gds_info("nops=%zu flags=%08x\n", nops, cuflags); gds_dump_params(ops); } @@ -640,7 +657,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis //size_t n_ops = ops.size(); CUstreamBatchMemOpParams param; - gds_dbg("n_ops=%zu idx=%d\n", n_ops); + gds_dbg("n_ops=%zu\n", n_ops); if (!peer->has_memops) { gds_err("CUDA MemOps are required\n"); @@ -698,11 +715,11 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis int flags = 0; if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); - flags = GDS_MEMBAR_DEFAULT; + flags = GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; } else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); - flags = GDS_MEMBAR_SYS; + flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; } else { gds_err("unsupported fence combination\n"); @@ -825,7 +842,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH)) flags |= GDS_WAIT_POST_FLUSH; - gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%"PRIx32"\n", dev_ptr, data); + gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%"PRIx32" type=%"PRIx32"\n", dev_ptr, data, (uint32_t)op->type); switch(op->type) { case IBV_EXP_PEER_OP_POLL_NOR_DWORD: @@ -899,7 +916,7 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_ } } - retcode = gds_stream_batch_ops(stream, ops, 0); + retcode = gds_stream_batch_ops(peer, stream, ops, 0); if (retcode) { gds_err("error %d in stream_batch_ops\n", retcode); goto out; @@ -911,17 +928,18 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_ //----------------------------------------------------------------------------- -static int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op) +int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) { int retcode = 0; size_t n = 0; - - for (; op && n < n_descs; op = op->next, ++n) { + gds_dbg("n_ops=%zu op=%p post_flags=0x%x\n", n_ops, op, post_flags); + for (; op && n < n_ops; op = op->next, ++n) { //int flags = 0; - gds_dbg("op[%zu] type:%08x\n", n, op->type); + gds_dbg("op[%zu]=%p\n", n, op); + //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); switch(op->type) { case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags); + gds_dbg("FENCE flags=%"PRIu64"\n", op->wr.fence.fence_flags); uint32_t fence_op = 
(op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); @@ -957,30 +975,58 @@ static int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op) uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); uint32_t data = op->wr.dword_va.data; // A || B || C || E + gds_dbg("STORE_DWORD ptr=%p data=%08"PRIx32"\n", ptr, data); ACCESS_ONCE(*ptr) = data; - gds_dbg("%p <- %08x\n", ptr, data); break; } case IBV_EXP_PEER_OP_STORE_QWORD: { uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); uint64_t data = op->wr.qword_va.data; + gds_dbg("STORE_QWORD ptr=%p data=%016"PRIx64"\n", ptr, data); ACCESS_ONCE(*ptr) = data; - gds_dbg("%p <- %016"PRIx64"\n", ptr, data); break; } case IBV_EXP_PEER_OP_COPY_BLOCK: { uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); uint64_t *src = (uint64_t*)op->wr.copy_op.src; size_t n_bytes = op->wr.copy_op.len; + gds_dbg("COPY_BLOCK ptr=%p src=%p len=%zu\n", ptr, src, n_bytes); gds_bf_copy(ptr, src, n_bytes); - gds_dbg("%p <- %p len=%zu\n", ptr, src, n_bytes); break; } case IBV_EXP_PEER_OP_POLL_AND_DWORD: case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - gds_err("polling is not supported\n"); - retcode = EINVAL; + int poll_cond; + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t value = op->wr.dword_va.data; + bool flush = true; + if (post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH) + flush = false; + gds_dbg("WAIT_32 dev_ptr=%p data=%"PRIx32" type=%"PRIx32"\n", ptr, value, (uint32_t)op->type); + bool done = false; + do { + uint32_t data = gds_atomic_get(ptr); + switch(op->type) { + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: + done = ~(data | value); + break; + case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + done = ((int32_t)data - (int32_t)value >= 0); + break; + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + done = (data & value); + break; + default: + gds_err("invalid op type %02x\n", op->type); + retcode = EINVAL; + goto out; + } + if (done) + break; + // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric + arch_cpu_relax(); + } while(true); break; } default: @@ -989,13 +1035,11 @@ static int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op) break; } if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); + gds_err("error %d at entry n=%zu\n", retcode, n); goto out; } } - assert(n_descs == n); - out: return retcode; } @@ -1028,96 +1072,6 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin //----------------------------------------------------------------------------- -int gds_stream_post_polls_and_pokes(CUstream stream, - size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[], - size_t n_pokes, uint32_t *poke_ptrs[], uint32_t poke_values[], int poke_flags[]) -{ - int retcode = 0; - gds_op_list_t ops; - - gds_dbg("n_polls=%zu n_pokes=%zu\n", n_polls, n_pokes); - - for (size_t j = 0; j < n_polls; ++j) { - uint32_t *ptr = ptrs[j]; - uint32_t magic = magics[j]; - gds_wait_cond_flag_t cond_flag = cond_flags[j]; - int flags = poll_flags[j]; - gds_dbg("poll %zu: 
addr=%p value=%08x cond=%d flags=%08x\n", j, ptr, magic, cond_flag, flags); - retcode = gds_fill_poll(ops, ptr, magic, cond_flag, flags); - if (retcode) { - gds_err("error in fill_poll at entry %zu\n", j); - goto out; - } - } - - for (size_t j = 0; j < n_pokes; ++j) { - uint32_t *addr = poke_ptrs[j]; - uint32_t value = poke_values[j]; - int flags = poke_flags[j]; - gds_dbg("poke %zu: addr=%p value=%08x flags=%08x\n", j, addr, value, flags); - retcode = gds_fill_poke(ops, addr, value, flags); - if (retcode) { - gds_err("error in fill_poll at entry %zu\n", j); - goto out; - } - } - - retcode = gds_stream_batch_ops(stream, ops, 0); - if (retcode) { - gds_err("error in batch_ops\n"); - goto out; - } - -out: - return retcode; -} - -//----------------------------------------------------------------------------- - -int gds_stream_post_polls_and_immediate_copies(CUstream stream, - size_t n_polls, uint32_t *ptrs[], uint32_t magics[], gds_wait_cond_flag_t cond_flags[], int poll_flags[], - size_t n_imms, void *imm_ptrs[], void *imm_datas[], size_t imm_bytes[], int imm_flags[]) -{ - int retcode = 0; - gds_op_list_t ops; - - for (size_t j = 0; j < n_polls; ++j) { - uint32_t *ptr = ptrs[j]; - uint32_t magic = magics[j]; - gds_wait_cond_flag_t cond_flag = cond_flags[j]; - int flags = poll_flags[j]; - - retcode = gds_fill_poll(ops, ptr, magic, cond_flag, flags); - if (retcode) { - gds_err("error in fill_poll at entry %zu\n", j); - goto out; - } - } - - for (size_t j = 0; j < n_imms; ++j) { - void *ptr = imm_ptrs[j]; - void *data = imm_datas[j]; - size_t n_bytes = imm_bytes[j]; - int flags = imm_flags[j]; - retcode = gds_fill_inlcpy(ops, ptr, data, n_bytes, flags); - if (retcode) { - gds_err("error in fill_inlcpy at entry %zu\n", j); - goto out; - } - } - - retcode = gds_stream_batch_ops(stream, ops, 0); - if (retcode) { - gds_err("error in batch_ops\n"); - goto out; - } - -out: - return retcode; -} - -//----------------------------------------------------------------------------- - static void gds_dump_ops(struct peer_op_wr *op, size_t count) { size_t n = 0; @@ -1291,15 +1245,14 @@ static int gds_unregister_va(uint64_t registration_id, uint64_t peer_id) static bool support_memops(CUdevice dev) { int flag = 0; -#if CUDA_VERSION >= 9010 +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS + // on CUDA_VERSION >= 9010 CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev)); - gds_warn("flag=%d\n", flag); #elif CUDA_VERSION >= 8000 - // CUDA MemOps are enabled on CUDA 8.0+ + // CUDA MemOps are always enabled on CUDA 8.0+ flag = 1; - gds_warn("flag=%d\n", flag); #else -#error "GCC error CUDA MemOp APIs is missing prior to CUDA 8.0" +#error "CUDA MemOp APIs are missing prior to CUDA 8.0" #endif gds_dbg("dev=%d has_memops=%d\n", dev, flag); return !!flag; @@ -1308,7 +1261,8 @@ static bool support_memops(CUdevice dev) static bool support_remote_flush(CUdevice dev) { int flag = 0; -#if CUDA_VERSION >= 9020 +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES + // on CUDA_VERSION >= 9020 CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES, dev)); #else #warning "Assuming CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES=0 prior to CUDA 9.2" @@ -1320,7 +1274,8 @@ static bool support_remote_flush(CUdevice dev) static bool support_write64(CUdevice dev) { int flag = 0; -#if CUDA_VERSION >= 9000 +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS + // on CUDA_VERSION >= 9000 CUCHECK(cuDeviceGetAttribute(&flag, 
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, dev)); #endif gds_dbg("dev=%d has_write64=%d\n", dev, flag); @@ -1330,8 +1285,11 @@ static bool support_write64(CUdevice dev) static bool support_wait_nor(CUdevice dev) { int flag = 0; -#if CUDA_VERSION >= 9000 +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR + // on CUDA_VERSION >= 9000 CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR, dev)); +#else + gds_dbg("hardcoding has_wait_nor=0\n"); #endif gds_dbg("dev=%d has_wait_nor=%d\n", dev, flag); return !!flag; @@ -1340,15 +1298,87 @@ static bool support_wait_nor(CUdevice dev) static bool support_inlcpy(CUdevice dev) { int flag = 0; +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WRITE_MEMORY + // on CUDA_VERSION >= 1000 + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WRITE_MEMORY, dev)); +#else + gds_dbg("hardcoding has_inlcpy=0\n"); +#endif + gds_dbg("dev=%d has_inlcpy=%d\n", dev, flag); return !!flag; } static bool support_membar(CUdevice dev) { int flag = 0; +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEMORY_BARRIER + // on CUDA_VERSION >= 1000 + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEMORY_BARRIER, dev)); +#else + gds_dbg("hardcoding has_membar=0\n"); +#endif + gds_dbg("dev=%d has_membar=%d\n", dev, flag); return !!flag; } +static bool support_weak_consistency(CUdevice dev) +{ + int flag = 0; + CUdevice cur_dev; + bool has_hidden_flag = false; + +#if HAVE_DECL_CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_BATCH_MEMOP_RELAXED_ORDERING + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_BATCH_MEMOP_RELAXED_ORDERING, dev)); +#endif + + CUCHECK(cuCtxGetDevice(&cur_dev)); + if (cur_dev != dev) { + gds_err("device context is not current, cannot detect weak consistency flag\n"); + goto done; + } + + do { + gds_dbg("testing hidden weak flag\n"); + + CUstreamBatchMemOpParams params[2]; + CUresult res; + res = cuStreamBatchMemOp(0, 0, params, 0); + if (res == CUDA_ERROR_NOT_SUPPORTED) { + gds_err("Either cuStreamBatchMemOp API is not supported on this platform or it has not been enabled, check libgdsync system requirements.\n"); + break; + } else if (res != CUDA_SUCCESS) { + const char *err_str = NULL; + cuGetErrorString(res, &err_str); + const char *err_name = NULL; + cuGetErrorName(res, &err_name); + gds_err("very serious problems with cuStreamBatchMemOp() %d(%s) '%s'\n", res, err_name, err_str); + break; + } + res = cuStreamBatchMemOp(0, 0, params, CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING); + if (res == CUDA_ERROR_INVALID_VALUE) { + gds_dbg("weak flag is not supported\n"); + break; + } else if (res != CUDA_SUCCESS) { + const char *err_str = NULL; + cuGetErrorString(res, &err_str); + const char *err_name = NULL; + cuGetErrorName(res, &err_name); + gds_err("serious problems with cuStreamBatchMemOp() %d(%s) '%s'\n", res, err_name, err_str); + break; + } + gds_dbg("detected hidden weak consistency flag\n"); + has_hidden_flag = true; + } while(0); + + if (flag && !has_hidden_flag) { + gds_err("GPU dev=%d relaxed ordering device attribute and detection do not agree\n", dev); + abort(); + } +done: + gds_dbg("dev=%d has_weak=%d\n", dev, has_hidden_flag); + return has_hidden_flag; +} + //----------------------------------------------------------------------------- static gds_peer gpu_peer[max_gpus]; @@ -1357,11 +1387,9 @@ static bool gpu_registered[max_gpus]; //----------------------------------------------------------------------------- -static void gds_init_peer(gds_peer 
*peer, int gpu_id) +static void gds_init_peer(gds_peer *peer, CUdevice dev, int gpu_id) { assert(peer); - CUdevice dev; - CUCHECK(cuDeviceGet(&dev, gpu_id)); peer->gpu_id = gpu_id; peer->gpu_dev = dev; @@ -1369,9 +1397,12 @@ static void gds_init_peer(gds_peer *peer, int gpu_id) peer->has_memops = support_memops(dev); peer->has_remote_flush = support_remote_flush(dev); peer->has_write64 = support_write64(dev) && gds_enable_write64(); - peer->has_wait_nor = support_wait_nor(dev); + peer->has_wait_nor = support_wait_nor(dev) && gds_enable_wait_nor(); peer->has_inlcpy = support_inlcpy(dev) && gds_enable_inlcpy(); peer->has_membar = support_membar(dev); + peer->has_weak = support_weak_consistency(dev); + + peer->max_batch_size = 256; peer->alloc_type = gds_peer::NONE; peer->alloc_flags = 0; @@ -1387,15 +1418,18 @@ static void gds_init_peer(gds_peer *peer, int gpu_id) IBV_EXP_PEER_OP_FENCE_CAP | IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); - if (peer->has_wait_nor) + if (peer->has_wait_nor) { + gds_dbg("enabling NOR feature\n"); peer->attr.caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; - else + } else peer->attr.caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; if (peer->has_inlcpy) { + gds_dbg("enabling COPY BLOCK feature\n"); peer->attr.caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; } else if (peer->has_write64 || gds_simulate_write64()) { + gds_dbg("enabling STORE QWORD feature\n"); peer->attr.caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; } gds_dbg("caps=%016lx\n", peer->attr.caps); @@ -1403,30 +1437,70 @@ static void gds_init_peer(gds_peer *peer, int gpu_id) peer->attr.comp_mask = IBV_EXP_PEER_DIRECT_VERSION; peer->attr.version = 1; + gpu_registered[gpu_id] = true; + gds_dbg("peer_attr: peer_id=%"PRIx64"\n", peer->attr.peer_id); } //----------------------------------------------------------------------------- -static ibv_exp_res_domain *gds_create_res_domain(struct ibv_context *context) +static int gds_register_peer(CUdevice dev, unsigned gpu_id, gds_peer **p_peer, gds_peer_attr **p_peer_attr) { - if (!context) { - gds_err("invalid context"); - return NULL; - } + int ret = 0; - ibv_exp_res_domain_init_attr res_domain_attr; - memset(&res_domain_attr, 0, sizeof(res_domain_attr)); + gds_dbg("GPU%u: registering peer\n", gpu_id); + + if (gpu_id >= max_gpus) { + gds_err("invalid gpu_id %d\n", gpu_id); + return EINVAL; + } - res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; - res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; + gds_peer *peer = &gpu_peer[gpu_id]; - ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); - if (!res_domain) { - gds_warn("Can't create resource domain\n"); + if (gpu_registered[gpu_id]) { + gds_dbg("gds_peer for GPU%u already initialized\n", gpu_id); + } else { + gds_init_peer(peer, dev, gpu_id); } - return res_domain; + if (p_peer) + *p_peer = peer; + + if (p_peer_attr) + *p_peer_attr = &peer->attr; + + return ret; +} + +//----------------------------------------------------------------------------- + +static int gds_register_peer_by_ordinal(unsigned gpu_id, gds_peer **p_peer, gds_peer_attr **p_peer_attr) +{ + CUdevice dev; + CUCHECK(cuDeviceGet(&dev, gpu_id)); + return gds_register_peer(dev, gpu_id, p_peer, p_peer_attr); +} + +//----------------------------------------------------------------------------- + +static void gds_ordinal_from_device(CUdevice dev, unsigned &gpu_id) +{ + int count; + CUCHECK(cuDeviceGetCount(&count)); + // FIXME: this is super ugly and may break in the future + int ordinal = static_cast(dev); + GDS_ASSERT(ordinal >= 0 && 
ordinal < count); + gpu_id = (unsigned)ordinal; + gds_dbg("gpu_id=%u for dev=%d\n", gpu_id, dev); +} + +//----------------------------------------------------------------------------- + +static int gds_register_peer_by_dev(CUdevice dev, gds_peer **p_peer, gds_peer_attr **p_peer_attr) +{ + unsigned gpu_id; + gds_ordinal_from_device(dev, gpu_id); + return gds_register_peer(dev, gpu_id, p_peer, p_peer_attr); } //----------------------------------------------------------------------------- @@ -1437,6 +1511,8 @@ static int gds_device_from_current_context(CUdevice &dev) return 0; } +//----------------------------------------------------------------------------- + static int gds_device_from_context(CUcontext ctx, CUcontext cur_ctx, CUdevice &dev) { // if cur != ctx then push ctx @@ -1452,6 +1528,8 @@ static int gds_device_from_context(CUcontext ctx, CUcontext cur_ctx, CUdevice &d return 0; } +//----------------------------------------------------------------------------- + static int gds_device_from_stream(CUstream stream, CUdevice &dev) { CUcontext cur_ctx, stream_ctx; @@ -1466,13 +1544,13 @@ static int gds_device_from_stream(CUstream stream, CUdevice &dev) return 0; } +//----------------------------------------------------------------------------- + gds_peer *peer_from_stream(CUstream stream) { - CUcontext ctx = NULL; - CUdevice dev; + CUdevice dev = -1; gds_peer *peer = NULL; - if (stream != NULL && stream != CU_STREAM_LEGACY && stream != CU_STREAM_PER_THREAD) { // this a user stream gds_device_from_stream(stream, dev); @@ -1481,53 +1559,46 @@ gds_peer *peer_from_stream(CUstream stream) gds_device_from_current_context(dev); } - for(int g=0; g= max_gpus) { - gds_err("invalid gpu_id %d\n", gpu_id); - return EINVAL; + gds_err("invalid context"); + return NULL; } - gds_peer *peer = &gpu_peer[gpu_id]; - - if (gpu_registered[gpu_id]) { - gds_dbg("gds_peer for GPU%u already initialized\n", gpu_id); - } else { - gds_init_peer(peer, gpu_id); - gpu_registered[gpu_id] = true; - } + ibv_exp_res_domain_init_attr res_domain_attr; + memset(&res_domain_attr, 0, sizeof(res_domain_attr)); - if (p_peer) - *p_peer = peer; + res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; + res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; - if (p_peer_attr) - *p_peer_attr = &peer->attr; + ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); + if (!res_domain) { + gds_warn("Can't create resource domain\n"); + } - return ret; + return res_domain; } //----------------------------------------------------------------------------- @@ -1558,7 +1629,7 @@ gds_create_cq_internal(struct ibv_context *context, int cqe, //Here we need to recover peer and peer_attr pointers to set alloc_type and alloc_flags //before ibv_exp_create_cq - ret = gds_register_peer_ex(context, gpu_id, &peer, &peer_attr); + ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); if (ret) { gds_err("error %d while registering GPU peer\n", ret); return NULL; @@ -1602,7 +1673,7 @@ gds_create_cq(struct ibv_context *context, int cqe, gds_peer *peer = NULL; gds_peer_attr *peer_attr = NULL; - ret = gds_register_peer_ex(context, gpu_id, &peer, &peer_attr); + ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); if (ret) { gds_err("error %d while registering GPU peer\n", ret); return NULL; @@ -1643,7 +1714,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_peer_attr *peer_attr = NULL; int old_errno = errno; - gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x errno=%d\n", pd, 
context, gpu_id, flags, errno); + gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x current errno=%d\n", pd, context, gpu_id, flags, errno); assert(pd); assert(context); assert(qp_attr); @@ -1663,7 +1734,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, // peer registration gds_dbg("before gds_register_peer_ex\n"); - ret = gds_register_peer_ex(context, gpu_id, &peer, &peer_attr); + ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); if (ret) { gds_err("error %d in gds_register_peer_ex\n", ret); goto err; diff --git a/src/gdsync_debug_hostregister_bug.cpp b/src/gdsync_debug_hostregister_bug.cpp new file mode 100644 index 0000000..1e36d08 --- /dev/null +++ b/src/gdsync_debug_hostregister_bug.cpp @@ -0,0 +1,1837 @@ +/* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include + +#include + +#include "utils.hpp" +#include "memmgr.hpp" +#include "mem.hpp" +#include "objs.hpp" +#include "archutils.h" +#include "mlnxutils.h" + +//----------------------------------------------------------------------------- + +int gds_dbg_enabled() +{ + static int gds_dbg_is_enabled = -1; + if (-1 == gds_dbg_is_enabled) { + const char *env = getenv("GDS_ENABLE_DEBUG"); + if (env) { + int en = atoi(env); + gds_dbg_is_enabled = !!en; + //printf("GDS_ENABLE_DEBUG=%s\n", env); + } else + gds_dbg_is_enabled = 0; + } + return gds_dbg_is_enabled; +} + +//----------------------------------------------------------------------------- +// detect Async APIs + +#if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_VALUE_64 +#warning "enabling write_64 extensions" +#define GDS_HAS_WRITE64 1 +#else +#define GDS_HAS_WRITE64 0 +#endif + +#if HAVE_DECL_CU_STREAM_MEM_OP_INLINE_COPY +#warning "enabling inline_copy extensions" +#define GDS_HAS_INLINE_COPY 1 +#else +#define GDS_HAS_INLINE_COPY 0 +#endif + +#if HAVE_DECL_CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK +#warning "enabling consistency extensions" +#define GDS_HAS_WEAK_API 1 +#else +#define GDS_HAS_WEAK_API 0 +#endif + +#if HAVE_DECL_CU_STREAM_MEM_OP_MEMORY_BARRIER +#warning "enabling memory barrier extensions" +#define GDS_HAS_MEMBAR 1 +#else +#define GDS_HAS_MEMBAR 0 +#endif + +// TODO: use corret value +// TODO: make it dependent upon the particular GPU +const size_t GDS_GPU_MAX_INLINE_SIZE = 256; + +//----------------------------------------------------------------------------- + +// Note: inlcpy has precedence +//bool gds_has_inlcpy = GDS_HAS_INLINE_COPY; +//bool gds_has_write64 = GDS_HAS_WRITE64; +//bool gds_has_weak_consistency = GDS_HAS_WEAK_API; +//bool gds_has_membar = GDS_HAS_MEMBAR; + +static bool gpu_does_support_nor(gds_peer *peer) { return false; } + +//----------------------------------------------------------------------------- + +// BUG: this feature is GPU device dependent +static bool gds_enable_write64() +{ + static int gds_disable_write64 = -1; + if (-1 == gds_disable_write64) { + const char *env = getenv("GDS_DISABLE_WRITE64"); + if (env) + gds_disable_write64 = !!atoi(env); + else + gds_disable_write64 = 0; + gds_dbg("GDS_DISABLE_WRITE64=%d\n", gds_disable_write64); + } + // BUG: need to query device property for write64 capability + //return GDS_HAS_WRITE64 && !gds_disable_write64; + return false; +} + +static bool gds_enable_inlcpy() +{ + static int gds_disable_inlcpy = -1; + if (-1 == gds_disable_inlcpy) { + const char *env = getenv("GDS_DISABLE_INLINECOPY"); + if (env) + gds_disable_inlcpy = !!atoi(env); + else + gds_disable_inlcpy = 0; + gds_dbg("GDS_DISABLE_INLINECOPY=%d\n", gds_disable_inlcpy); + } + return GDS_HAS_INLINE_COPY && !gds_disable_inlcpy; +} + +static bool gds_simulate_write64() +{ + static int gds_simulate_write64 = -1; + if (-1 == gds_simulate_write64) { + const char *env = getenv("GDS_SIMULATE_WRITE64"); + if (env) + gds_simulate_write64 = !!atoi(env); + else + gds_simulate_write64 = 0; // default + gds_dbg("GDS_SIMULATE_WRITE64=%d\n", gds_simulate_write64); + + if (gds_simulate_write64 && gds_enable_inlcpy()) { + gds_warn("INLINECOPY has priority over SIMULATE_WRITE64, using the former\n"); + gds_simulate_write64 = 0; + } + } + // simulate 64-bits writes with inlcpy + return GDS_HAS_INLINE_COPY && gds_simulate_write64; +} + +static bool gds_enable_membar() +{ + static int gds_disable_membar = -1; + if (-1 == 
gds_disable_membar) { + const char *env = getenv("GDS_DISABLE_MEMBAR"); + if (env) + gds_disable_membar = !!atoi(env); + else + gds_disable_membar = 0; + gds_dbg("GDS_DISABLE_MEMBAR=%d\n", gds_disable_membar); + } + return GDS_HAS_MEMBAR && !gds_disable_membar; +} + +static bool gds_enable_weak_consistency() +{ + static int gds_disable_weak_consistency = -1; + if (-1 == gds_disable_weak_consistency) { + const char *env = getenv("GDS_DISABLE_WEAK_CONSISTENCY"); + if (env) + gds_disable_weak_consistency = !!atoi(env); + else + gds_disable_weak_consistency = 1; // disabled by default + gds_dbg("GDS_DISABLE_WEAK_CONSISTENCY=%d\n", gds_disable_weak_consistency); + } + return GDS_HAS_WEAK_API && !gds_disable_weak_consistency; +} + +//----------------------------------------------------------------------------- + +static bool gds_enable_dump_memops() +{ + static int gds_enable_dump_memops = -1; + if (-1 == gds_enable_dump_memops) { + const char *env = getenv("GDS_ENABLE_DUMP_MEMOPS"); + if (env) + gds_enable_dump_memops = !!atoi(env); + else + gds_enable_dump_memops = 0; // disabled by default + gds_dbg("GDS_ENABLE_DUMP_MEMOPS=%d\n", gds_enable_dump_memops); + } + return gds_enable_dump_memops; +} + +void gds_dump_param(CUstreamBatchMemOpParams *param) +{ + switch(param->operation) { + case CU_STREAM_MEM_OP_WAIT_VALUE_32: + gds_info("WAIT32 addr:%p alias:%p value:%08x flags:%08x\n", + (void*)param->waitValue.address, + (void*)param->writeValue.alias, + param->waitValue.value, + param->waitValue.flags); + break; + + case CU_STREAM_MEM_OP_WRITE_VALUE_32: + gds_info("WRITE32 addr:%p alias:%p value:%08x flags:%08x\n", + (void*)param->writeValue.address, + (void*)param->writeValue.alias, + param->writeValue.value, + param->writeValue.flags); + break; + + case CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES: + gds_dbg("FLUSH\n"); + break; + +#if GDS_HAS_INLINE_COPY + case CU_STREAM_MEM_OP_INLINE_COPY: + gds_info("INLINECOPY addr:%p alias:%p src:%p len=%zu flags:%08x\n", + (void*)param->inlineCopy.address, + (void*)param->inlineCopy.alias, + (void*)param->inlineCopy.srcData, + param->inlineCopy.byteCount, + param->inlineCopy.flags); + break; +#endif + +#if GDS_HAS_MEMBAR + case CU_STREAM_MEM_OP_MEMORY_BARRIER: + gds_info("MEMORY_BARRIER flags:%08x\n", + param->memoryBarrier.flags); + break; +#endif + default: + gds_err("unsupported operation=%d\n", param->operation); + break; + } +} + +//----------------------------------------------------------------------------- + +void gds_dump_params(unsigned int nops, CUstreamBatchMemOpParams *params) +{ + for (unsigned int n = 0; n < nops; ++n) { + CUstreamBatchMemOpParams *param = params + n; + gds_info("param[%d]:\n", n); + gds_dump_param(param); + } +} + +//----------------------------------------------------------------------------- + +static int gds_fill_membar(CUstreamBatchMemOpParams *param, int flags) +{ + int retcode = 0; +#if GDS_HAS_MEMBAR + if (flags & GDS_MEMBAR_FLUSH_REMOTE) { + param->operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES; + param->flushRemoteWrites.flags = 0; + gds_dbg("op=%d flush_remote flags=%08x\n", + param->operation, + param->flushRemoteWrites.flags); + } else { + if (flags & GDS_MEMBAR_DEFAULT) { + param->operation = CU_STREAM_MEM_OP_MEMORY_BARRIER; + param->memoryBarrier.flags = CU_STREAM_MEMORY_BARRIER_DEFAULT; + } else if (flags & GDS_MEMBAR_SYS) { + param->operation = CU_STREAM_MEM_OP_MEMORY_BARRIER; + param->memoryBarrier.flags = CU_STREAM_MEMORY_BARRIER_SYS; + } else { + gds_err("error, unsupported membar\n"); + retcode = EINVAL; + 
goto out; + } + gds_dbg("op=%d membar flags=%08x\n", + param->operation, + param->memoryBarrier.flags); + } +out: +#else + gds_err("error, memory barrier is unsupported\n"); + retcode = EINVAL; +#endif + return retcode; +} + +//----------------------------------------------------------------------------- + +static int gds_fill_inlcpy(CUstreamBatchMemOpParams *param, CUdeviceptr addr, void *data, size_t n_bytes, int flags) +{ + int retcode = 0; +#if GDS_HAS_INLINE_COPY + CUdeviceptr dev_ptr = addr; + + assert(addr); + assert(n_bytes > 0); + // TODO: + // verify address requirements of inline_copy + //assert((((unsigned long)addr) & 0x3) == 0); + + bool need_barrier = (flags & GDS_IMMCOPY_POST_TAIL_FLUSH ) ? true : false; + + param->operation = CU_STREAM_MEM_OP_INLINE_COPY; + param->inlineCopy.byteCount = n_bytes; + param->inlineCopy.srcData = data; + param->inlineCopy.address = dev_ptr; + param->inlineCopy.flags = CU_STREAM_INLINE_COPY_NO_MEMORY_BARRIER; + if (need_barrier) + param->inlineCopy.flags = 0; + gds_dbg("op=%d addr=%p src=%p size=%zd flags=%08x\n", + param->operation, + (void*)param->inlineCopy.address, + param->inlineCopy.srcData, + param->inlineCopy.byteCount, + param->inlineCopy.flags); +#else + gds_err("error, inline copy is unsupported\n"); + retcode = EINVAL; +#endif + return retcode; +} + +static int gds_fill_inlcpy(CUstreamBatchMemOpParams *param, void *ptr, void *data, size_t n_bytes, int flags) +{ + int retcode = 0; + CUdeviceptr dev_ptr = 0; + retcode = gds_map_mem(ptr, n_bytes, memtype_from_flags(flags), &dev_ptr); + if (retcode) { + gds_err("could not lookup %p\n", ptr); + goto out; + } + + retcode = gds_fill_inlcpy(param, dev_ptr, data, n_bytes, flags); +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) +{ +#if GDS_HAS_INLINE_COPY + assert(param->operation == CU_STREAM_MEM_OP_INLINE_COPY); + param->inlineCopy.flags &= ~CU_STREAM_INLINE_COPY_NO_MEMORY_BARRIER; +#endif +} + +//----------------------------------------------------------------------------- + +static int gds_fill_poke(CUstreamBatchMemOpParams *param, CUdeviceptr addr, uint32_t value, int flags) +{ + int retcode = 0; + CUdeviceptr dev_ptr = addr; + + // TODO: convert into errors + assert(addr); + assert((((unsigned long)addr) & 0x3) == 0); + + bool need_barrier = (flags & GDS_POKE_POST_PRE_BARRIER ) ?
true : false; + + param->operation = CU_STREAM_MEM_OP_WRITE_VALUE_32; + param->writeValue.address = dev_ptr; + param->writeValue.value = value; + param->writeValue.flags = CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER; + if (need_barrier) + param->writeValue.flags = 0; + gds_dbg("op=%d addr=%p value=%08x flags=%08x\n", + param->operation, + (void*)param->writeValue.address, + param->writeValue.value, + param->writeValue.flags); + + return retcode; +} + +static int gds_fill_poke(CUstreamBatchMemOpParams *param, uint32_t *ptr, uint32_t value, int flags) +{ + int retcode = 0; + CUdeviceptr dev_ptr = 0; + + gds_dbg("addr=%p value=%08x flags=%08x\n", ptr, value, flags); + + retcode = gds_map_mem(ptr, sizeof(*ptr), memtype_from_flags(flags), &dev_ptr); + if (retcode) { + gds_err("error %d while looking up %p\n", retcode, ptr); + goto out; + } + + retcode = gds_fill_poke(param, dev_ptr, value, flags); +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +static int gds_fill_poll(CUstreamBatchMemOpParams *param, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags) +{ + int retcode = 0; + const char *cond_str = NULL; + CUdeviceptr dev_ptr = ptr; + + assert(ptr); + assert((((unsigned long)ptr) & 0x3) == 0); + + bool need_flush = (flags & GDS_POLL_POST_FLUSH) ? true : false; + + param->operation = CU_STREAM_MEM_OP_WAIT_VALUE_32; + param->waitValue.address = dev_ptr; + param->waitValue.value = magic; + switch(cond_flag) { + case GDS_POLL_COND_GEQ: + param->waitValue.flags = CU_STREAM_WAIT_VALUE_GEQ; + cond_str = "CU_STREAM_WAIT_VALUE_GEQ"; + break; + case GDS_POLL_COND_EQ: + param->waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + cond_str = "CU_STREAM_WAIT_VALUE_EQ"; + break; + case GDS_POLL_COND_AND: + param->waitValue.flags = CU_STREAM_WAIT_VALUE_AND; + cond_str = "CU_STREAM_WAIT_VALUE_AND"; + break; + default: + gds_err("invalid wait condition flag\n"); + retcode = EINVAL; + goto out; + } + if (need_flush) + param->waitValue.flags |= CU_STREAM_WAIT_VALUE_FLUSH; + gds_dbg("op=%d addr=%p value=%08x cond=%s flags=%08x\n", + param->operation, + (void*)param->waitValue.address, + param->waitValue.value, + cond_str, + param->waitValue.flags); +out: + return retcode; +} + +int gds_fill_poll(CUstreamBatchMemOpParams *param, uint32_t *ptr, uint32_t magic, int cond_flag, int flags) +{ + int retcode = 0; + CUdeviceptr dev_ptr = 0; + + gds_dbg("addr=%p value=%08x cond=%08x flags=%08x\n", ptr, magic, cond_flag, flags); + + retcode = gds_map_mem(ptr, sizeof(*ptr), memtype_from_flags(flags), &dev_ptr); + if (retcode) { + gds_err("could not lookup %p\n", ptr); + goto out; + } + + retcode = gds_fill_poll(param, dev_ptr, magic, cond_flag, flags); +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_stream_batch_ops(CUstream stream, int nops, CUstreamBatchMemOpParams *params, int flags) +{ + CUresult result = CUDA_SUCCESS; + int retcode = 0; + unsigned int cuflags = 0; +#if GDS_HAS_WEAK_API + cuflags |= gds_enable_weak_consistency() ? 
CU_STREAM_BATCH_MEM_OP_CONSISTENCY_WEAK : 0; +#endif + gds_dbg("nops=%d flags=%08x\n", nops, cuflags); +#if 0 + // temporarily disabled, see below + if (gds_enable_dump_memops()) { + gds_info("nops=%d flags=%08x\n", nops, cuflags); + gds_dump_params(nops, params); + } +#endif + if (nops > 256) { + gds_warn("batch size might be too big, stream=%p nops=%d params=%p flags=%08x\n", stream, nops, params, flags); + //return EINVAL; + } + +#if 0 + int batch_size = 1; + { + int j = 100; + while (--j) { + result = cuStreamQuery(stream); + //result = cuStreamSynchronize(stream); + if (CUDA_SUCCESS != result && CUDA_ERROR_NOT_READY != result) { + const char *err_str = NULL; + cuGetErrorString(result, &err_str); + gds_err("got CUDA result %d (%s) while pre-checking stream\n", result, err_str); + retcode = gds_curesult_to_errno(result); + goto out; + } + usleep(100); + } + } + gds_dbg("chopping batch with max batch_size=%d\n", batch_size); + for(int i=0; i < nops; i += batch_size) { + result = cuStreamBatchMemOp(stream, batch_size, params+i, cuflags); + if (CUDA_SUCCESS != result) { + const char *err_str = NULL; + cuGetErrorString(result, &err_str); + gds_err("got CUDA result %d (%s) while submitting batch operations:\n", result, err_str); + retcode = gds_curesult_to_errno(result); + } + if (!retcode) { + int j = 100; + while (--j) { + result = cuStreamQuery(stream); + //result = cuStreamSynchronize(stream); + if (CUDA_SUCCESS != result && CUDA_ERROR_NOT_READY != result) { + const char *err_str = NULL; + cuGetErrorString(result, &err_str); + gds_err("got CUDA result %d (%s) while checking stream\n", result, err_str); + retcode = gds_curesult_to_errno(result); + break; + } + usleep(100); + } + } + // moved here to be able to dump .alias field too + if (retcode || gds_enable_dump_memops()) { + gds_info("last submited batch was: stream=0x%x nops=%d flags=%08x\n", stream, batch_size, cuflags); + gds_dump_params(batch_size, params+i); + } + if (retcode) + goto out; + } +#else + result = cuStreamBatchMemOp(stream, nops, params, cuflags); + if (CUDA_SUCCESS != result) { + const char *err_str = NULL; + cuGetErrorString(result, &err_str); + gds_err("got CUDA result %d (%s) while submitting batch operations:\n", result, err_str); + retcode = gds_curesult_to_errno(result); + gds_err("nops=%d flags=%08x\n", nops, cuflags); + gds_dump_params(nops, params); + goto out; + } + // moved here to be able to dump .alias field too + if (gds_enable_dump_memops()) { + gds_info("nops=%d flags=%08x\n", nops, cuflags); + gds_dump_params(nops, params); + } +#if 0 + // optional debugging aid + result = cuStreamQuery(stream); + if (CUDA_SUCCESS != result && CUDA_ERROR_NOT_READY != result) { + const char *err_str = NULL; + cuGetErrorString(result, &err_str); + gds_err("got CUDA result %d (%s) while checking stream\n", result, err_str); + retcode = gds_curesult_to_errno(result); + goto out; + } +#endif +#endif + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_stream_post_poll_dword(CUstream stream, uint32_t *ptr, uint32_t magic, int cond_flags, int flags) +{ + int retcode = 0; + CUstreamBatchMemOpParams param[1]; + retcode = gds_fill_poll(param, ptr, magic, cond_flags, flags); + if (retcode) { + gds_err("error in fill_poll\n"); + goto out; + } + retcode = gds_stream_batch_ops(stream, 1, param, 0); + if (retcode) { + gds_err("error in batch_ops\n"); + goto out; + } +out: + return retcode; +} + 
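The dword helpers defined here are the building blocks an application (or the descriptor API further down in this file) uses to make a CUDA stream wait on, or write, a 32-bit host word without involving the CPU. A minimal usage sketch follows; it assumes the header name `gdsync.h`, that it declares these helpers, that `GDS_MEMORY_HOST` and `GDS_POLL_COND_EQ` (used elsewhere in this file) are the intended flag values, and that both host words are pinned and already registered with the library's memory manager so `gds_map_mem()` can resolve them:

```cpp
// Illustrative sketch only, not part of this patch.
#include <stdint.h>
#include <cuda.h>
#include "gdsync.h"   // assumed public header declaring the helpers below

// Enqueue, on `stream`, a wait for `*ready_flag == expected`, then a one-word
// doorbell write, with no CPU synchronization in between.
static int enqueue_wait_then_doorbell(CUstream stream,
                                      uint32_t *ready_flag, uint32_t expected,
                                      uint32_t *doorbell)
{
        // blocks the stream until the NIC or CPU stores `expected` into *ready_flag
        int ret = gds_stream_post_poll_dword(stream, ready_flag, expected,
                                             GDS_POLL_COND_EQ, GDS_MEMORY_HOST);
        if (ret)
                return ret;
        // gds_stream_post_poke_dword() is defined just below in this file
        return gds_stream_post_poke_dword(stream, doorbell, 1, GDS_MEMORY_HOST);
}
```

Each helper simply fills a single CUstreamBatchMemOpParams entry and submits it through gds_stream_batch_ops() as a batch of one, so the two calls above translate into two small cuStreamBatchMemOp() submissions on the stream.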
+//----------------------------------------------------------------------------- + +int gds_stream_post_poke_dword(CUstream stream, uint32_t *ptr, uint32_t value, int flags) +{ + int retcode = 0; + CUstreamBatchMemOpParams param[1]; + retcode = gds_fill_poke(param, ptr, value, flags); + if (retcode) { + gds_err("error in fill_poke\n"); + goto out; + } + retcode = gds_stream_batch_ops(stream, 1, param, 0); + if (retcode) { + gds_err("error in batch_ops\n"); + goto out; + } +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_stream_post_inline_copy(CUstream stream, void *ptr, void *src, size_t nbytes, int flags) +{ + int retcode = 0; + CUstreamBatchMemOpParams param[1]; + + retcode = gds_fill_inlcpy(param, ptr, src, nbytes, flags); + if (retcode) { + gds_err("error in fill_poke\n"); + goto out; + } + retcode = gds_stream_batch_ops(stream, 1, param, 0); + if (retcode) { + gds_err("error in batch_ops\n"); + goto out; + } +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +/* + A) plain+membar: + WR32 + MEMBAR + WR32 + WR32 + + B) plain: + WR32 + WR32+PREBARRIER + WR32 + + C) sim64+membar: + WR32 + MEMBAR + INLCPY 8B + + D) sim64: + INLCPY 4B + POSTBARRIER + INLCPY 8B + + E) inlcpy+membar: + WR32 + MEMBAR + INLCPY XB + + F) inlcpy: + INLCPY 4B + POSTBARRIER + INLCPY 128B +*/ + +static inline uint32_t gds_qword_lo(uint64_t v) { + return (uint32_t)(v); +} +static inline uint32_t gds_qword_hi(uint64_t v) { + return (uint32_t)(v >> 32); +} + +enum { + GDS_POST_OPS_DISCARD_WAIT_FLUSH = 1<<0 +}; + +static int gds_post_ops(size_t n_ops, struct peer_op_wr *op, CUstreamBatchMemOpParams *params, int &idx, int post_flags = 0) +{ + int retcode = 0; + size_t n = 0; + bool prev_was_fence = false; + bool use_inlcpy_for_dword = false; + + gds_dbg("n_ops=%zu idx=%d\n", n_ops, idx); + + // divert the request to the same engine handling 64bits + // to avoid out-of-order execution + // caveat: can't use membar if inlcpy is used for 4B writes (to simulate 8B writes) + if (gds_enable_inlcpy()) { + if (!gds_enable_membar()) + use_inlcpy_for_dword = true; // F + } + if (gds_simulate_write64()) { + if (!gds_enable_membar()) { + gds_warn_once("enabling use_inlcpy_for_dword\n"); + use_inlcpy_for_dword = true; // D + } + } + + for (; op && n < n_ops; op = op->next, ++n) { + //int flags = 0; + gds_dbg("op[%zu] type:%08x\n", n, op->type); + switch(op->type) { + case IBV_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + + if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (!gds_enable_membar()) { + if (use_inlcpy_for_dword) { + assert(idx-1 >= 0); + gds_dbg("patching previous param\n"); + gds_enable_barrier_for_inlcpy(params+idx-1); + } + else { + gds_dbg("recording fence event\n"); + prev_was_fence = true; + } + //retcode = 0; + } + else { + if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + int flags = 0; + if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + gds_dbg("using 
light membar\n"); + flags = GDS_MEMBAR_DEFAULT; + } + else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + flags = GDS_MEMBAR_SYS; + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + retcode = gds_fill_membar(params+idx, flags); + ++idx; + } + } + break; + } + case IBV_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + int flags = 0; + gds_dbg("OP_STORE_DWORD dev_ptr=%llx data=%"PRIx32"\n", dev_ptr, data); + if (use_inlcpy_for_dword) { // F || D + // membar may be out of order WRT inlcpy + if (gds_enable_membar()) { + gds_err("invalid feature combination, inlcpy + membar\n"); + retcode = EINVAL; + break; + } + // tail flush is set when following fence is met + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(params+idx, dev_ptr, &data, sizeof(data), flags); + ++idx; + } + else { // A || B || C || E + // can't guarantee ordering of write32+inlcpy unless + // a membar is there + // TODO: fix driver when !weak + if (gds_enable_inlcpy() && !gds_enable_membar()) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + if (prev_was_fence) { + gds_dbg("using PRE_BARRIER as fence\n"); + flags |= GDS_POKE_POST_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(params+idx, dev_ptr, data, flags); + ++idx; + } + break; + } + case IBV_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + int flags = 0; + gds_dbg("OP_STORE_QWORD dev_ptr=%llx data=%"PRIx64"\n", dev_ptr, data); + // C || D + if (gds_enable_write64()) { + gds_err("write64 is not supported\n"); + retcode = EINVAL; + break; + } + + // simulate 64-bit poke by inline copy + + if (gds_simulate_write64()){ + if (!gds_enable_membar()) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + + // tail flush is never useful here + //flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(params+idx, dev_ptr, &data, sizeof(data), flags); + ++idx; + } + else { + uint32_t datalo = gds_qword_lo(op->wr.qword_va.data); + uint32_t datahi = gds_qword_hi(op->wr.qword_va.data); + + if (prev_was_fence) { + gds_dbg("enabling PRE_BARRIER\n"); + flags |= GDS_POKE_POST_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(params+idx, dev_ptr, datalo, flags); + ++idx; + + // get rid of the barrier, if there + flags &= ~GDS_POKE_POST_PRE_BARRIER; + + // advance to next DWORD + dev_ptr += sizeof(uint32_t); + retcode = gds_fill_poke(params+idx, dev_ptr, datahi, flags); + ++idx; + } + + break; + } + case IBV_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + int flags = 0; + gds_dbg("OP_COPY_BLOCK dev_ptr=%llx src=%p len=%zu\n", dev_ptr, src, len); + // catching any other size here + if (!gds_enable_inlcpy()) { + gds_err("inline copy is not supported\n"); + retcode = EINVAL; + break; + } + // IB Verbs bug + assert(len <= GDS_GPU_MAX_INLINE_SIZE); + //if (desc->need_flush) { + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + //} + retcode = gds_fill_inlcpy(params+idx, dev_ptr, src, len, flags); + ++idx; + break; + } + case IBV_PEER_OP_POLL_AND_DWORD: + case IBV_PEER_OP_POLL_GEQ_DWORD: + case 
IBV_PEER_OP_POLL_NOR_DWORD: { + int poll_cond; + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + // TODO: properly handle a following fence instead of blidly flushing + int flags = 0; + if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH)) + flags |= GDS_POLL_POST_FLUSH; + + gds_dbg("OP_POLL_DWORD dev_ptr=%llx data=%"PRIx32"\n", dev_ptr, data); + + switch(op->type) { + case IBV_PEER_OP_POLL_NOR_DWORD: + //poll_cond = GDS_POLL_COND_NOR; + // TODO: lookup and pass peer down + assert(gpu_does_support_nor(NULL)); + retcode = -EINVAL; + goto out; + break; + case IBV_PEER_OP_POLL_GEQ_DWORD: + poll_cond = GDS_POLL_COND_GEQ; + break; + case IBV_PEER_OP_POLL_AND_DWORD: + poll_cond = GDS_POLL_COND_AND; + break; + default: + assert(!"cannot happen"); + retcode = EINVAL; + goto out; + } + retcode = gds_fill_poll(params+idx, dev_ptr, data, poll_cond, flags); + ++idx; + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error in fill func at entry n=%zu (idx=%d)\n", n, idx); + goto out; + } + } + + assert(n_ops == n); + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_t *dw, uint32_t val) +{ + int retcode = 0; + int poke_count = 0; + int idx = 0; + + assert(info); + + for (int i = 0; i < count; i++) { + poke_count += info[i].commit.entries + 2; + } + + CUstreamBatchMemOpParams params[poke_count+1]; + + for (int j=0; jnext, ++n) { + //int flags = 0; + gds_dbg("op[%zu] type:%08x\n", n, op->type); + switch(op->type) { + case IBV_PEER_OP_FENCE: { + gds_dbg("fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + + if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + gds_warnc(1, "nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); + retcode = EINVAL; + break; + } + if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + wmb(); + } + else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + wmb(); + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + } + break; + } + case IBV_PEER_OP_STORE_DWORD: { + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t data = op->wr.dword_va.data; + // A || B || C || E + ACCESS_ONCE(*ptr) = data; + gds_dbg("%p <- %08x\n", ptr, data); + break; + } + case IBV_PEER_OP_STORE_QWORD: { + uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); + uint64_t data = op->wr.qword_va.data; + ACCESS_ONCE(*ptr) = data; + gds_dbg("%p <- %016"PRIx64"\n", ptr, data); + break; + } + case IBV_PEER_OP_COPY_BLOCK: { + uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); + uint64_t *src = (uint64_t*)op->wr.copy_op.src; + size_t n_bytes = 
op->wr.copy_op.len; + gds_bf_copy(ptr, src, n_bytes); + gds_dbg("%p <- %p len=%zu\n", ptr, src, n_bytes); + break; + } + case IBV_PEER_OP_POLL_AND_DWORD: + case IBV_PEER_OP_POLL_GEQ_DWORD: + case IBV_PEER_OP_POLL_NOR_DWORD: { + gds_err("polling is not supported\n"); + retcode = EINVAL; + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + goto out; + } + } + + assert(n_descs == n); + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uint32_t val) +{ + int retcode = 0; + int idx = 0; + + assert(info); + + for (int j=0; jnext, ++n) { + gds_dbg("op[%zu] type:%d\n", n, op->type); + switch(op->type) { + case IBV_PEER_OP_FENCE: { + gds_dbg("FENCE flags=%"PRIu64"\n", op->wr.fence.fence_flags); + break; + } + case IBV_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("STORE_QWORD data:%x target_id:%"PRIx64" offset:%zu dev_ptr=%llx\n", + op->wr.dword_va.data, op->wr.dword_va.target_id, + op->wr.dword_va.offset, dev_ptr); + break; + } + case IBV_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + gds_dbg("STORE_QWORD data:%"PRIx64" target_id:%"PRIx64" offset:%zu dev_ptr=%llx\n", + op->wr.qword_va.data, op->wr.qword_va.target_id, + op->wr.qword_va.offset, dev_ptr); + break; + } + case IBV_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + gds_dbg("COPY_BLOCK src:%p len:%zu target_id:%"PRIx64" offset:%zu dev_ptr=%llx\n", + op->wr.copy_op.src, op->wr.copy_op.len, + op->wr.copy_op.target_id, op->wr.copy_op.offset, + dev_ptr); + break; + } + case IBV_PEER_OP_POLL_AND_DWORD: + case IBV_PEER_OP_POLL_NOR_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("%s data:%08x target_id:%"PRIx64" offset:%zu dev_ptr=%llx\n", + (op->type==IBV_PEER_OP_POLL_AND_DWORD) ? 
"POLL_AND_DW" : "POLL_NOR_SDW", + op->wr.dword_va.data, + op->wr.dword_va.target_id, + op->wr.dword_va.offset, + dev_ptr); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + break; + } + } + + assert(count == n); +} + +//----------------------------------------------------------------------------- + +void gds_dump_wait_request(gds_wait_request_t *request, size_t count) +{ + for (size_t j=0; jentries, peek->whence, peek->offset, + peek->peek_id, peek->comp_mask); + gds_dump_ops(peek->storage, peek->entries); + } +} + +//----------------------------------------------------------------------------- + +int gds_stream_post_wait_cq_multi(CUstream stream, int count, gds_wait_request_t *request, uint32_t *dw, uint32_t val) +{ + int retcode = 0; + int n_mem_ops = 0; + int idx = 0; + + assert(request); + + for (int i = 0; i < count; i++) { + n_mem_ops += request[i].peek.entries; + } + + gds_dbg("count=%d dw=%p val=%08x space for n_mem_ops=%d\n", count, dw, val, n_mem_ops); + + CUstreamBatchMemOpParams params[n_mem_ops+1]; + + for (int j=0; jpeer_id); + assert(peer); + + gds_dbg("alloc mem peer:{type=%d gpu_id=%d} attr{len=%zu dir=%d alignment=%d peer_id=%"PRIx64"}\n", + peer->alloc_type, peer->gpu_id, attr->length, attr->dir, attr->alignment, attr->peer_id); + + return peer->buf_alloc(peer->alloc_type, attr->length, attr->dir, attr->alignment, peer->alloc_flags); +} + +static int gds_buf_release(struct ibv_peer_buf *pb) +{ + gds_dbg("freeing pb=%p\n", pb); + gds_buf *buf = static_cast(pb); + gds_peer *peer = buf->peer; + peer->free(buf); + return 0; +} + +static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, struct ibv_exp_peer_buf *pb) +{ + gds_peer *peer = peer_from_id(peer_id); + gds_range *range = NULL; + + gds_dbg("start=%p length=%zu peer_id=%"PRIx64" peer_buf=%p\n", start, length, peer_id, pb); + + if (IBV_EXP_PEER_IOMEMORY == pb) { + // register as IOMEM + range = peer->register_range(start, length, GDS_MEMORY_IO); + } + else if (pb) { + gds_buf *buf = static_cast(pb); + // should have been allocated via gds_buf_alloc + // assume GDR mapping already created + // associate range to peer_buf + range = peer->range_from_buf(buf, start, length); + } + else { + // register as SYSMEM + range = peer->register_range(start, length, GDS_MEMORY_HOST); + } + if (!range) { + gds_err("error while registering range, returning 0 as error value\n"); + return 0; + } + return range_to_id(range); +} + +static int gds_unregister_va(uint64_t registration_id, uint64_t peer_id) +{ + gds_peer *peer = peer_from_id(peer_id); + gds_range *range = range_from_id(registration_id); + gds_dbg("peer=%p range=%p\n", peer, range); + peer->unregister(range); + return 0; +} + +static void gds_init_peer(gds_peer *peer, int gpu_id) +{ + assert(peer); + + peer->gpu_id = gpu_id; + peer->gpu_dev = 0; + peer->gpu_ctx = 0; +} + +static void gds_init_peer_attr(gds_peer_attr *attr, gds_peer *peer) +{ + assert(peer); + + peer->alloc_type = gds_peer::NONE; + peer->alloc_flags = 0; + + attr->peer_id = peer_to_id(peer); + attr->buf_alloc = gds_buf_alloc; + attr->buf_release = gds_buf_release; + attr->register_va = gds_register_va; + attr->unregister_va = gds_unregister_va; + + attr->caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP | + IBV_EXP_PEER_OP_STORE_QWORD_CAP | + IBV_EXP_PEER_OP_FENCE_CAP | + IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); + + if (gpu_does_support_nor(peer)) + attr->caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; + else + attr->caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; + + if 
(gds_enable_inlcpy()) { + attr->caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; + } + else if (gds_enable_write64() || gds_simulate_write64()) { + attr->caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; + } + gds_dbg("caps=%016lx\n", attr->caps); + attr->peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE; + attr->comp_mask = IBV_EXP_PEER_DIRECT_VERSION; + attr->version = 1; + + gds_dbg("peer_attr: peer_id=%"PRIx64"\n", attr->peer_id); +} + +//----------------------------------------------------------------------------- + +static gds_peer gpu_peer[max_gpus]; +static gds_peer_attr gpu_peer_attr[max_gpus]; +static bool gpu_registered[max_gpus]; + +int gds_register_peer_ex(struct ibv_context *context, unsigned gpu_id, gds_peer **p_peer, gds_peer_attr **p_peer_attr) +{ + int ret = 0; + + gds_dbg("GPU %u: registering peer\n", gpu_id); + + if (gpu_id >= max_gpus) { + gds_err("invalid gpu_id %d\n", gpu_id); + return EINVAL; + } + + gds_peer *peer = &gpu_peer[gpu_id]; + gds_peer_attr *peer_attr = &gpu_peer_attr[gpu_id]; + + if (gpu_registered[gpu_id]) { + gds_dbg("gds_peer for GPU %d already initialized\n", gpu_id); + } else { + gds_init_peer(peer, gpu_id); + gds_init_peer_attr(peer_attr, peer); + gpu_registered[gpu_id] = true; + } + + if (p_peer) + *p_peer = peer; + + if (p_peer_attr) + *p_peer_attr = peer_attr; + + return ret; +} + +int gds_register_peer(struct ibv_context *context, unsigned gpu_id) +{ + return gds_register_peer_ex(context, gpu_id, NULL, NULL); +} + +//----------------------------------------------------------------------------- + +struct ibv_cq * +gds_create_cq(struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags) +{ + int ret = 0; + struct ibv_cq *cq = NULL; + + gds_dbg("cqe=%d gpu_id=%d cq_flags=%08x\n", cqe, gpu_id, flags); + + // TODO: add support for res_domain + + gds_peer *peer = NULL; + gds_peer_attr *peer_attr = NULL; + ret = gds_register_peer_ex(context, gpu_id, &peer, &peer_attr); + if (ret) { + gds_err("error %d while registering GPU peer\n", ret); + return NULL; + } + assert(peer); + assert(peer_attr); + + peer->alloc_type = gds_peer::CQ; + peer->alloc_flags = flags; + + ibv_create_cq_attr_ex attr; + attr.comp_mask = IBV_CREATE_CQ_ATTR_PEER_DIRECT; + attr.flags = 0; // see ibv_exp_cq_create_flags + attr.res_domain = NULL; + attr.peer_direct_attrs = peer_attr; + + int old_errno = errno; + cq = ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, &attr); + if (!cq) { + gds_err("error %d in ibv_exp_create_cq, old errno %d\n", errno, old_errno); + } + + return cq; +} + +//----------------------------------------------------------------------------- + +struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) +{ + int ret = 0; + struct gds_qp *gqp = NULL; + struct ibv_qp *qp = NULL; + struct ibv_cq *rx_cq = NULL, *tx_cq = NULL; + gds_peer *peer = NULL; + gds_peer_attr *peer_attr = NULL; + int old_errno = errno; + + gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x errno=%d\n", pd, context, gpu_id, flags, errno); + assert(pd); + assert(context); + assert(qp_attr); + + gqp = (struct gds_qp*)calloc(1, sizeof(struct gds_qp)); + if (!gqp) { + gds_err("cannot allocate memory\n"); + return NULL; + } + + gds_dbg("creating TX CQ\n"); + tx_cq = gds_create_cq(context, qp_attr->cap.max_send_wr, NULL, NULL, 0, gpu_id, + (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? 
+ GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT); + if (!tx_cq) { + ret = errno; + gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); + goto err; + } + + gds_dbg("creating RX CQ\n"); + rx_cq = gds_create_cq(context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, gpu_id, + (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? + GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT); + if (!rx_cq) { + ret = errno; + gds_err("error %d while creating RX CQ\n", ret); + goto err_free_tx_cq; + } + + qp_attr->send_cq = tx_cq; + qp_attr->recv_cq = rx_cq; + + qp_attr->pd = pd; + qp_attr->comp_mask |= IBV_QP_INIT_ATTR_PD; + + // disable overflow checks in ibv_poll_cq(), as GPU might invalidate + // the CQE without updating the tracking variables + if (flags & GDS_CREATE_QP_GPU_INVALIDATE_RX_CQ) { + gds_warn("IGNORE_RQ_OVERFLOW\n"); + qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW; + qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + } + if (flags & GDS_CREATE_QP_GPU_INVALIDATE_TX_CQ) { + gds_warn("IGNORE_SQ_OVERFLOW\n"); + qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW; + qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + } + + gds_dbg("before gds_register_peer_ex\n"); + + ret = gds_register_peer_ex(context, gpu_id, &peer, &peer_attr); + if (ret) { + gds_err("error %d in gds_register_peer_ex\n", ret); + goto err_free_cqs; + } + + peer->alloc_type = gds_peer::WQ; + peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; + if (flags & GDS_CREATE_QP_WQ_ON_GPU) { + gds_err("error, QP WQ on GPU is not supported yet\n"); + goto err_free_cqs; + } + if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { + gds_warn("QP WQ DBREC on GPU\n"); + peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; + } + qp_attr->comp_mask |= IBV_QP_INIT_ATTR_PEER_DIRECT; + qp_attr->peer_direct_attrs = peer_attr; + + qp = ibv_create_qp_ex(context, qp_attr); + if (!qp) { + ret = EINVAL; + gds_err("error in ibv_create_qp_ex\n"); + goto err_free_cqs; + } + + gqp->qp = qp; + gqp->send_cq.cq = qp->send_cq; + gqp->send_cq.curr_offset = 0; + gqp->recv_cq.cq = qp->recv_cq; + gqp->recv_cq.curr_offset = 0; + + gds_dbg("created gds_qp=%p\n", gqp); + + return gqp; + +err_free_qp: + gds_dbg("destroying QP\n"); + ibv_destroy_qp(qp); + +err_free_cqs: + gds_dbg("destroying RX CQ\n"); + ret = ibv_destroy_cq(rx_cq); + if (ret) { + gds_err("error %d destroying RX CQ\n", ret); + } + +err_free_tx_cq: + gds_dbg("destroying TX CQ\n"); + ret = ibv_destroy_cq(tx_cq); + if (ret) { + gds_err("error %d destroying TX CQ\n", ret); + } + +err: + free(gqp); + + return NULL; +} + +//----------------------------------------------------------------------------- + +int gds_destroy_qp(struct gds_qp *qp) +{ + int retcode = 0; + int ret; + assert(qp); + + assert(qp->qp); + ret = ibv_destroy_qp(qp->qp); + if (ret) { + gds_err("error %d in destroy_qp\n", ret); + retcode = ret; + } + + assert(qp->send_cq.cq); + ret = ibv_destroy_cq(qp->send_cq.cq); + if (ret) { + gds_err("error %d in destroy_cq send_cq\n", ret); + retcode = ret; + } + + assert(qp->recv_cq.cq); + ret = ibv_destroy_cq(qp->recv_cq.cq); + if (ret) { + gds_err("error %d in destroy_cq recv_cq\n", ret); + retcode = ret; + } + + free(qp); + + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_query_param(gds_param_t param, int *value) +{ + int ret = 0; + if (!value) + return EINVAL; + + switch (param) { + case GDS_PARAM_VERSION: + *value = (GDS_API_MAJOR_VERSION << 16)|GDS_API_MINOR_VERSION; + break; + default: + ret 
= EINVAL; + break; + }; + return ret; +} + +//----------------------------------------------------------------------------- + +static bool no_network_descs_after_entry(size_t n_descs, gds_descriptor_t *descs, size_t idx) +{ + bool ret = true; + size_t i; + for(i = idx+1; i < n_descs; ++i) { + gds_descriptor_t *desc = descs + i; + switch(desc->tag) { + case GDS_TAG_SEND: + case GDS_TAG_WAIT: + ret = false; + goto out; + case GDS_TAG_WAIT_VALUE32: + case GDS_TAG_WRITE_VALUE32: + break; + default: + gds_err("invalid tag\n"); + ret = EINVAL; + goto out; + } + } +out: + return ret; +} + +static int get_wait_info(size_t n_descs, gds_descriptor_t *descs, size_t &n_waits, size_t &last_wait) +{ + int ret = 0; + size_t i; + for(i = 0; i < n_descs; ++i) { + gds_descriptor_t *desc = descs + i; + switch(desc->tag) { + case GDS_TAG_WAIT: + ++n_waits; + last_wait = i; + break; + case GDS_TAG_SEND: + case GDS_TAG_WAIT_VALUE32: + case GDS_TAG_WRITE_VALUE32: + break; + default: + gds_err("invalid tag\n"); + ret = EINVAL; + } + } + return ret; +} + +static size_t calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs) +{ + size_t n_mem_ops = 0; + size_t i; + for(i = 0; i < n_descs; ++i) { + gds_descriptor_t *desc = descs + i; + switch(desc->tag) { + case GDS_TAG_SEND: + n_mem_ops += desc->send->commit.entries + 2; // extra space, ugly + break; + case GDS_TAG_WAIT: + n_mem_ops += desc->wait->peek.entries + 2; // ditto + break; + case GDS_TAG_WAIT_VALUE32: + case GDS_TAG_WRITE_VALUE32: + n_mem_ops += 2; // ditto + break; + default: + gds_err("invalid tag\n"); + } + } + return n_mem_ops; +} + +int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs) +{ + size_t i; + int idx = 0; + int ret = 0; + int retcode = 0; + size_t n_mem_ops = 0; + size_t n_waits = 0; + size_t last_wait = 0; + bool move_flush = false; + + n_mem_ops = calc_n_mem_ops(n_descs, descs); + get_wait_info(n_descs, descs, n_waits, last_wait); + + gds_dbg("n_descs=%zu n_waits=%zu n_mem_ops=%zu\n", n_descs, n_waits, n_mem_ops); + + // move flush to last wait in the whole batch + if (n_waits && no_network_descs_after_entry(n_descs, descs, last_wait)) { + gds_dbg("optimizing FLUSH to last wait i=%zu\n", last_wait); + move_flush = true; + } + // alternatively, remove flush for wait is next op is a wait too + + CUstreamBatchMemOpParams params[n_mem_ops]; + + for(i = 0; i < n_descs; ++i) { + gds_descriptor_t *desc = descs + i; + switch(desc->tag) { + case GDS_TAG_SEND: { + gds_send_request_t *sreq = desc->send; + retcode = gds_post_ops(sreq->commit.entries, sreq->commit.storage, params, idx); + if (retcode) { + gds_err("error %d in gds_post_ops\n", retcode); + ret = retcode; + goto out; + } + // TODO: fix late checking + //assert(idx <= n_mem_ops); + if (idx >= n_mem_ops) { + gds_err("idx=%d is past allocation (%zu)\n", idx, n_mem_ops); + assert(!"corrupted heap"); + } + break; + } + case GDS_TAG_WAIT: { + gds_wait_request_t *wreq = desc->wait; + int flags = 0; + if (move_flush && i != last_wait) + flags = GDS_POST_OPS_DISCARD_WAIT_FLUSH; + retcode = gds_post_ops(wreq->peek.entries, wreq->peek.storage, params, idx, flags); + if (retcode) { + gds_err("error %d in gds_post_ops\n", retcode); + ret = retcode; + goto out; + } + // TODO: fix late checking + assert(idx <= n_mem_ops); + break; + } + case GDS_TAG_WAIT_VALUE32: + retcode = gds_fill_poll(params+idx, desc->value32.ptr, desc->value32.value, desc->value32.cond_flags, desc->value32.flags); + if (retcode) { + gds_err("error %d in gds_fill_poll\n", retcode); + ret = 
retcode; + goto out; + } + ++idx; + break; + case GDS_TAG_WRITE_VALUE32: + retcode = gds_fill_poke(params+idx, desc->value32.ptr, desc->value32.value, desc->value32.flags); + if (retcode) { + gds_err("error %d in gds_fill_poll\n", retcode); + ret = retcode; + goto out; + } + ++idx; + break; + default: + assert(0); + break; + } + } + retcode = gds_stream_batch_ops(stream, idx, params, 0); + if (retcode) { + gds_err("error in batch_ops\n"); + goto out; + } + +out: + return ret; +} + +//----------------------------------------------------------------------------- + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/src/memmgr.cpp b/src/memmgr.cpp index 7ab5e7c..ab3e490 100644 --- a/src/memmgr.cpp +++ b/src/memmgr.cpp @@ -181,8 +181,10 @@ int gds_register_mem_internal(void *ptr, size_t size, gds_memory_type_t type, CU } else if ((res == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) || (res == CUDA_ERROR_ALREADY_MAPPED)) { + const char *err_str = NULL; + cuGetErrorString(res, &err_str); // older CUDA driver versions seem to return CUDA_ERROR_ALREADY_MAPPED - gds_warn("page=%p size=%zu is already registered with CUDA\n", (void*)page_addr, len); + gds_warn("page=%p size=%zu is already registered with CUDA (%d:%s)\n", (void*)page_addr, len, res, err_str); cuda_registered = true; } else if (res == CUDA_ERROR_NOT_INITIALIZED) { diff --git a/src/mlnxutils.h b/src/mlnxutils.h index 5a7e280..ffb6f83 100644 --- a/src/mlnxutils.h +++ b/src/mlnxutils.h @@ -66,9 +66,10 @@ #endif // no WQ wrap-around check!!! -static void gds_bf_copy(uint64_t *dest, uint64_t *src, size_t n_bytes) +static inline void gds_bf_copy(uint64_t *dest, uint64_t *src, size_t n_bytes) { assert(n_bytes % sizeof(uint64_t) == 0); + assert(n_bytes < 128); while (n_bytes > 0) { COPY_64B_NT(dest, src); n_bytes -= 8 * sizeof(*dest); diff --git a/src/objs.hpp b/src/objs.hpp index 0ceb268..e3989bc 100644 --- a/src/objs.hpp +++ b/src/objs.hpp @@ -75,6 +75,8 @@ struct gds_peer { bool has_wait_nor; bool has_inlcpy; bool has_membar; + bool has_weak; + unsigned max_batch_size; gds_peer_attr attr; enum obj_type { NONE, CQ, WQ, N_IBV_OBJS } alloc_type; diff --git a/src/utils.hpp b/src/utils.hpp index e68ede6..09ae3ea 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -32,6 +32,22 @@ #endif #include // to pull PRIx64 +// internal assert function + +void gds_assert(const char *cond, const char *file, unsigned line, const char *function); + +#define GDS_ASSERT2(COND) \ + do { \ + if (!(COND)) \ + gds_assert(#COND, __FILE__, __LINE__, __FUNCTION__); \ + } \ + while(0) + +#define GDS_ASSERT(COND) GDS_ASSERT2(COND) + + +// CUDA error checking + #define __CUCHECK(stmt, cond_str) \ do { \ CUresult result = (stmt); \ @@ -46,7 +62,21 @@ #define CUCHECK(stmt) __CUCHECK(stmt, #stmt) +#ifndef ACCESS_ONCE #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#endif + +template +static inline void gds_atomic_set(T *ptr, T value) +{ + ACCESS_ONCE(*ptr) = value; +} + +template +static inline T gds_atomic_get(T *ptr) +{ + return ACCESS_ONCE(*ptr); +} #define ROUND_UP(V,SIZE) (((V)+(SIZE)-1)/(SIZE)*(SIZE)) @@ -102,7 +132,8 @@ static inline int gds_curesult_to_errno(CUresult result) { int retcode = 0; switch (result) { - case CUDA_SUCCESS: retcode = 0; break; + case CUDA_SUCCESS: retcode = 0; break; + case CUDA_ERROR_NOT_SUPPORTED: retcode = EPERM; break; case CUDA_ERROR_INVALID_VALUE: retcode = EINVAL; break; case CUDA_ERROR_OUT_OF_MEMORY: retcode = ENOMEM; break; // TODO: add missing 
cases @@ -174,11 +205,13 @@ void gds_dump_wait_request(gds_wait_request_t *request, size_t count); void gds_dump_param(CUstreamBatchMemOpParams *param); void gds_dump_params(gds_op_list_t ¶ms); int gds_fill_membar(gds_op_list_t ¶m, int flags); -int gds_fill_inlcpy(gds_op_list_t ¶m, void *ptr, void *data, size_t n_bytes, int flags); +int gds_fill_inlcpy(gds_op_list_t ¶m, void *ptr, const void *data, size_t n_bytes, int flags); int gds_fill_poke(gds_op_list_t ¶m, uint32_t *ptr, uint32_t value, int flags); int gds_fill_poke64(gds_op_list_t ¶m, uint64_t *ptr, uint64_t value, int flags); int gds_fill_poll(gds_op_list_t ¶m, uint32_t *ptr, uint32_t magic, int cond_flag, int flags); -int gds_stream_batch_ops(CUstream stream, gds_op_list_t ¶ms, int flags); + +struct gds_peer; +int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t ¶ms, int flags); enum gds_post_ops_flags { GDS_POST_OPS_DISCARD_WAIT_FLUSH = 1<<0 @@ -186,7 +219,7 @@ enum gds_post_ops_flags { struct gds_peer; int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t ¶ms, int post_flags = 0); - +int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op, int post_flags = 0); gds_peer *peer_from_stream(CUstream stream); //----------------------------------------------------------------------------- diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 1db7a27..c68d35f 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -76,7 +76,7 @@ do { \ MPI_Error_string(result, string, &resultlen); \ fprintf(stderr, " (%s:%d) MPI check failed with %d (%*s)\n", \ __FILE__, __LINE__, result, resultlen, string); \ - exit(-1); \ + exit(EXIT_FAILURE); \ } \ } while(0) @@ -112,6 +112,8 @@ float elapsed_time = 0.0; int event_idx = 0; int gds_enable_event_prof = 0; int gds_qpt = IBV_QPT_UD; //UD by default +int max_batch_len = 20; +int stream_cb_error = 0; struct pingpong_context { struct ibv_context *context; @@ -137,7 +139,13 @@ struct pingpong_context { int peersync; int peersync_gpu_cq; int consume_rx_cqe; + int gpumem; int use_desc_apis; + int n_tx_ev; + int n_rx_ev; + int scnt; + int rcnt; + int skip_kernel_launch; }; static int my_rank, comm_size; @@ -149,51 +157,6 @@ struct pingpong_dest { union ibv_gid gid; }; -static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, - int sl, struct pingpong_dest *dest, int sgid_idx) -{ - struct ibv_ah_attr ah_attr = { - .is_global = 0, - .dlid = dest->lid, - .sl = sl, - .src_path_bits = 0, - .port_num = port - }; - struct ibv_qp_attr attr = { - .qp_state = IBV_QPS_RTR - }; - - if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { - fprintf(stderr, "Failed to modify QP to RTR\n"); - return 1; - } - - attr.qp_state = IBV_QPS_RTS; - attr.sq_psn = my_psn; - - if (ibv_modify_qp(ctx->qp, &attr, - IBV_QP_STATE | - IBV_QP_SQ_PSN)) { - fprintf(stderr, "Failed to modify QP to RTS\n"); - return 1; - } - - if (dest->gid.global.interface_id) { - ah_attr.is_global = 1; - ah_attr.grh.hop_limit = 1; - ah_attr.grh.dgid = dest->gid; - ah_attr.grh.sgid_index = sgid_idx; - } - - ctx->ah = ibv_create_ah(ctx->pd, &ah_attr); - if (!ctx->ah) { - fprintf(stderr, "Failed to create AH\n"); - return 1; - } - - return 0; -} - static inline unsigned long align_to(unsigned long val, unsigned long pow2) { return (val + pow2 - 1) & ~(pow2 - 1); @@ -209,7 +172,9 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int peersync_gpu_dbrec, int consume_rx_cqe, int sched_mode, - int use_desc_apis) + int 
use_gpumem, + int use_desc_apis, + int skip_kernel_launch) { struct pingpong_context *ctx; @@ -226,26 +191,31 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->calc_size = calc_size; ctx->rx_depth = rx_depth; ctx->gpu_id = gpu_id; + ctx->gpumem = use_gpumem; ctx->use_desc_apis = use_desc_apis; + ctx->skip_kernel_launch = skip_kernel_launch; size_t alloc_size = 3 * align_to(size + 40, page_size); - if (ctx->gpu_id >= 0) + if (ctx->gpumem) { ctx->buf = gpu_malloc(page_size, alloc_size); - else + printf("allocated GPU memory at %p\n", ctx->buf); + } else { ctx->buf = memalign(page_size, alloc_size); - + printf("allocated CPU memory at %p\n", ctx->buf); + } if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_ctx; } - printf("ctx buf=%p\n", ctx->buf); + + gpu_info("allocated ctx buffer %p\n", ctx->buf); ctx->rxbuf = (char*)ctx->buf; ctx->txbuf = (char*)ctx->buf + align_to(size + 40, page_size); //ctx->rx_flag = (char*)ctx->buf + 2 * align_to(size + 40, page_size); ctx->rx_flag = memalign(page_size, alloc_size); if (!ctx->rx_flag) { - fprintf(stderr, "Couldn't allocate rx_flag buf\n"); + gpu_err("Couldn't allocate rx_flag buf\n"); goto clean_ctx; } @@ -255,7 +225,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->consume_rx_cqe = consume_rx_cqe; // must be ZERO!!! for rx_flag to work... - if (ctx->gpu_id >= 0) + if (ctx->gpumem) gpu_memset(ctx->buf, 0, alloc_size); else memset(ctx->buf, 0, alloc_size); @@ -271,7 +241,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { - fprintf(stderr, "Couldn't get context for %s\n", + gpu_err("Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); goto clean_buffer; } @@ -279,7 +249,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { - fprintf(stderr, "Couldn't create completion channel\n"); + gpu_err("Couldn't create completion channel\n"); goto clean_device; } } else @@ -287,13 +257,13 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { - fprintf(stderr, "Couldn't allocate PD\n"); + gpu_err("Couldn't allocate PD\n"); goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, alloc_size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { - fprintf(stderr, "Couldn't register MR\n"); + gpu_err("Couldn't register MR\n"); goto clean_pd; } @@ -318,7 +288,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->gds_qp = gds_create_qp(ctx->pd, ctx->context, &attr, gpu_id, gds_flags); if (!ctx->gds_qp) { - fprintf(stderr, "Couldn't create QP\n"); + gpu_err("Couldn't create QP\n"); goto clean_mr; } ctx->qp = ctx->gds_qp->qp; @@ -339,7 +309,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, IBV_QP_PKEY_INDEX | IBV_QP_PORT | ((IBV_QPT_UD == gds_qpt) ? 
IBV_QP_QKEY : IBV_QP_ACCESS_FLAGS))) { - fprintf(stderr, "Failed to modify QP to INIT\n"); + gpu_err("Failed to modify QP to INIT\n"); goto clean_qp; } } @@ -363,7 +333,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ibv_close_device(ctx->context); clean_buffer: - if (ctx->gpu_id >= 0) + if (ctx->gpumem) gpu_free(ctx->buf); else free(ctx->buf); @@ -379,34 +349,34 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int pp_close_ctx(struct pingpong_context *ctx) { if (gds_destroy_qp(ctx->gds_qp)) { - fprintf(stderr, "Couldn't destroy QP\n"); + gpu_err("Couldn't destroy QP\n"); } if (ibv_dereg_mr(ctx->mr)) { - fprintf(stderr, "Couldn't deregister MR\n"); + gpu_err("Couldn't deregister MR\n"); } if (IBV_QPT_UD == gds_qpt) { if (ibv_destroy_ah(ctx->ah)) { - fprintf(stderr, "Couldn't destroy AH\n"); + gpu_err("Couldn't destroy AH\n"); } } if (ibv_dealloc_pd(ctx->pd)) { - fprintf(stderr, "Couldn't deallocate PD\n"); + gpu_err("Couldn't deallocate PD\n"); } if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { - fprintf(stderr, "Couldn't destroy completion channel\n"); + gpu_err("Couldn't destroy completion channel\n"); } } if (ibv_close_device(ctx->context)) { - fprintf(stderr, "Couldn't release context\n"); + gpu_err("Couldn't release context\n"); } - if (ctx->gpu_id >= 0) + if (ctx->gpumem) gpu_free(ctx->buf); else free(ctx->buf); @@ -419,11 +389,88 @@ int pp_close_ctx(struct pingpong_context *ctx) return 0; } +static int poll_send_cq(struct pingpong_context *ctx) +{ + ctx->n_tx_ev = 0; + + struct ibv_wc wc[max_batch_len]; + int ne, i; + + ne = ibv_poll_cq(ctx->tx_cq, max_batch_len, wc); + if (ne < 0) { + gpu_err("poll TX CQ failed %d\n", ne); + return 1; + } + + ctx->n_tx_ev = ne; + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + gpu_err("Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + switch ((int) wc[i].wr_id) { + case PINGPONG_SEND_WRID: + gpu_dbg("got send event\n"); + ++ctx->scnt; + break; + default: + gpu_err("Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + } + + return 0; +} + +static int poll_recv_cq(struct pingpong_context *ctx) +{ + // don't call poll_cq on events which are still being polled by the GPU + ctx->n_rx_ev = 0; + + struct ibv_wc wc[max_batch_len]; + int ne = 0; + int i; + + ne = ibv_poll_cq(ctx->rx_cq, max_batch_len, wc); + if (ne < 0) { + gpu_err("poll RX CQ failed %d\n", ne); + return 1; + } + + ctx->n_rx_ev = ne; + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + gpu_err("Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + switch ((int) wc[i].wr_id) { + case PINGPONG_RECV_WRID: + gpu_dbg("got recv event\n"); + ++ctx->rcnt; + break; + default: + gpu_err("Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + } + return 0; +} + static int pp_post_recv(struct pingpong_context *ctx, int n) { struct ibv_sge list = { .addr = (uintptr_t) ctx->rxbuf, - .length = ctx->size + 40, + .length = ctx->size + 40, // good for IBV_QPT_UD .lkey = ctx->mr->lkey }; @@ -444,7 +491,46 @@ static int pp_post_recv(struct pingpong_context *ctx, int n) return i; } -static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn) +static int pp_wait_cq(struct pingpong_context *ctx, int is_client) +{ + int ret; + if (ctx->peersync) { + ret = 
gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + } else { + if (is_client) { + do { + ret = poll_send_cq(ctx); + if (ret) { + return ret; + } + } while(ctx->n_tx_ev <= 0); + + do { + ret = poll_recv_cq(ctx); + if (ret) { + return ret; + } + } while(ctx->n_rx_ev <= 0); + } else { + do { + ret = poll_recv_cq(ctx); + if (ret) { + return ret; + } + } while(ctx->n_rx_ev <= 0); + + do { + ret = poll_send_cq(ctx); + if (ret) { + return ret; + } + } while(ctx->n_tx_ev <= 0); + } + } + return ret; +} + +static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream *p_gpu_stream) { int ret = 0; struct ibv_sge list = { @@ -467,7 +553,7 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn) }, .comp_mask = 0 }; - +#if 0 if (IBV_QPT_UD != gds_qpt) { memset(&ewr, 0, sizeof(ewr)); ewr.num_sge = 1; @@ -477,9 +563,9 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn) ewr.sg_list = &list; ewr.next = NULL; } - +#endif gds_send_wr *bad_ewr; - return gds_stream_queue_send(gpu_stream, ctx->gds_qp, &ewr, &bad_ewr); + return gds_stream_queue_send(*p_gpu_stream, ctx->gds_qp, &ewr, &bad_ewr); } static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_send_request_t *req) @@ -516,12 +602,43 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s ewr.next = NULL; } gds_send_wr *bad_ewr; - return gds_prepare_send(ctx->gds_qp, &ewr, &bad_ewr, req); + return gds_prepare_send(ctx->gds_qp, &ewr, &bad_ewr, req); } +typedef struct work_desc { + gds_send_request_t send_rq; + gds_wait_request_t wait_tx_rq; + gds_wait_request_t wait_rx_rq; +#define N_WORK_DESCS 3 + gds_descriptor_t descs[N_WORK_DESCS]; + unsigned n_descs; +} work_desc_t; + +static void post_work_cb(CUstream hStream, CUresult status, void *userData)\ +{ + int retcode; + work_desc_t *wdesc = (work_desc_t *)userData; + gpu_dbg("[%d] stream callback wdesc=%p n_descs=%d\n", my_rank, wdesc, wdesc->n_descs); + assert(wdesc); + NVTX_PUSH("", 1); + if (status != CUDA_SUCCESS) { + gpu_err("[%d] CUresult %d in stream callback\n", my_rank, status); + goto out; + } + assert(sizeof(wdesc->descs)/sizeof(wdesc->descs[0]) == N_WORK_DESCS); + retcode = gds_post_descriptors(wdesc->n_descs, wdesc->descs, 0); + if (retcode) { + gpu_err("[%d] error %d returned by gds_post_descriptors, going on...\n", my_rank, retcode); + stream_cb_error = 1; + } +out: + free(wdesc); + NVTX_POP(); +} static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uint32_t qpn, int is_client) { + int retcode = 0; int i, ret = 0; int posted_recv = 0; @@ -532,110 +649,219 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin posted_recv = pp_post_recv(ctx, n_posts); if (posted_recv < 0) { - fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n", + gpu_err("can't post recv (%d) n_posts=%d is_client=%d\n", posted_recv, n_posts, is_client); exit(EXIT_FAILURE); return 0; } else if (posted_recv != n_posts) { - fprintf(stderr,"ERROR: couldn't post all recvs (%d posted, %d requested)\n", posted_recv, n_posts); + gpu_warn("[%d] couldn't post all recvs (%d posted, %d requested)\n", my_rank, posted_recv, n_posts); if (!posted_recv) return 0; } - PROF(&prof, prof_idx++); - - gds_send_request_t send_rq[posted_recv]; - gds_wait_request_t wait_tx_rq[posted_recv]; - gds_wait_request_t wait_rx_rq[posted_recv]; - gds_descriptor_t descs[2]; - for (i = 0; i < posted_recv; ++i) { if (is_client) { if (gds_enable_event_prof && 
(event_idx < MAX_EVENTS)) { cudaEventRecord(start_time[event_idx], gpu_stream); } if (ctx->use_desc_apis) { + work_desc_t *wdesc = calloc(1, sizeof(*wdesc)); int k = 0; - ret = pp_prepare_gpu_send(ctx, qpn, &send_rq[i]); + ret = pp_prepare_gpu_send(ctx, qpn, &wdesc->send_rq); if (ret) { - i = -ret; + retcode = -ret; break; } - descs[k].tag = GDS_TAG_SEND; - descs[k].send = &send_rq[i]; + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_SEND; + wdesc->descs[k].send = &wdesc->send_rq; ++k; - - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wait_rx_rq[i], 0); + ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { - i = -ret; + retcode = -ret; break; } - descs[k].tag = GDS_TAG_WAIT; - descs[k].wait = &wait_rx_rq[i]; + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_tx_rq; ++k; - - ret = gds_stream_post_descriptors(gpu_stream, k, descs, 0); + ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { - i = -ret; + retcode = -ret; break; } - } else { - ret = pp_post_gpu_send(ctx, qpn); + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_rx_rq; + ++k; + wdesc->n_descs = k; + if (ctx->peersync) { + ret = gds_stream_post_descriptors(gpu_stream, k, wdesc->descs, 0); + free(wdesc); + if (ret) { + retcode = -ret; + break; + } + } else { + gpu_dbg("adding post_work_cb to stream=%p\n", gpu_stream); + CUCHECK(cuStreamAddCallback(gpu_stream, post_work_cb, wdesc, 0)); + } + } + else if (ctx->peersync) { + ret = pp_post_gpu_send(ctx, qpn, &gpu_stream); if (ret) { - fprintf(stderr,"ERROR: can't post GPU send (%d) posted_recv=%d posted_so_far=%d is_client=%d \n", + gpu_err("error %d in pp_post_gpu_send, posted_recv=%d posted_so_far=%d is_client=%d \n", ret, posted_recv, i, is_client); - i = -ret; + retcode = -ret; + break; + } + ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0); + if (ret) { + // TODO: rollback gpu send + gpu_err("error %d in gds_stream_wait_cq\n", ret); + retcode = -ret; break; } - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); if (ret) { // TODO: rollback gpu send and wait send_cq - fprintf(stderr,"ERROR: error in gpu_post_poll_cq (%d)\n", ret); - i = -ret; + gpu_err("[%d] error %d in gds_stream_wait_cq\n", my_rank, ret); + //exit(EXIT_FAILURE); + retcode = -ret; break; } + } else { + gpu_err("!peersync case only supported when using descriptor APIs\n"); + retcode = -EINVAL; + break; } + if (gds_enable_event_prof && (event_idx < MAX_EVENTS)) { cudaEventRecord(stop_time[event_idx], gpu_stream); event_idx++; } - if (ctx->calc_size) + if (ctx->skip_kernel_launch) { + gpu_warn_once("[%d] NOT LAUNCHING ANY KERNEL AT ALL\n", my_rank); + } else { gpu_launch_kernel(ctx->calc_size, ctx->peersync); - } else { - // no point in using descriptor APIs here, as kernel launch - // would be sitting in between - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); - if (ret) { - fprintf(stderr, "ERROR: error in gpu_post_poll_cq (%d)\n", ret); - i = -ret; + } + + } else { // !is_client == server + + if (ctx->use_desc_apis) { + work_desc_t *wdesc = calloc(1, sizeof(*wdesc)); + int k = 0; + ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); + if (ret) { + retcode = -ret; + break; + } + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_rx_rq; + ++k; + wdesc->n_descs = k; + if (ctx->peersync) { 
+ ret = gds_stream_post_descriptors(gpu_stream, k, wdesc->descs, 0); + free(wdesc); + if (ret) { + retcode = -ret; + break; + } + } else { + gpu_dbg("adding post_work_cb to stream=%p\n", gpu_stream); + CUCHECK(cuStreamAddCallback(gpu_stream, post_work_cb, wdesc, 0)); + } + } else if (ctx->peersync) { + ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + if (ret) { + // TODO: rollback gpu send and wait send_cq + gpu_err("error %d in gds_stream_wait_cq\n", ret); + //exit(EXIT_FAILURE); + retcode = -ret; + break; + } + } else { + gpu_err("!peersync case only supported when using descriptor APIs\n"); + retcode = -EINVAL; break; } - if (ctx->calc_size) + if (ctx->skip_kernel_launch) { + gpu_warn_once("NOT LAUNCHING ANY KERNEL AT ALL\n"); + } else { gpu_launch_kernel(ctx->calc_size, ctx->peersync); - + } if (gds_enable_event_prof && (event_idx < MAX_EVENTS)) { cudaEventRecord(start_time[event_idx], gpu_stream); } - ret = pp_post_gpu_send(ctx, qpn); - if (ret) { - // TODO: rollback gpu send and kernel launch - fprintf(stderr, "ERROR: can't post GPU send\n"); - i = -ret; + if (ctx->use_desc_apis) { + work_desc_t *wdesc = calloc(1, sizeof(*wdesc)); + int k = 0; + ret = pp_prepare_gpu_send(ctx, qpn, &wdesc->send_rq); + if (ret) { + retcode = -ret; + break; + } + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_SEND; + wdesc->descs[k].send = &wdesc->send_rq; + ++k; + ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); + if (ret) { + retcode = -ret; + break; + } + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_tx_rq; + ++k; + wdesc->n_descs = k; + if (ctx->peersync) { + ret = gds_stream_post_descriptors(gpu_stream, k, wdesc->descs, 0); + free(wdesc); + if (ret) { + retcode = -ret; + break; + } + } else { + gpu_dbg("adding post_work_cb to stream=%p\n", gpu_stream); + CUCHECK(cuStreamAddCallback(gpu_stream, post_work_cb, wdesc, 0)); + } + } else if (ctx->peersync) { + ret = pp_post_gpu_send(ctx, qpn, &gpu_stream); + if (ret) { + gpu_err("error %d in pp_post_gpu_send, posted_recv=%d posted_so_far=%d is_client=%d \n", + ret, posted_recv, i, is_client); + retcode = -ret; + break; + } + ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0); + if (ret) { + // TODO: rollback gpu send + gpu_err("error %d in gds_stream_wait_cq\n", ret); + retcode = -ret; + break; + } + } else { + gpu_err("!peersync case only supported when using descriptor APIs\n"); + retcode = -EINVAL; break; } + if (gds_enable_event_prof && (event_idx < MAX_EVENTS)) { cudaEventRecord(stop_time[event_idx], gpu_stream); event_idx++; } } } - PROF(&prof, prof_idx++); + if (!retcode) { + retcode = i; + gpu_post_release_tracking_event(&gpu_stream_server); + //sleep(1); + } - gpu_post_release_tracking_event(); - //sleep(1); - return i; + return retcode; } static void usage(const char *argv0) @@ -662,8 +888,10 @@ static void usage(const char *argv0) printf(" -U, --peersync-desc-apis use batched descriptor APIs (default disabled)\n"); printf(" -Q, --consume-rx-cqe enable GPU consumes RX CQE support (default disabled)\n"); printf(" -T, --time-gds-ops evaluate time needed to execute gds operations using cuda events\n"); - printf(" -K, --qp-kind select IB transport kind used by GDS QPs. (-K 1) for UD, (-K 2) for RC\n"); + printf(" -k, --qp-kind select IB transport kind used by GDS QPs. 
(-K 1) for UD, (-K 2) for RC\n"); printf(" -M, --gpu-sched-mode set CUDA context sched mode, default (A)UTO, (S)PIN, (Y)IELD, (B)LOCKING\n"); + printf(" -E, --gpu-mem allocate GPU intead of CPU memory buffers\n"); + printf(" -K, --skip-kernel-launch no GPU kernel computations, only communications\n"); } int main(int argc, char *argv[]) @@ -685,7 +913,6 @@ int main(int argc, char *argv[]) int use_event = 0; int routs; int nposted; - int rcnt, scnt; int num_cq_events = 0; int sl = 0; int gidx = -1; @@ -695,19 +922,20 @@ int main(int argc, char *argv[]) int peersync_gpu_cq = 0; int peersync_gpu_dbrec = 0; int warmup = 10; - int max_batch_len = 20; int consume_rx_cqe = 0; int gds_qp_type = 1; int sched_mode = CU_CTX_SCHED_AUTO; int ret = 0; + int use_gpumem = 0; int use_desc_apis = 0; + int skip_kernel_launch = 0; MPI_CHECK(MPI_Init(&argc, &argv)); MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &my_rank)); if (comm_size != 2) { - fprintf(stderr, "this test requires exactly two processes \n"); + gpu_err("this test requires exactly two processes \n"); MPI_Abort(MPI_COMM_WORLD, -1); } @@ -716,12 +944,12 @@ int main(int argc, char *argv[]) int version; ret = gds_query_param(GDS_PARAM_VERSION, &version); if (ret) { - fprintf(stderr, "error querying libgdsync version\n"); + gpu_err("error querying libgdsync version\n"); MPI_Abort(MPI_COMM_WORLD, -1); } fprintf(stdout, "libgdsync queried version 0x%08x\n", version); if (!GDS_API_VERSION_COMPATIBLE(version)) { - fprintf(stderr, "incompatible libgdsync version 0x%08x\n", version); + gpu_err("incompatible libgdsync version 0x%08x\n", version); MPI_Abort(MPI_COMM_WORLD, -1); } @@ -749,12 +977,14 @@ int main(int argc, char *argv[]) { .name = "batch-length", .has_arg = 1, .val = 'B' }, { .name = "consume-rx-cqe", .has_arg = 0, .val = 'Q' }, { .name = "time-gds-ops", .has_arg = 0, .val = 'T' }, - { .name = "qp-kind", .has_arg = 1, .val = 'K' }, + { .name = "qp-kind", .has_arg = 1, .val = 'k' }, { .name = "gpu-sched-mode", .has_arg = 1, .val = 'M' }, + { .name = "gpu-mem", .has_arg = 0, .val = 'E' }, + { .name = "skip-kernel-launch", .has_arg = 0, .val = 'K' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:G:K:S:B:PCDQTM:U", long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:G:k:S:B:PCDQTM:EUK", long_options, NULL); if (c == -1) break; @@ -824,7 +1054,7 @@ int main(int argc, char *argv[]) printf("INFO: switching PeerSync %s\n", peersync?"ON":"OFF"); break; - case 'K': + case 'k': gds_qp_type = (int) strtol(optarg, NULL, 0); switch (gds_qp_type) { case 1: printf("INFO: GDS_QPT %s\n","UD"); gds_qpt = IBV_QPT_UD; break; @@ -866,16 +1096,33 @@ int main(int argc, char *argv[]) } break; + case 'E': + use_gpumem = !use_gpumem; + printf("INFO: use_gpumem=%d\n", use_gpumem); + break; + case 'U': use_desc_apis = 1; printf("INFO: use_desc_apis=%d\n", use_desc_apis); break; + case 'K': + skip_kernel_launch = 1; + printf("INFO: skip_kernel_launch=%d\n", skip_kernel_launch); + break; + default: usage(argv[0]); return 1; } } + + if (!peersync && !use_desc_apis) { + gpu_err("!peersync case only supported when using descriptor APIs, enabling them\n"); + use_desc_apis = 1; + return 1; + } + assert(comm_size == 2); char hostnames[comm_size][MPI_MAX_PROCESSOR_NAME]; int name_len; @@ -931,7 +1178,7 @@ int main(int argc, char *argv[]) printf("[%d] picking 1st available device\n", my_rank); ib_dev = *dev_list; if (!ib_dev) { - fprintf(stderr, "[%d] No IB devices found\n", my_rank); + gpu_err("[%d] No IB 
devices found\n", my_rank); return 1; } } else { @@ -941,7 +1188,7 @@ int main(int argc, char *argv[]) break; ib_dev = dev_list[i]; if (!ib_dev) { - fprintf(stderr, "IB device %s not found\n", ib_devname); + gpu_err("IB device %s not found\n", ib_devname); return 1; } } @@ -953,18 +1200,19 @@ int main(int argc, char *argv[]) printf("USE_GPU=%s(%d)\n", env, gpu_id); } } - ctx = pp_init_ctx(ib_dev, size, calc_size, rx_depth, ib_port, 0, gpu_id, peersync, peersync_gpu_cq, peersync_gpu_dbrec, consume_rx_cqe, sched_mode, use_desc_apis); + printf("[%d] use gpumem: %d\n", my_rank, use_gpumem); + ctx = pp_init_ctx(ib_dev, size, calc_size, rx_depth, ib_port, 0, gpu_id, peersync, peersync_gpu_cq, peersync_gpu_dbrec, consume_rx_cqe, sched_mode, use_gpumem, use_desc_apis, skip_kernel_launch); if (!ctx) return 1; int nrecv = pp_post_recv(ctx, max_batch_len); if (nrecv < max_batch_len) { - fprintf(stderr, "Couldn't post receive (%d)\n", nrecv); + gpu_warn("[%d] Could not post all receive, requested %d, actually posted %d\n", my_rank, max_batch_len, nrecv); return 1; } if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { - fprintf(stderr, "Couldn't get port info\n"); + gpu_err("[%d] Couldn't get port info\n", my_rank); return 1; } my_dest.lid = ctx->portinfo.lid; @@ -973,7 +1221,7 @@ int main(int argc, char *argv[]) if (gidx >= 0) { if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { - fprintf(stderr, "Could not get local gid for gid index " + gpu_err("Could not get local gid for gid index " "%d\n", gidx); return 1; } @@ -1000,7 +1248,7 @@ int main(int argc, char *argv[]) }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { - fprintf(stderr, "Failed to modify QP to RTR\n"); + gpu_err("Failed to modify QP to RTR\n"); return 1; } @@ -1012,7 +1260,7 @@ int main(int argc, char *argv[]) if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { - fprintf(stderr, "Failed to modify QP to RTS\n"); + gpu_err("Failed to modify QP to RTS\n"); return 1; } @@ -1025,10 +1273,16 @@ int main(int argc, char *argv[]) .src_path_bits = 0, .port_num = ib_port }; + if (rem_dest->gid.global.interface_id) { + ah_attr.is_global = 1; + ah_attr.grh.hop_limit = 1; + ah_attr.grh.dgid = rem_dest->gid; + ah_attr.grh.sgid_index = gidx; + } ctx->ah = ibv_create_ah(ctx->pd, &ah_attr); if (!ctx->ah) { - fprintf(stderr, "Failed to create AH\n"); + gpu_err("Failed to create AH\n"); return 1; } @@ -1051,7 +1305,7 @@ int main(int argc, char *argv[]) if (ibv_modify_qp(ctx->qp, &attr, (IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC))) { - fprintf(stderr, "Failed to modify QP to RTR\n"); + gpu_err("Failed to modify QP to RTR\n"); return 1; } @@ -1066,28 +1320,25 @@ int main(int argc, char *argv[]) if (ibv_modify_qp(ctx->qp, &attr, (IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC))) { - fprintf(stderr, "Failed to modify QP to RTS\n"); + gpu_err("Failed to modify QP to RTS\n"); return 1; } } MPI_Barrier(MPI_COMM_WORLD); - if (gettimeofday(&start, NULL)) { - perror("gettimeofday"); - ret = 1; - goto out; - } - // for performance reasons, multiple batches back-to-back are posted here - rcnt = scnt = 0; + ctx->rcnt = 0; + ctx->scnt = 0; + ctx->n_tx_ev = 0; + ctx->n_rx_ev = 0; nposted = 0; routs = 0; const int n_batches = 3; //int prev_batch_len = 0; int last_batch_len = 0; int n_post = 0; - int n_posted; + int n_posted = 0; int batch; int ii; @@ -1098,40 +1349,44 @@ int main(int 
argc, char *argv[]) } } - for (batch=0; batchrx_depth/2, iters-nposted), max_batch_len); - n_posted = pp_post_work(ctx, n_post, 0, rem_dest->qpn, servername?1:0); - if (n_posted != n_post) { - fprintf(stderr, "ERROR: Couldn't post work, got %d requested %d\n", n_posted, n_post); + float pre_post_us = 0; + + { + if (gettimeofday(&start, NULL)) { + gpu_err("gettimeofday"); ret = 1; goto out; } - routs += n_posted; - nposted += n_posted; - //prev_batch_len = last_batch_len; - last_batch_len = n_posted; - printf("[%d] batch %d: posted %d sequences\n", my_rank, batch, n_posted); - } - - ctx->pending = PINGPONG_RECV_WRID; - float pre_post_us = 0; - - if (gettimeofday(&end, NULL)) { - perror("gettimeofday"); - ret = 1; - goto out; - } - { + for (batch=0; batchrx_depth/2, iters-nposted), max_batch_len); + n_posted = pp_post_work(ctx, n_post, 0, rem_dest->qpn, servername?1:0); + if (n_posted != n_post) { + gpu_err("[%d] Couldn't post work, got %d requested %d\n", my_rank, n_posted, n_post); + ret = 1; + goto out; + } + routs += n_posted; + nposted += n_posted; + //prev_batch_len = last_batch_len; + last_batch_len = n_posted; + printf("[%d] batch %d: posted %d sequences\n", my_rank, batch, n_posted); + } + if (gettimeofday(&end, NULL)) { + gpu_err("gettimeofday"); + ret = 1; + goto out; + } float usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); printf("pre-posting took %.2f usec\n", usec); pre_post_us = usec; - } + } + ctx->pending = PINGPONG_RECV_WRID; if (!my_rank) { puts(""); - printf("batch info: rx+kernel+tx %d per batch\n", n_posted); // this is the last actually + if (ctx->peersync) printf("batch info: rx+kernel+tx %d per batch\n", n_posted); // this is the last actually printf("pre-posted %d sequences in %d batches\n", nposted, 2); printf("GPU kernel calc buf size: %d\n", ctx->calc_size); printf("iters=%d tx/rx_depth=%d\n", iters, ctx->rx_depth); @@ -1148,11 +1403,30 @@ int main(int argc, char *argv[]) prof_idx = 0; int got_error = 0; int iter = 0; - while ((rcnt < iters || scnt < iters) && !got_error) { + while ((ctx->rcnt < iters || ctx->scnt < iters) && !got_error && !stream_cb_error) { ++iter; PROF(&prof, prof_idx++); - //printf("before tracking\n"); fflush(stdout); +#if 0 + if (!ctx->peersync) { + n_post = 1; + int n = pp_post_work(ctx, n_post, nposted, rem_dest->qpn, servername?1:0); + if (n != n_post) { + gpu_err("[%d] post_work error (%d) rcnt=%d n_post=%d routs=%d\n", my_rank, n, ctx->rcnt, n_post, routs); + return 1; + } + last_batch_len = n; + routs += n; + nposted += n; + + PROF(&prof, prof_idx++); + prof_update(&prof); + prof_idx = 0; + + continue; + } +#endif + int ret = gpu_wait_tracking_event(1000*1000); if (ret == ENOMEM) { printf("gpu_wait_tracking_event nothing to do (%d)\n", ret); @@ -1161,100 +1435,53 @@ int main(int argc, char *argv[]) prof_reset(&prof); continue; } else if (ret) { - fprintf(stderr, "gpu_wait_tracking_event failed (%d)\n", ret); + gpu_err("gpu_wait_tracking_event failed (%d)\n", ret); got_error = ret; } - //gpu_infoc(20, "after tracking\n"); PROF(&prof, prof_idx++); - // don't call poll_cq on events which are still being polled by the GPU - int n_rx_ev = 0; - if (!ctx->consume_rx_cqe) { - struct ibv_wc wc[max_batch_len]; - int ne = 0, i; - - ne = ibv_poll_cq(ctx->rx_cq, max_batch_len, wc); - if (ne < 0) { - fprintf(stderr, "poll RX CQ failed %d\n", ne); - return 1; - } - n_rx_ev += ne; - //if (ne) printf("ne=%d\n", ne); - for (i = 0; i < ne; ++i) { - if (wc[i].status != IBV_WC_SUCCESS) { - fprintf(stderr, "Failed status %s 
(%d) for wr_id %d\n", - ibv_wc_status_str(wc[i].status), - wc[i].status, (int) wc[i].wr_id); - return 1; - } - - switch ((int) wc[i].wr_id) { - case PINGPONG_RECV_WRID: - ++rcnt; - break; - default: - fprintf(stderr, "Completion for unknown wr_id %d\n", - (int) wc[i].wr_id); - return 1; - } - } + if (ctx->consume_rx_cqe) { + gpu_err("consume_rx_cqe!!!!!!\n"); + ctx->n_rx_ev = last_batch_len; + ctx->rcnt += last_batch_len; } else { - n_rx_ev = last_batch_len; - rcnt += last_batch_len; + ret = poll_recv_cq(ctx); + if (ret) { + gpu_err("error in poll_recv_cq\n"); + exit(EXIT_FAILURE); + } } PROF(&prof, prof_idx++); - int n_tx_ev = 0; - { - struct ibv_wc wc[max_batch_len]; - int ne, i; - ne = ibv_poll_cq(ctx->tx_cq, max_batch_len, wc); - if (ne < 0) { - fprintf(stderr, "poll TX CQ failed %d\n", ne); - return 1; - } - n_tx_ev += ne; - for (i = 0; i < ne; ++i) { - if (wc[i].status != IBV_WC_SUCCESS) { - fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", - ibv_wc_status_str(wc[i].status), - wc[i].status, (int) wc[i].wr_id); - return 1; - } - - switch ((int) wc[i].wr_id) { - case PINGPONG_SEND_WRID: - ++scnt; - break; - default: - fprintf(stderr, "Completion for unknown wr_id %d\n", - (int) wc[i].wr_id); - ret = 1; - goto out; - } - } + ret = poll_send_cq(ctx); + if (ret) { + gpu_err("error in poll_send_cq\n"); + exit(EXIT_FAILURE); } + PROF(&prof, prof_idx++); - if (1 && (n_tx_ev || n_rx_ev)) { - //fprintf(stderr, "iter=%d n_rx_ev=%d, n_tx_ev=%d\n", iter, n_rx_ev, n_tx_ev); fflush(stdout); + + if (0 && (ctx->n_tx_ev || ctx->n_rx_ev)) { + gpu_err("iter=%d n_rx_ev=%d, n_tx_ev=%d\n", iter, ctx->n_rx_ev, ctx->n_tx_ev); + fflush(stdout); } - if (n_tx_ev || n_rx_ev) { + if (ctx->n_tx_ev || ctx->n_rx_ev) { // update counters routs -= last_batch_len; //prev_batch_len = last_batch_len; - if (n_tx_ev != last_batch_len) - fprintf(stderr, "[%d] unexpected tx ev %d, batch len %d\n", iter, n_tx_ev, last_batch_len); - if (n_rx_ev != last_batch_len) - fprintf(stderr, "[%d] unexpected rx ev %d, batch len %d\n", iter, n_rx_ev, last_batch_len); + if (ctx->n_tx_ev != last_batch_len) + gpu_err("[%d] iter:%d unexpected tx ev %d, batch len %d\n", my_rank, iter, ctx->n_tx_ev, last_batch_len); + if (ctx->n_rx_ev != last_batch_len) + gpu_err("[%d] iter:%d unexpected rx ev %d, batch len %d\n", my_rank, iter, ctx->n_rx_ev, last_batch_len); if (nposted < iters) { //fprintf(stdout, "rcnt=%d scnt=%d routs=%d nposted=%d\n", rcnt, scnt, routs, nposted); fflush(stdout); // potentially submit new work n_post = min(min(ctx->rx_depth/2, iters-nposted), max_batch_len); int n = pp_post_work(ctx, n_post, nposted, rem_dest->qpn, servername?1:0); if (n != n_post) { - fprintf(stderr, "ERROR: post_work error (%d) rcnt=%d n_post=%d routs=%d\n", n, rcnt, n_post, routs); + gpu_err("ERROR: post_work error (%d) rcnt=%d n_post=%d routs=%d\n", n, ctx->rcnt, n_post, routs); return 1; } last_batch_len = n; @@ -1272,7 +1499,7 @@ int main(int argc, char *argv[]) if (got_error) { - fprintf(stderr, "exiting for error\n"); + gpu_err("exiting for error\n"); return 1; } } @@ -1310,7 +1537,7 @@ int main(int argc, char *argv[]) if (gds_enable_event_prof) { for (ii = 0; ii < event_idx; ii++) { cudaEventElapsedTime(&elapsed_time, start_time[ii], stop_time[ii]); - fprintf(stderr, "[%d] size = %d, time = %f\n", my_rank, ctx->size, 1000 * elapsed_time); + gpu_err("[%d] size = %d, time = %f\n", my_rank, ctx->size, 1000 * elapsed_time); } for (ii = 0; ii < MAX_EVENTS; ii++) { cudaEventDestroy(stop_time[ii]); diff --git a/tests/gds_kernel_loopback_latency.c 
b/tests/gds_kernel_loopback_latency.c index 1efddbe..b2d209c 100644 --- a/tests/gds_kernel_loopback_latency.c +++ b/tests/gds_kernel_loopback_latency.c @@ -71,12 +71,8 @@ int prof_idx = 0; //----------------------------------------------------------------------------- -#if 1 -#define dbg(FMT, ARGS...) do {} while(0) -#else -#define dbg_msg(FMT, ARGS...) fprintf(stderr, "DBG [%s] " FMT, __FUNCTION__ ,##ARGS) -#define dbg(FMT, ARGS...) dbg_msg("DBG: ", FMT, ## ARGS) -#endif + +#define dbg(FMT, ARGS...) gpu_dbg(FMT, ## ARGS) #define min(A,B) ((A)<(B)?(A):(B)) @@ -88,6 +84,7 @@ enum { }; static int page_size; +int stream_cb_error = 0; struct pingpong_context { struct ibv_context *context; @@ -115,6 +112,7 @@ struct pingpong_context { int consume_rx_cqe; int gpumem; int use_desc_apis; + int skip_kernel_launch; }; static int my_rank = 0, comm_size = 1; @@ -215,7 +213,8 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int consume_rx_cqe, int sched_mode, int use_gpumem, - int use_desc_apis) + int use_desc_apis, + int skip_kernel_launch) { struct pingpong_context *ctx; @@ -234,11 +233,12 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->gpu_id = gpu_id; ctx->gpumem = use_gpumem; ctx->use_desc_apis = use_desc_apis; + ctx->skip_kernel_launch = skip_kernel_launch; size_t alloc_size = 3 * align_to(size + 40, page_size); if (ctx->gpumem) { - printf("allocating GPU memory buf\n"); ctx->buf = gpu_malloc(page_size, alloc_size); + printf("allocated GPU buffer address at %p\n", ctx->buf); } else { printf("allocating CPU memory buf\n"); ctx->buf = memalign(page_size, alloc_size); @@ -272,16 +272,19 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, memset(ctx->buf, 0, alloc_size); memset(ctx->rx_flag, 0, alloc_size); - gpu_register_host_mem(ctx->rx_flag, alloc_size); - - // pipe-cleaner - gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); - gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); - gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); - //gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); - //gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); - //gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); - CUCHECK(cuCtxSynchronize()); + //gpu_register_host_mem(ctx->rx_flag, alloc_size); + + if (!ctx->skip_kernel_launch) { + // pipe-cleaner + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); + // client stream is not really used + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); + gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_client); + CUCHECK(cuCtxSynchronize()); + } ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { @@ -437,6 +440,42 @@ int pp_close_ctx(struct pingpong_context *ctx) return 0; } +static int block_server_stream(struct pingpong_context *ctx) +{ + gds_descriptor_t desc; + desc.tag = GDS_TAG_WAIT_VALUE32; + gds_prepare_wait_value32(&desc.wait32, (uint32_t *)ctx->rx_flag, 1, GDS_WAIT_COND_GEQ, GDS_MEMORY_HOST); + + gds_atomic_set_dword(desc.wait32.ptr, 0); + gds_wmb(); + + gpu_dbg("before 
gds_stream_post_descriptors\n"); + CUCHECK(gds_stream_post_descriptors(gpu_stream_server, 1, &desc, 0)); + gpu_dbg("after gds_stream_post_descriptors\n"); + return 0; +} + +static int unblock_server_stream(struct pingpong_context *ctx) +{ + int retcode = 0; + usleep(100); + int ret = cuStreamQuery(gpu_stream_server); + switch (ret) { + case CUDA_ERROR_NOT_READY: + break; + case CUDA_SUCCESS: + gpu_err("unexpected idle stream\n"); + retcode = EINVAL; + break; + default: + gpu_err("unexpected error %d in stream query\n", ret); + retcode = EINVAL; + break; + } + gds_atomic_set_dword((uint32_t *)ctx->rx_flag, 1); + return 0; +} + static int pp_post_recv(struct pingpong_context *ctx, int n) { struct ibv_sge list = { @@ -451,15 +490,43 @@ static int pp_post_recv(struct pingpong_context *ctx, int n) }; struct ibv_recv_wr *bad_wr; int i; - + gpu_dbg("posting %d recvs\n", n); for (i = 0; i < n; ++i) if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) break; - + gpu_dbg("posted %d recvs\n", i); return i; } -static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream *gpu_stream) +// will be needed when implementing the !peersync !use_desc_apis case +static int pp_post_send(struct pingpong_context *ctx, uint32_t qpn) +{ + int ret = 0; + struct ibv_sge list = { + .addr = (uintptr_t) ctx->txbuf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + gds_send_wr ewr = { + .wr_id = PINGPONG_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .exp_opcode = IBV_EXP_WR_SEND, + .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .wr = { + .ud = { + .ah = ctx->ah, + .remote_qpn = qpn, + .remote_qkey = 0x11111111 + } + }, + .comp_mask = 0 + }; + gds_send_wr *bad_ewr; + return gds_post_send(ctx->gds_qp, &ewr, &bad_ewr); +} + +static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream *p_gpu_stream) { int ret = 0; struct ibv_sge list = { @@ -483,8 +550,7 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream .comp_mask = 0 }; gds_send_wr *bad_ewr; - //printf("gpu_post_send_on_stream\n"); - return gds_stream_queue_send(*gpu_stream, ctx->gds_qp, &ewr, &bad_ewr); + return gds_stream_queue_send(*p_gpu_stream, ctx->gds_qp, &ewr, &bad_ewr); } static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_send_request_t *req) @@ -511,8 +577,38 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s .comp_mask = 0 }; gds_send_wr *bad_ewr; - //printf("gpu_post_send_on_stream\n"); - return gds_prepare_send(ctx->gds_qp, &ewr, &bad_ewr, req); + //printf("gpu_post_send_on_stream\n"); + return gds_prepare_send(ctx->gds_qp, &ewr, &bad_ewr, req); +} + +typedef struct work_desc { + gds_send_request_t send_rq; + gds_wait_request_t wait_tx_rq; + gds_wait_request_t wait_rx_rq; +#define N_WORK_DESCS 3 + gds_descriptor_t descs[N_WORK_DESCS]; +} work_desc_t; + +static void post_work_cb(CUstream hStream, CUresult status, void *userData)\ +{ + int retcode; + work_desc_t *wdesc = (work_desc_t *)userData; + gpu_dbg("stream callback wdesc=%p\n", wdesc); + assert(wdesc); + NVTX_PUSH("work_cb", 1); + if (status != CUDA_SUCCESS) { + fprintf(stderr,"ERROR: CUresult %d in stream callback\n", status); + goto out; + } + assert(sizeof(wdesc->descs)/sizeof(wdesc->descs[0]) == N_WORK_DESCS); + retcode = gds_post_descriptors(sizeof(wdesc->descs)/sizeof(wdesc->descs[0]), wdesc->descs, 0); + if (retcode) { + fprintf(stderr,"ERROR: error %d returned by gds_post_descriptors, going on...\n", retcode); + stream_cb_error = 1; + } +out: + free(wdesc); + NVTX_POP(); } 
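
For reference, a minimal sketch of the callback-based posting path used above when PeerSync is disabled: the descriptors are prepared on the CPU, packed into a heap-allocated `work_desc_t`, and handed to `post_work_cb()` through `cuStreamAddCallback()`, so `gds_post_descriptors()` runs from a host callback once the stream reaches that point. The helper below is illustrative only, not part of the patch; it reuses `work_desc_t`, `post_work_cb()`, `pp_prepare_gpu_send()`, `gds_prepare_wait_cq()` and `CUCHECK()` from this file and assumes a connected QP in `ctx->gds_qp` and a valid stream.

```c
/* Illustrative only: queue one send + wait(tx CQ) + wait(rx CQ) sequence via a
 * CUDA stream callback instead of gds_stream_post_descriptors(). */
static int post_sequence_via_callback(struct pingpong_context *ctx,
                                      uint32_t qpn, CUstream stream)
{
        work_desc_t *wdesc = calloc(1, sizeof(*wdesc));
        int k = 0, ret;

        if (!wdesc)
                return -ENOMEM;

        ret = pp_prepare_gpu_send(ctx, qpn, &wdesc->send_rq);
        if (ret)
                goto err;
        wdesc->descs[k].tag  = GDS_TAG_SEND;
        wdesc->descs[k].send = &wdesc->send_rq;
        ++k;

        ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
        if (ret)
                goto err;
        wdesc->descs[k].tag  = GDS_TAG_WAIT;
        wdesc->descs[k].wait = &wdesc->wait_tx_rq;
        ++k;

        ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
        if (ret)
                goto err;
        wdesc->descs[k].tag  = GDS_TAG_WAIT;
        wdesc->descs[k].wait = &wdesc->wait_rx_rq;
        ++k;

        /* post_work_cb() fires on a CUDA driver thread when the stream reaches
         * this point; it posts all N_WORK_DESCS entries with
         * gds_post_descriptors() and frees wdesc, so the caller must not touch
         * wdesc after this call. */
        CUCHECK(cuStreamAddCallback(stream, post_work_cb, wdesc, 0));
        return 0;

err:
        free(wdesc);
        return -ret;
}
```

This mirrors the `use_desc_apis && !peersync` branch of `pp_post_work()` rather than replacing it; the only difference is that the hand-off to the callback is made explicit in one place.
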
static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uint32_t qpn, int is_client) @@ -521,11 +617,14 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin int i, ret = 0; int posted_recv = 0; - //printf("post_work posting %d\n", n_posts); + gpu_dbg("n_posts=%d rcnt=%d is_client=%d\n", n_posts, rcnt, is_client); - if (n_posts <= 0) + if (n_posts <= 0) { + gpu_dbg("nothing to do\n"); return 0; + } + NVTX_PUSH("post recv", 1); posted_recv = pp_post_recv(ctx, n_posts); if (posted_recv < 0) { fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n", @@ -537,52 +636,62 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin if (!posted_recv) return 0; } + NVTX_POP(); PROF(&prof, prof_idx++); - gds_send_request_t send_rq[posted_recv]; - gds_wait_request_t wait_tx_rq[posted_recv]; - gds_wait_request_t wait_rx_rq[posted_recv]; - gds_descriptor_t descs[3]; + NVTX_PUSH("post send+wait", 1); for (i = 0; i < posted_recv; ++i) { if (ctx->use_desc_apis) { + work_desc_t *wdesc = calloc(1, sizeof(*wdesc)); int k = 0; - ret = pp_prepare_gpu_send(ctx, qpn, &send_rq[i]); + ret = pp_prepare_gpu_send(ctx, qpn, &wdesc->send_rq); if (ret) { retcode = -ret; break; } - descs[k].tag = GDS_TAG_SEND; - descs[k].send = &send_rq[i]; + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_SEND; + wdesc->descs[k].send = &wdesc->send_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wait_tx_rq[i], 0); + ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { retcode = -ret; break; } - descs[k].tag = GDS_TAG_WAIT; - descs[k].wait = &wait_tx_rq[i]; + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_tx_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wait_rx_rq[i], 0); + ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { retcode = -ret; break; } - descs[k].tag = GDS_TAG_WAIT; - descs[k].wait = &wait_rx_rq[i]; + assert(k < N_WORK_DESCS); + wdesc->descs[k].tag = GDS_TAG_WAIT; + wdesc->descs[k].wait = &wdesc->wait_rx_rq; ++k; - ret = gds_stream_post_descriptors(gpu_stream_server, k, descs, 0); - if (ret) { - retcode = -ret; - break; + if (ctx->peersync) { + gpu_dbg("before gds_stream_post_descriptors\n"); + ret = gds_stream_post_descriptors(gpu_stream_server, k, wdesc->descs, 0); + gpu_dbg("after gds_stream_post_descriptors\n"); + free(wdesc); + if (ret) { + retcode = -ret; + break; + } + } else { + gpu_dbg("adding post_work_cb to stream=%p\n", gpu_stream_server); + CUCHECK(cuStreamAddCallback(gpu_stream_server, post_work_cb, wdesc, 0)); } - } else { + } else if (ctx->peersync) { ret = pp_post_gpu_send(ctx, qpn, &gpu_stream_server); if (ret) { - fprintf(stderr,"ERROR: error %d in pp_post_gpu_send, posted_recv=%d posted_so_far=%d is_client=%d \n", + gpu_err("error %d in pp_post_gpu_send, posted_recv=%d posted_so_far=%d is_client=%d \n", ret, posted_recv, i, is_client); retcode = -ret; break; @@ -591,7 +700,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin ret = gds_stream_wait_cq(gpu_stream_server, &ctx->gds_qp->send_cq, 0); if (ret) { // TODO: rollback gpu send - fprintf(stderr, "ERROR: error %d in gds_stream_wait_cq\n", ret); + gpu_err("error %d in gds_stream_wait_cq\n", ret); retcode = -ret; break; } @@ -599,14 +708,21 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin ret = gds_stream_wait_cq(gpu_stream_server, 
&ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); if (ret) { // TODO: rollback gpu send and wait send_cq - fprintf(stderr, "ERROR: error %d in gds_stream_wait_cq\n", ret); + gpu_err("error %d in gds_stream_wait_cq\n", ret); //exit(EXIT_FAILURE); retcode = -ret; break; } + } else { + gpu_err("!peersync case only supported when using descriptor APIs\n"); + retcode = -EINVAL; + break; } - if (ctx->calc_size) + if (ctx->skip_kernel_launch) { + gpu_warn_once("NOT LAUNCHING ANY KERNEL AT ALL\n"); + } else { gpu_launch_kernel_on_stream(ctx->calc_size, ctx->peersync, gpu_stream_server); + } } PROF(&prof, prof_idx++); @@ -615,8 +731,8 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin gpu_post_release_tracking_event(&gpu_stream_server); //sleep(1); } + NVTX_POP(); -out: return retcode; } @@ -638,12 +754,15 @@ static void usage(const char *argv0) printf(" -S, --gpu-calc-size= size of GPU compute buffer (default 128KB)\n"); printf(" -G, --gpu-id use specified GPU (default 0)\n"); printf(" -B, --batch-length= max batch length (default 20)\n"); - printf(" -P, --peersync enable GPUDirect PeerSync support (default enabled)\n"); + printf(" -P, --peersync disable GPUDirect PeerSync support (default enabled)\n"); printf(" -C, --peersync-gpu-cq enable GPUDirect PeerSync GPU CQ support (default disabled)\n"); printf(" -D, --peersync-gpu-dbrec enable QP DBREC on GPU memory (default disabled)\n"); printf(" -U, --peersync-desc-apis use batched descriptor APIs (default disabled)\n"); printf(" -Q, --consume-rx-cqe enable GPU consumes RX CQE support (default disabled)\n"); printf(" -M, --gpu-sched-mode set CUDA context sched mode, default (A)UTO, (S)PIN, (Y)IELD, (B)LOCKING\n"); + printf(" -E, --gpu-mem allocate GPU intead of CPU memory buffers\n"); + printf(" -K, --skip-kernel-launch no GPU kernel computations, only communications\n"); + printf(" -L, --hide-cpu-launch-latency try to prelaunch work on blocked stream then unblock\n"); } int main(int argc, char *argv[]) @@ -682,6 +801,8 @@ int main(int argc, char *argv[]) int wait_key = -1; int use_gpumem = 0; int use_desc_apis = 0; + int skip_kernel_launch = 0; + int hide_cpu_launch_latency = 0; fprintf(stdout, "libgdsync build version 0x%08x, major=%d minor=%d\n", GDS_API_VERSION, GDS_API_MAJOR_VERSION, GDS_API_MINOR_VERSION); @@ -723,10 +844,12 @@ int main(int argc, char *argv[]) { .name = "gpu-sched-mode", .has_arg = 1, .val = 'M' }, { .name = "gpu-mem", .has_arg = 0, .val = 'E' }, { .name = "wait-key", .has_arg = 1, .val = 'W' }, + { .name = "skip-kernel-launch", .has_arg = 0, .val = 'K' }, + { .name = "hide-cpu-launch-latency", .has_arg = 0, .val = 'L' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:G:S:B:PCDQM:W:EU", long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:G:S:B:PCDQM:W:EUKL", long_options, NULL); if (c == -1) break; @@ -755,10 +878,12 @@ int main(int argc, char *argv[]) case 's': size = strtol(optarg, NULL, 0); + printf("INFO: message size=%d\n", size); break; case 'S': calc_size = strtol(optarg, NULL, 0); + printf("INFO: kernel calc size=%d\n", calc_size); break; case 'r': @@ -794,6 +919,9 @@ int main(int argc, char *argv[]) case 'P': peersync = !peersync; printf("INFO: switching PeerSync %s\n", peersync?"ON":"OFF"); + if (!peersync) { + printf("WARNING: PeerSync OFF is approximated using CUDA stream callbacks\n"); + } break; case 'Q': @@ -840,11 +968,28 @@ int main(int argc, char *argv[]) printf("INFO: use_desc_apis=%d\n", use_desc_apis); break; + case 'K': + skip_kernel_launch = 1; + 
printf("INFO: skip_kernel_launch=%d\n", skip_kernel_launch); + break; + + case 'L': + hide_cpu_launch_latency = 1; + printf("INFO: hide_cpu_launch_latency=%d\n", hide_cpu_launch_latency); + break; + default: usage(argv[0]); return 1; } } + + if (!peersync && !use_desc_apis) { + gpu_err("!peersync case only supported when using descriptor APIs, enabling them\n"); + use_desc_apis = 1; + return 1; + } + assert(comm_size == 1); char *hostnames[1] = {"localhost"}; @@ -854,12 +999,8 @@ int main(int argc, char *argv[]) } const char *tags = NULL; - if (peersync) { - tags = "wait trk|pollrxcq|polltxcq|postrecv|postwork| poketrk"; - } else { - tags = "krn laun|krn sync|postsend|<------>|<------>| sent ev"; - } - prof_init(&prof, 10000, 10000, "10us", 60, 2, tags); + tags = "wait trk|pollrxcq|polltxcq|postrecv|postwork| poketrk"; + prof_init(&prof, 100000, 100000, "100us", 60, 2, tags); //prof_init(&prof, 100, 100, "100ns", 25*4, 2, tags); prof_disable(&prof); @@ -908,7 +1049,7 @@ int main(int argc, char *argv[]) } } printf("use gpumem: %d\n", use_gpumem); - ctx = pp_init_ctx(ib_dev, size, calc_size, rx_depth, ib_port, 0, gpu_id, peersync, peersync_gpu_cq, peersync_gpu_dbrec, consume_rx_cqe, sched_mode, use_gpumem, use_desc_apis); + ctx = pp_init_ctx(ib_dev, size, calc_size, rx_depth, ib_port, 0, gpu_id, peersync, peersync_gpu_cq, peersync_gpu_dbrec, consume_rx_cqe, sched_mode, use_gpumem, use_desc_apis, skip_kernel_launch); if (!ctx) return 1; @@ -959,6 +1100,11 @@ int main(int argc, char *argv[]) //sleep(1); } + if (hide_cpu_launch_latency) { + printf("INFO: blocking stream ...\n"); + block_server_stream(ctx); + } + if (gettimeofday(&start, NULL)) { perror("gettimeofday"); ret = 1; @@ -980,8 +1126,15 @@ int main(int argc, char *argv[]) int batch; for (batch=0; batchrx_depth/2, iters-nposted), max_batch_len); + gpu_dbg("batch=%d n_post=%d\n", batch, n_post); n_posted = pp_post_work(ctx, n_post, 0, rem_dest->qpn, servername?1:0); + PROF(&prof, prof_idx++); if (n_posted < 0) { fprintf(stderr, "ERROR: got error %d\n", n_posted); ret = 1; @@ -1015,6 +1168,14 @@ int main(int argc, char *argv[]) pre_post_us = usec; } + if (hide_cpu_launch_latency) { + printf("ignoring pre-posting time and unblocking the stream\n"); + pre_post_us = 0; + if (unblock_server_stream(ctx)) { + exit(EXIT_FAILURE); + } + } + if (!my_rank) { puts(""); printf("batch info: rx+kernel+tx %d per batch\n", n_posted); // this is the last actually @@ -1034,20 +1195,20 @@ int main(int argc, char *argv[]) prof_idx = 0; int got_error = 0; int iter = 0; - while ((rcnt < iters && scnt < iters) && !got_error) { + while ((rcnt < iters && scnt < iters) && !got_error && !stream_cb_error) { ++iter; PROF(&prof, prof_idx++); //printf("before tracking\n"); fflush(stdout); int ret = gpu_wait_tracking_event(1000*1000); if (ret == ENOMEM) { - dbg("gpu_wait_tracking_event nothing to do (%d)\n", ret); + gpu_dbg("gpu_wait_tracking_event nothing to do (%d)\n", ret); } else if (ret == EAGAIN) { - fprintf(stderr, "gpu_wait_tracking_event timout (%d), retrying\n", ret); + gpu_err("gpu_wait_tracking_event timout (%d), retrying\n", ret); prof_reset(&prof); continue; } else if (ret) { - fprintf(stderr, "gpu_wait_tracking_event failed (%d)\n", ret); + gpu_err("gpu_wait_tracking_event failed (%d)\n", ret); got_error = ret; } //gpu_infoc(20, "after tracking\n"); @@ -1122,6 +1283,7 @@ int main(int argc, char *argv[]) } } } + PROF(&prof, prof_idx++); if (1 && (n_tx_ev || n_rx_ev)) { //fprintf(stderr, "iter=%d n_rx_ev=%d, n_tx_ev=%d\n", iter, n_rx_ev, n_tx_ev); 
fflush(stdout); @@ -1131,9 +1293,9 @@ int main(int argc, char *argv[]) routs -= last_batch_len; //prev_batch_len = last_batch_len; if (n_tx_ev != last_batch_len) - dbg("[%d] partially completed batch, got tx ev %d, batch len %d\n", iter, n_tx_ev, last_batch_len); + gpu_dbg("[%d] partially completed batch, got tx ev %d, batch len %d\n", iter, n_tx_ev, last_batch_len); if (n_rx_ev != last_batch_len) - dbg("[%d] partially completed batch, got rx ev %d, batch len %d\n", iter, n_rx_ev, last_batch_len); + gpu_dbg("[%d] partially completed batch, got rx ev %d, batch len %d\n", iter, n_rx_ev, last_batch_len); if (nposted < iters) { //fprintf(stdout, "rcnt=%d scnt=%d routs=%d nposted=%d\n", rcnt, scnt, routs, nposted); fflush(stdout); // potentially submit new work @@ -1148,6 +1310,9 @@ int main(int argc, char *argv[]) nposted += n; //fprintf(stdout, "n_post=%d n=%d\n", n_post, n); } + } else { + PROF(&prof, prof_idx++); + PROF(&prof, prof_idx++); } //usleep(10); PROF(&prof, prof_idx++); @@ -1157,10 +1322,10 @@ int main(int argc, char *argv[]) //fprintf(stdout, "%d %d\n", rcnt, scnt); fflush(stdout); - if (got_error) { + if (got_error || stream_cb_error) { //fprintf(stderr, "sleeping 10s then exiting for error\n"); //sleep(10); - fprintf(stderr, "exiting for error\n"); + gpu_err("[%d] exiting due to error(s)\n", my_rank); return 1; } diff --git a/tests/gds_poll_lat.c b/tests/gds_poll_lat.c index 7a22d79..9d2d336 100644 --- a/tests/gds_poll_lat.c +++ b/tests/gds_poll_lat.c @@ -35,6 +35,7 @@ int main(int argc, char *argv[]) int use_flush = 0; int use_combined = 0; int use_membar = 0; + int use_wrmem = 0; int wait_key = -1; //CUstream gpu_stream; @@ -44,7 +45,7 @@ int main(int argc, char *argv[]) while(1) { int c; - c = getopt(argc, argv, "cd:p:n:s:hfgP:mW:"); + c = getopt(argc, argv, "cd:p:n:s:hfgP:mW:w"); if (c == -1) break; @@ -77,22 +78,32 @@ int main(int argc, char *argv[]) break; case 'f': use_flush = 1; - printf("INFO enabling flush\n"); + gpu_info("enabling flush\n"); break; case 'g': use_gpu_buf = 1; - printf("INFO polling on GPU buffer\n"); + gpu_info("polling on GPU buffer\n"); break; + case 'w': + use_wrmem = 1; + gpu_info("enabling use of WRITE_MEMORY\n"); + break; + case '?': case 'h': printf(" %s [-n ][-s ][-p # bg streams][-P # pokes][ckhfgomW]\n", argv[0]); exit(EXIT_SUCCESS); break; default: - printf("ERROR: invalid option\n"); + gpu_err("invalid option '%c'\n", c); exit(EXIT_FAILURE); } } + if (n_pokes < 1) { + gpu_err("n_pokes must be 1 at least\n"); + exit(EXIT_FAILURE); + } + CUstream bg_streams[n_bg_streams]; memset(bg_streams, 0, sizeof(bg_streams)); @@ -103,28 +114,26 @@ int main(int argc, char *argv[]) const char *tags = "postpoll|que poke| sleep| set dw|pollpoke|str sync"; if ( /*prof_init(&prof, 1000, 1000, "1ms", 50, 1, tags)*/ prof_init(&prof, 100, 100, "100ns", 25*4*2, 5, tags)) { - fprintf(stderr, "error in prof_init init.\n"); + gpu_err("error in prof_init init.\n"); exit(EXIT_FAILURE); } if (gpu_init(gpu_id, CU_CTX_SCHED_AUTO)) { - fprintf(stderr, "error in GPU init.\n"); + gpu_err("error in GPU init.\n"); exit(EXIT_FAILURE); } //CUCHECK(cuStreamCreate(&gpu_stream, 0)); - puts(""); - printf("number iterations %d\n", num_iters); - printf("num dwords per poke %zu\n", n_pokes); - printf("use poll flush %d\n", use_flush); - printf("use poke membar %d\n", use_membar); - printf("use %d background streams\n", n_bg_streams); - printf("sleep %dus\n", sleep_us); - printf("buffer size %zd\n", size); - printf("poll on %s buffer\n", use_gpu_buf?"GPU":"CPU"); - printf("write on %s 
buffer\n", use_gpu_buf?"GPU":"CPU"); - puts(""); + gpu_info("number iterations %d\n", num_iters); + gpu_info("num dwords per poke %zu\n", n_pokes); + gpu_info("use poll flush %d\n", use_flush); + gpu_info("use poke membar %d\n", use_membar); + gpu_info("use %d background streams\n", n_bg_streams); + gpu_info("sleep %dus\n", sleep_us); + gpu_info("buffer size %zd\n", size); + gpu_info("poll on %s buffer\n", use_gpu_buf?"GPU":"CPU"); + gpu_info("write on %s buffer\n", use_gpu_buf?"GPU":"CPU"); gds_mem_desc_t desc = {0,}; ret = gds_alloc_mapped_memory(&desc, size, use_gpu_buf?GDS_MEMORY_GPU:GDS_MEMORY_HOST); @@ -134,7 +143,7 @@ int main(int argc, char *argv[]) } CUdeviceptr d_buf = desc.d_ptr; void *h_buf = desc.h_ptr; - printf("allocated d_buf=%p h_buf=%p\n", (void*)d_buf, h_buf); + gpu_info("allocated d_buf=%p h_buf=%p\n", (void*)d_buf, h_buf); memset(h_buf, 0, size); gds_mem_desc_t desc_data = {0,}; @@ -145,7 +154,7 @@ int main(int argc, char *argv[]) } CUdeviceptr d_data = desc_data.d_ptr; uint32_t *h_data = desc_data.h_ptr; - printf("allocated d_data=%p h_data=%p\n", (void*)d_data, h_data); + gpu_info("allocated d_data=%p h_data=%p\n", (void*)d_data, h_data); memset(h_data, 0, size); int i; @@ -158,17 +167,21 @@ int main(int argc, char *argv[]) uint32_t *h_bg_buf = NULL; if (n_bg_streams) { - printf("launching background %dx poll\n", n_bg_streams); + gpu_info("launching background %dx poll\n", n_bg_streams); ASSERT(!posix_memalign((void*)&h_bg_buf, page_size, size)); memset(h_bg_buf, 0, size); for (i=0; i warmup) { double avg_wait_us = (double)delta_t / (double)(num_iters - warmup); - printf("average wait time: %fus\n", avg_wait_us); + printf("sleep time: %d average wait time: %fus\n", sleep_us, avg_wait_us); } perf_stop(); prof_dump(&prof); err: if (n_bg_streams) { - printf("signaling %d background polling stream(s)\n", n_bg_streams); + gpu_info("signaling %d background polling stream(s)\n", n_bg_streams); int s; for (s=0; s #include +#include "config.h" #include "test_utils.h" #include "gpu.h" +#define CHUNK_SIZE 16 + int poll_dword_geq(uint32_t *ptr, uint32_t payload, gds_us_t tm) { gds_us_t start = gds_get_time_us(); @@ -53,18 +56,12 @@ int main(int argc, char *argv[]) size_t size = 1024*64; int use_gpu_buf = 0; int use_flush = 0; - int use_combined = 0; int use_membar = 0; - int wait_key = -1; - //CUstream gpu_stream; - - int n_bg_streams = 0; - - size_t n_pokes = 1; + CUstream gpu_stream; while(1) { int c; - c = getopt(argc, argv, "cd:p:n:s:hfgP:mW:"); + c = getopt(argc, argv, "d:n:s:hfgm"); if (c == -1) break; @@ -72,20 +69,6 @@ int main(int argc, char *argv[]) case 'd': gpu_id = strtol(optarg, NULL, 0); break; - case 'W': - wait_key = strtol(optarg, NULL, 0); - break; - case 'p': - n_bg_streams = strtol(optarg, NULL, 0); - break; - case 'c': - // merge poll and multiple pokes - use_combined = 1; - break; - case 'P': - // multiple pokes - n_pokes = strtol(optarg, NULL, 0); - break; case 'm': use_membar = 1; break; @@ -104,7 +87,13 @@ int main(int argc, char *argv[]) printf("INFO polling on GPU buffer\n"); break; case 'h': - printf(" %s [-n ][-s ][-p # bg streams][-P # pokes][ckhfgomW]\n", argv[0]); + printf("Usage:\n" + " %s [-d ][-n ][-s ][hfgm]\n" + "Options:\n" + " -f issue a GPU RDMA flush following each poll\n" + " -g allocate all memory on GPU\n" + " -m issue memory barrier between signal and data stores\n" + " -h this help\n", argv[0]); exit(EXIT_SUCCESS); break; default: @@ -113,9 +102,6 @@ int main(int argc, char *argv[]) } } - CUstream bg_streams[n_bg_streams]; - 
memset(bg_streams, 0, sizeof(bg_streams)); - if (gpu_init(gpu_id, CU_CTX_SCHED_AUTO)) { fprintf(stderr, "error in GPU init.\n"); exit(EXIT_FAILURE); @@ -125,15 +111,14 @@ int main(int argc, char *argv[]) puts(""); printf("number iterations %d\n", num_iters); - printf("num dwords per poke %zu\n", n_pokes); printf("use poll flush %d\n", use_flush); printf("use poke membar %d\n", use_membar); - printf("use %d background streams\n", n_bg_streams); printf("sleep %dus\n", sleep_us); printf("buffer size %zd\n", size); printf("poll on %s buffer\n", use_gpu_buf?"GPU":"CPU"); printf("write on %s buffer\n", use_gpu_buf?"GPU":"CPU"); puts(""); + int mem_type = use_gpu_buf ? GDS_MEMORY_GPU : GDS_MEMORY_HOST; @@ -170,104 +155,104 @@ int main(int argc, char *argv[]) perf_start(); int n_errors = 0; -#define CHUNK_SIZE 3 int round; + int print_dots = 0; for (i = 0, value = 1; i < num_iters; ++i, ++value) { for (round = 0; round < 2; ++round) { ASSERT(value <= INT_MAX); - uint32_t *h_signal = (uint32_t*)h_buf + ((0) % (size/sizeof(uint32_t))); uint32_t *d_signal = (uint32_t*)d_buf + ((0) % (size/sizeof(uint32_t))); + uint32_t *signal = (mem_type == GDS_MEMORY_GPU ? d_signal : h_signal); uint32_t *h_done = (uint32_t*)h_buf + ((1) % (size/sizeof(uint32_t))); uint32_t *d_done = (uint32_t*)d_buf + ((1) % (size/sizeof(uint32_t))); + uint32_t *done = (mem_type == GDS_MEMORY_GPU ? d_done : h_done); uint32_t *h_dbg = (uint32_t*)h_buf + ((2) % (size/sizeof(uint32_t))); uint32_t *d_dbg = (uint32_t*)d_buf + ((2) % (size/sizeof(uint32_t))); + uint32_t *dbg = (mem_type == GDS_MEMORY_GPU ? d_dbg : h_dbg); + // CHUNK_SIZE contiguous blocks of dwords + ASSERT(size >= CHUNK_SIZE*sizeof(uint32_t)); uint32_t *h_vals = (uint32_t*)h_data + ((i*CHUNK_SIZE) % (size/sizeof(uint32_t))); uint32_t *d_vals = (uint32_t*)d_data + ((i*CHUNK_SIZE) % (size/sizeof(uint32_t))); + uint32_t *vals = (mem_type == GDS_MEMORY_GPU ? 
d_vals : h_vals); - uint32_t src_data[CHUNK_SIZE] = {1, 2, 3}; + int ii; + //uint32_t src_data[CHUNK_SIZE] = {1, 2, 3}; + uint32_t src_data[CHUNK_SIZE]; + for (ii=0; ii= deviceCount) { - printf("ERROR: requested GPU gpu_id beyond available\n"); + gpu_err("ERROR: requested GPU gpu_id beyond available\n"); ret = 1; goto out; } @@ -109,42 +108,42 @@ int gpu_init(int gpu_id, int sched_mode) cuDeviceGetAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, gpu_device); cuDeviceGetAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, gpu_device); //printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", pciBusID, pciDeviceID); - printf("GPU id:%d dev:%d name:%s pci %d:%d\n", i, gpu_device, name, pciBusID, pciDeviceID); + gpu_info("GPU id:%d dev:%d name:%s pci %d:%d\n", i, gpu_device, name, pciBusID, pciDeviceID); } CUCHECK(cuDeviceGet(&gpu_device, gpu_id)); - printf("creating CUDA Primary Ctx on device:%d id:%d\n", gpu_device, gpu_id); + gpu_info("creating CUDA Primary Ctx on device:%d id:%d\n", gpu_device, gpu_id); CUCHECK(cuDevicePrimaryCtxRetain(&gpu_ctx, gpu_device)); - printf("making it the current CUDA Ctx\n"); + gpu_dbg("making it the current CUDA Ctx\n"); CUCHECK(cuCtxSetCurrent(gpu_ctx)); // TODO: add a check for canMapHost //CUCHECK(cuDeviceGetProperties(&prop, gpu_device)); cuDeviceGetAttribute(&gpu_num_sm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, gpu_device); - printf("num SMs per GPU:%d\n", gpu_num_sm); + gpu_dbg("num SMs per GPU:%d\n", gpu_num_sm); cuDeviceGetAttribute(&gpu_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, gpu_device); - printf("clock rate:%d\n", gpu_clock_rate); + gpu_dbg("clock rate:%d\n", gpu_clock_rate); CUCHECK(cuStreamCreate(&gpu_stream, 0)); - printf("created main test CUDA stream %p\n", gpu_stream); + gpu_dbg("created main test CUDA stream %p\n", gpu_stream); CUCHECK(cuStreamCreate(&gpu_stream_server, 0)); - printf("created stream server CUDA stream %p\n", gpu_stream_server); + gpu_dbg("created stream server CUDA stream %p\n", gpu_stream_server); CUCHECK(cuStreamCreate(&gpu_stream_client, 0)); - printf("created stream cliebt CUDA stream %p\n", gpu_stream_client); + gpu_dbg("created stream client CUDA stream %p\n", gpu_stream_client); { int n; int ev_flags = CU_EVENT_DISABLE_TIMING; if (CU_CTX_SCHED_BLOCKING_SYNC == sched_mode) { - printf("creating events with blocking sync behavior\n"); + gpu_dbg("creating events with blocking sync behavior\n"); ev_flags |= CU_EVENT_BLOCKING_SYNC; } for (n=0; n +#define CUDA_PROFILER_START_ON_LEVEL(start) { if(start){cudaProfilerStart();} } +#define CUDA_PROFILER_STOP_ON_LEVEL(stop) { if(stop){cudaProfilerStop();cudaDeviceReset();exit(0);} } +#else +#define CUDA_PROFILER_START_ON_LEVEL(start) { } +#define CUDA_PROFILER_STOP_ON_LEVEL(stop) { } +#endif + +#ifdef USE_NVTX +#include "nvToolsExt.h" +#define NVTX_PUSH(name,cid) do { \ + uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; \ + int num_colors = sizeof(colors)/sizeof(uint32_t); \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ +} while(0) +#define NVTX_POP() do { nvtxRangePop(); } while(0) +#else +#define NVTX_PUSH(name,cid) do {} while(0) +#define 
NVTX_POP() do {} while(0) +#endif + +//---- + enum gpu_msg_level { GPU_MSG_DEBUG = 1, GPU_MSG_INFO, @@ -75,7 +125,11 @@ enum gpu_msg_level { }; -#define gpu_msg(LVL, LVLSTR, FMT, ARGS...) fprintf(stderr, LVLSTR "[%s] " FMT, __FUNCTION__ ,##ARGS) +#define gpu_msg(LVL, LVLSTR, FMT, ARGS...) \ + do { \ + fprintf(stderr, LVLSTR "[%s] " FMT, __FUNCTION__ ,##ARGS); \ + fflush(stderr); \ + } while(0) #if 0 #define gpu_dbg(FMT, ARGS...) do {} while(0) @@ -86,6 +140,8 @@ enum gpu_msg_level { #define gpu_info(FMT, ARGS...) gpu_msg(GPU_MSG_INFO, "INFO: ", FMT, ## ARGS) #define gpu_infoc(CNT, FMT, ARGS...) do { static int __cnt = 0; if (__cnt++ < CNT) gpu_info(FMT, ## ARGS); } while(0) #define gpu_warn(FMT, ARGS...) gpu_msg(GPU_MSG_WARN, "WARN: ", FMT, ## ARGS) +#define gpu_warnc(CNT, FMT, ARGS...) do { static int __cnt = 0; if (__cnt++ < CNT) gpu_msg(GPU_MSG_WARN, "WARN: ", FMT, ## ARGS); } while(0) +#define gpu_warn_once(FMT, ARGS...) gpu_warnc(1, FMT, ##ARGS) #define gpu_err(FMT, ARGS...) gpu_msg(GPU_MSG_ERROR, "ERR: ", FMT, ##ARGS) // oversubscribe SM by factor 2 diff --git a/tests/gpu_kernels.cu b/tests/gpu_kernels.cu index 51025e1..48aac75 100644 --- a/tests/gpu_kernels.cu +++ b/tests/gpu_kernels.cu @@ -1,5 +1,6 @@ #include #include +#include #include "gdsync/device.cuh" @@ -10,6 +11,8 @@ __global__ void void_kernel() { + __threadfence_system(); + __syncthreads(); } int gpu_launch_void_kernel_on_stream(CUstream s) @@ -50,12 +53,12 @@ int gpu_launch_dummy_kernel(void) __global__ void calc_kernel(int n, float c, float *in, float *out) { - const uint tid = threadIdx.x; - const uint bid = blockIdx.x; - const uint block_size = blockDim.x; - const uint grid_size = gridDim.x; - const uint gid = tid + bid*block_size; - const uint n_threads = block_size*grid_size; + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int block_size = blockDim.x; + const int grid_size = gridDim.x; + const int gid = tid + bid*block_size; + const int n_threads = block_size*grid_size; for (int i=gid; i>>(n, 1.0f, in, out); + // at least 1 thr block + int nb = std::min(((n + nthreads - 1) / nthreads), nblocks); + assert(nb >= 1); + calc_kernel<<>>(n, 1.0f, in, out); CUDACHECK(cudaGetLastError()); return 0; } diff --git a/tests/test_utils.h b/tests/test_utils.h index 007cd44..208dd86 100644 --- a/tests/test_utils.h +++ b/tests/test_utils.h @@ -1,5 +1,7 @@ #pragma once +#include "gpu.h" + #ifdef USE_PROF #include "prof.h" #else @@ -20,7 +22,7 @@ static inline void prof_reset(struct prof *p) {} #else static int perf_start() { - printf("Performance instrumentation is disabled\n"); + gpu_warn("Performance instrumentation is disabled\n"); return 0; } static int perf_stop() @@ -35,7 +37,7 @@ static inline gds_us_t gds_get_time_us() struct timespec ts; int ret = clock_gettime(CLOCK_MONOTONIC, &ts); if (ret) { - fprintf(stderr, "error in gettime %d/%s\n", errno, strerror(errno)); + gpu_err("error in gettime %d/%s\n", errno, strerror(errno)); exit(EXIT_FAILURE); } return (gds_us_t)ts.tv_sec * 1000 * 1000 + (gds_us_t)ts.tv_nsec / 1000;
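
A usage note on the NVTX helpers defined in tests/gpu.h above: when `USE_NVTX` is defined at build time, `NVTX_PUSH()`/`NVTX_POP()` open and close a named, colored range in the profiler timeline; otherwise they compile to no-ops, so they are safe to leave in hot paths. The sketch below is illustrative only: it combines the macros with `gds_get_time_us()` from tests/test_utils.h in a bounded busy-wait, mirroring the `poll_dword_geq()` helper shown earlier in this patch. The function name and timeout handling here are assumptions for the example, not part of the patch.

```c
#include <errno.h>
#include <stdint.h>
#include "gpu.h"          /* NVTX_PUSH/NVTX_POP: no-ops unless USE_NVTX */
#include "test_utils.h"   /* gds_us_t, gds_get_time_us() */

/* Illustrative only: time-bound busy-wait on a host flag, wrapped in an NVTX
 * range so the wait shows up as a named interval in the profiler timeline.
 * Returns 0 once *flag reaches the expected value, EWOULDBLOCK on timeout. */
static int wait_flag_with_nvtx(volatile uint32_t *flag, uint32_t expected,
                               gds_us_t timeout_us)
{
        int ret = 0;
        gds_us_t start = gds_get_time_us();

        NVTX_PUSH("wait flag", 1);   /* second arg selects a preset color */
        while (*flag < expected) {
                if (gds_get_time_us() - start > timeout_us) {
                        ret = EWOULDBLOCK;
                        break;
                }
        }
        NVTX_POP();
        return ret;
}
```
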