From 6d449061c3ce0369474e6b39cbe14dc4d070cab9 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Thu, 12 Jan 2017 13:32:32 -0500
Subject: [PATCH 1/3] Switch the order of gs and ls to conform to what the
 underlying APIs use.

This matches cuLaunchKernel and clEnqueueNDRangeKernel, which both take
the grid/global size before the block/local size.

---
 src/CMakeLists.txt              |  2 +-
 src/gpuarray/buffer.h           |  4 ++--
 src/gpuarray/config.h           |  2 +-
 src/gpuarray/kernel.h           |  8 ++++----
 src/gpuarray_array.c            |  4 ++--
 src/gpuarray_blas_cuda_cublas.c | 12 ++++++------
 src/gpuarray_buffer.c           |  6 +++---
 src/gpuarray_buffer_cuda.c      |  8 ++++----
 src/gpuarray_buffer_opencl.c    |  6 +++---
 src/gpuarray_elemwise.c         |  8 ++++----
 src/gpuarray_kernel.c           |  6 +++---
 src/gpuarray_reduction.c        |  2 +-
 src/private.h                   |  2 +-
 13 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 029148050d..02e32eccd4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
   # This is the shared library version
-  VERSION 1.0
+  VERSION 2.0
   )
 
 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})
diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index fb8970781a..d6d3dd8a09 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
  *
  * \param k kernel
  * \param n number of dimensions of grid/block
- * \param bs block sizes for this call (also known as local size)
  * \param gs grid sizes for this call (also known as global size)
+ * \param ls block sizes for this call (also known as local size)
  * \param shared amount of dynamic shared memory to reserve
  * \param args table of pointers to each argument (optional).
  *
  * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h
index 4eb58b401d..f8fc86a01d 100644
--- a/src/gpuarray/config.h
+++ b/src/gpuarray/config.h
@@ -3,7 +3,7 @@
 /* The following included file should have been generated by CMake. */
 #include
 
-#define GPUARRAY_API_VERSION 0
+#define GPUARRAY_API_VERSION 1
 
 #ifdef GPUARRAY_SHARED
 #ifdef _WIN32
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
index 82d4f74edf..f88d74ffc6 100644
--- a/src/gpuarray/kernel.h
+++ b/src/gpuarray/kernel.h
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
  *
  * \param k the kernel to schedule for
  * \param n number of elements to handle
- * \param ls local size (in/out)
  * \param gs grid size (in/out)
+ * \param ls local size (in/out)
  */
 GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
-                                    size_t *ls, size_t *gs);
+                                    size_t *gs, size_t *ls);
 
 /**
  * Launch the execution of a kernel.
  *
  * \param k the kernel to launch
  * \param n dimensionality of the grid/blocks
- * \param ls sizes of launch blocks
  * \param gs sizes of launch grid
+ * \param ls sizes of launch blocks
  * \param shared amount of dynamic shared memory to allocate
  * \param args table of pointers to arguments
  */
 GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
index eef077e6d5..434c641ae2 100644
--- a/src/gpuarray_array.c
+++ b/src/gpuarray_array.c
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;
 
-  err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
+  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
   if (err != GA_NO_ERROR)
     goto out;
 
@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   GpuKernel_setarg(&k, argp++, &n[1]);
   GpuKernel_setarg(&k, argp++, errbuf);
 
-  err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
+  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
   if (check_error && err == GA_NO_ERROR) {
     err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
     if (err == GA_NO_ERROR && kerr != 0) {
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
index 9354d057a1..6d4648e232 100644
--- a/src/gpuarray_blas_cuda_cublas.c
+++ b/src/gpuarray_blas_cuda_cublas.c
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index 14f792e453..a4dfd3329b 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
   return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
 }
 
-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
-                   const size_t *gs, size_t shared, void **args) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
+                   const size_t *ls, size_t shared, void **args) {
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
                                                          shared, args);
 }
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 500eb147c1..120919c72a 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
 }
 
 static int cuda_callkernel(gpukernel *k, unsigned int n,
-                           const size_t *bs, const size_t *gs,
+                           const size_t *gs, const size_t *ls,
                            size_t shared, void **args) {
   cuda_context *ctx = k->ctx;
   unsigned int i;
@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
 
   switch (n) {
   case 1:
-    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
                               ctx->s, args, NULL);
     break;
   case 2:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
                               ctx->s, args, NULL);
     break;
   case 3:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
                               shared, ctx->s, args, NULL);
     break;
   default:
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index f744084af3..7b8d684c7c 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
 static void cl_releasekernel(gpukernel *k);
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *bs, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args);
 
 static const char CL_PREAMBLE[] =
@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   if (res != GA_NO_ERROR) goto fail;
   gs = ((n-1) / ls) + 1;
   args[0] = dst;
-  res = cl_callkernel(m, 1, &ls, &gs, 0, args);
+  res = cl_callkernel(m, 1, &gs, &ls, 0, args);
 
 fail:
   cl_releasekernel(m);
@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
 }
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *ls, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args) {
   cl_ctx *ctx = k->ctx;
   size_t _gs[3];
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
index 1bb05bbb7e..f3ce7ee261 100644
--- a/src/gpuarray_elemwise.c
+++ b/src/gpuarray_elemwise.c
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
     }
   }
 
-  err = GpuKernel_sched(k, n, &ls, &gs);
+  err = GpuKernel_sched(k, n, &gs, &ls);
   if (err != GA_NO_ERROR) goto error;
 
-  err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
+  err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
 
 error:
   return err;
 }
@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
       if (err != GA_NO_ERROR) return err;
     }
   }
-  err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
+  err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
   if (err != GA_NO_ERROR) return err;
-  return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
+  return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
 }
 
 GpuElemwise *GpuElemwise_new(gpucontext *ctx,
diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c
index 8beea94150..58311c86bb 100644
--- a/src/gpuarray_kernel.c
+++ b/src/gpuarray_kernel.c
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
   return gpukernel_context(k->k);
 }
 
-int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
+int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
   size_t min_l;
   size_t max_l;
   size_t target_l;
@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
 }
 
 int GpuKernel_call(GpuKernel *k, unsigned int n,
-                   const size_t *bs, const size_t *gs,
+                   const size_t *gs, const size_t *ls,
                    size_t shared, void **args) {
-  return gpukernel_call(k->k, n, bs, gs, shared, args);
+  return gpukernel_call(k->k, n, gs, ls, shared, args);
 }
 
 int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {
diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
index 15391bad69..12eedb24a9 100644
--- a/src/gpuarray_reduction.c
+++ b/src/gpuarray_reduction.c
@@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
 	    ctx->dstArgmaxStepsGD){
 		ctx->ret = GpuKernel_call(&ctx->kernel,
 		                          ctx->ndh>0 ? ctx->ndh : 1,
-		                          ctx->blockSize,
 		                          ctx->gridSize,
+		                          ctx->blockSize,
 		                          0,
 		                          args);
 	}else{
diff --git a/src/private.h b/src/private.h
index 57d919be88..0513df8605 100644
--- a/src/private.h
+++ b/src/private.h
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
   void (*kernel_release)(gpukernel *k);
   int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
   int (*kernel_call)(gpukernel *k, unsigned int n,
-                     const size_t *bs, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
   int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);

From 67cb9fccf4c6899d2d0d3194b7dc84ad302c0e00 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Thu, 12 Jan 2017 13:40:43 -0500
Subject: [PATCH 2/3] Adapt pygpu to the order change.
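
The user-visible entry point is GpuKernel.__call__, which takes gs and
ls as keyword arguments; keywords are order-independent, so existing
keyword-style callers keep working, and only the documented order and
the internal cdef helpers change to match the C API. A minimal sketch
of a call site under the new convention, assuming an already-built 1-D
GpuKernel `k`; the names `k`, `dst`, `src` and `N` are illustrative
placeholders, not part of this diff:

    # grid size now comes first in the documented order, matching
    # GpuKernel_call(k, n, gs, ls, shared, args) on the C side
    k(dst, src, N, gs=1024, ls=256)
    # or pass a total element count and let GpuKernel_sched pick gs/ls
    k(dst, src, N, n=int(N))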
---
 pygpu/gpuarray.pxd | 10 +++++-----
 pygpu/gpuarray.pyx | 24 ++++++++++++------------
 pygpu/reduction.py |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd
index c305d1dcfe..a4c83b0e2e 100644
--- a/pygpu/gpuarray.pxd
+++ b/pygpu/gpuarray.pxd
@@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h":
                          unsigned int argcount, const int *types, int flags,
                          char **err_str)
     void GpuKernel_clear(_GpuKernel *k)
     gpucontext *GpuKernel_context(_GpuKernel *k)
-    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
+    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
     int GpuKernel_call(_GpuKernel *k, unsigned int n,
-                       const size_t *ls, const size_t *gs,
+                       const size_t *gs, const size_t *ls,
                        size_t shared, void **args)
     int GpuKernel_binary(_GpuKernel *, size_t *, void **)
 
@@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
                      int flags) except -1
 cdef int kernel_clear(GpuKernel k) except -1
 cdef gpucontext *kernel_context(GpuKernel k) except NULL
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
 cdef int kernel_call(GpuKernel k, unsigned int n,
-                     const size_t *ls, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args) except -1
 cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
@@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
     cdef void **callbuf
     cdef object __weakref__
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
     cdef _setarg(self, unsigned int index, int typecode, object o)
diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx
index ef2fb818e7..ca19ed907a 100644
--- a/pygpu/gpuarray.pyx
+++ b/pygpu/gpuarray.pyx
@@ -446,16 +446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
         raise GpuArrayException, "Invalid kernel or destroyed context"
     return res
 
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
    cdef int err
-    err = GpuKernel_sched(&k.k, n, ls, gs)
+    err = GpuKernel_sched(&k.k, n, gs, ls)
    if err != GA_NO_ERROR:
        raise get_exc(err), kernel_error(k, err)
 
-cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
-                     const size_t *gs, size_t shared, void **args) except -1:
+cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
+                     const size_t *ls, size_t shared, void **args) except -1:
    cdef int err
-    err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
+    err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
    if err != GA_NO_ERROR:
        raise get_exc(err), kernel_error(k, err)
 
@@ -2105,10 +2105,10 @@ cdef class GpuKernel:
     sure to test against the size of your data.
 
     If you want more control over thread allocation you can use the
-    `ls` and `gs` parameters like so::
+    `gs` and `ls` parameters like so::
 
         k = GpuKernel(...)
-        k(param1, param2, ls=ls, gs=gs)
+        k(param1, param2, gs=gs, ls=ls)
 
     If you choose to use this interface, make sure to stay within the
     limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
@@ -2192,12 +2192,12 @@ cdef class GpuKernel:
         finally:
             free(_types)
 
-    def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
+    def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
         if n == None and (ls == None or gs == None):
             raise ValueError, "Must specify size (n) or both gs and ls"
-        self.do_call(n, ls, gs, args, shared)
+        self.do_call(n, gs, ls, args, shared)
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
         cdef size_t n
         cdef size_t gs[3]
         cdef size_t ls[3]
@@ -2264,8 +2264,8 @@ cdef class GpuKernel:
             if nd != 1:
                 raise ValueError, "n is specified and nd != 1"
             n = py_n
-        kernel_sched(self, n, &ls[0], &gs[0])
-        kernel_call(self, nd, ls, gs, shared, self.callbuf)
+        kernel_sched(self, n, &gs[0], &ls[0])
+        kernel_call(self, nd, gs, ls, shared, self.callbuf)
 
     cdef _setarg(self, unsigned int index, int typecode, object o):
         if typecode == GA_BUFFER:
diff --git a/pygpu/reduction.py b/pygpu/reduction.py
index 22f5a9c927..441380dbb5 100644
--- a/pygpu/reduction.py
+++ b/pygpu/reduction.py
@@ -277,7 +277,7 @@ def __call__(self, *args, **kwargs):
             kargs.append(offsets[i])
             kargs.extend(strs[i])
 
-        k(*kargs, ls=ls, gs=gs)
+        k(*kargs, gs=gs, ls=ls)
 
         return out

From 7c1b19819cf8a133f358a58e38333729080d9f30 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 16 Jan 2017 13:12:00 -0500
Subject: [PATCH 3/3] Bump the rc version.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 54b391c71a..c6c8139c9f 100755
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 MAJOR = 0
 MINOR = 6
 PATCH = 0
-SUFFIX = 'rc1'
+SUFFIX = 'rc2'
 
 FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)
 try:
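
For reference, a complete call site under the new convention, as a
minimal sketch. The kernel source uses libgpuarray's CLUDA macros
(KERNEL, GLOBAL_MEM, GID_0, LDIM_0, LID_0); the GpuKernel constructor
arguments (the types list and the `context`/`cluda` keywords) are
written from memory of the 0.6-era pygpu API and should be treated as
assumptions, not as something these patches define:

    import numpy as np
    import pygpu
    from pygpu import gpuarray

    ctx = pygpu.init('cuda0')  # assumes a usable CUDA device 0

    src = """
    KERNEL void scale(GLOBAL_MEM ga_float *a, ga_float f, ga_uint n) {
        unsigned int i = GID_0 * LDIM_0 + LID_0;
        if (i < n) a[i] = a[i] * f;
    }
    """
    # assumption: buffer arguments are declared with the GpuArray class
    # and scalar arguments with numpy dtypes
    k = gpuarray.GpuKernel(src, "scale",
                           [gpuarray.GpuArray, np.float32, np.uint32],
                           context=ctx, cluda=True)

    a = gpuarray.zeros((1024,), dtype='float32', context=ctx)
    # new order: gs (grid/global) before ls (block/local), as in
    # cuLaunchKernel and clEnqueueNDRangeKernel
    k(a, np.float32(2.0), np.uint32(a.size), gs=4, ls=256)
    # or let GpuKernel_sched derive both from an element count
    k(a, np.float32(2.0), np.uint32(a.size), n=a.size)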