Merge pull request #325 from abergeron/switch

Switch gs and ls

nouiz authored Jan 16, 2017
2 parents dc5508f + 7c1b198 commit d838f6a
Showing 17 changed files with 54 additions and 54 deletions.
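In short: every scheduling and launch entry point, from the public C API (`GpuKernel_sched`, `GpuKernel_call`, `gpukernel_call`) down through the Cython wrappers and both backends, now takes the global/grid size `gs` before the local/block size `ls`, reversing the previous order. A minimal sketch of the new calling convention, assuming a kernel `k` already built with `GpuKernel_init` (the helper name `launch_1d` is illustrative, not part of the library):

```c
#include <gpuarray/error.h>
#include <gpuarray/kernel.h>

/* Sketch only: schedule and launch a 1-D kernel under the new
 * argument order (gs before ls). */
int launch_1d(GpuKernel *k, size_t n, void **args) {
  size_t gs = 0, ls = 0;   /* 0 asks GpuKernel_sched to pick both */
  int err;

  err = GpuKernel_sched(k, n, &gs, &ls);   /* was: &ls, &gs */
  if (err != GA_NO_ERROR)
    return err;

  return GpuKernel_call(k, 1, &gs, &ls, 0, args);  /* was: &ls, &gs */
}
```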
10 changes: 5 additions & 5 deletions pygpu/gpuarray.pxd
@@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h":
                         unsigned int argcount, const int *types, int flags, char **err_str)
     void GpuKernel_clear(_GpuKernel *k)
     gpucontext *GpuKernel_context(_GpuKernel *k)
-    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
+    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
     int GpuKernel_call(_GpuKernel *k, unsigned int n,
-                       const size_t *ls, const size_t *gs,
+                       const size_t *gs, const size_t *ls,
                        size_t shared, void **args)
     int GpuKernel_binary(_GpuKernel *, size_t *, void **)

@@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
                       int flags) except -1
 cdef int kernel_clear(GpuKernel k) except -1
 cdef gpucontext *kernel_context(GpuKernel k) except NULL
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
 cdef int kernel_call(GpuKernel k, unsigned int n,
-                     const size_t *ls, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args) except -1
 cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
@@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
     cdef void **callbuf
     cdef object __weakref__

-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
     cdef _setarg(self, unsigned int index, int typecode, object o)
24 changes: 12 additions & 12 deletions pygpu/gpuarray.pyx
@@ -446,16 +446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
         raise GpuArrayException, "Invalid kernel or destroyed context"
     return res

-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
     cdef int err
-    err = GpuKernel_sched(&k.k, n, ls, gs)
+    err = GpuKernel_sched(&k.k, n, gs, ls)
     if err != GA_NO_ERROR:
         raise get_exc(err), kernel_error(k, err)

-cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
-                     const size_t *gs, size_t shared, void **args) except -1:
+cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
+                     const size_t *ls, size_t shared, void **args) except -1:
     cdef int err
-    err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
+    err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
     if err != GA_NO_ERROR:
         raise get_exc(err), kernel_error(k, err)
@@ -2113,10 +2113,10 @@ cdef class GpuKernel:
        sure to test against the size of your data.

        If you want more control over thread allocation you can use the
-       `ls` and `gs` parameters like so::
+       `gs` and `ls` parameters like so::

            k = GpuKernel(...)
-           k(param1, param2, ls=ls, gs=gs)
+           k(param1, param2, gs=gs, ls=ls)

        If you choose to use this interface, make sure to stay within the
        limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
@@ -2200,12 +2200,12 @@ cdef class GpuKernel:
         finally:
             free(_types)

-    def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
+    def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
         if n == None and (ls == None or gs == None):
             raise ValueError, "Must specify size (n) or both gs and ls"
-        self.do_call(n, ls, gs, args, shared)
+        self.do_call(n, gs, ls, args, shared)

-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
         cdef size_t n
         cdef size_t gs[3]
         cdef size_t ls[3]
@@ -2272,8 +2272,8 @@ cdef class GpuKernel:
             if nd != 1:
                 raise ValueError, "n is specified and nd != 1"
             n = py_n
-            kernel_sched(self, n, &ls[0], &gs[0])
-        kernel_call(self, nd, ls, gs, shared, self.callbuf)
+            kernel_sched(self, n, &gs[0], &ls[0])
+        kernel_call(self, nd, gs, ls, shared, self.callbuf)

     cdef _setarg(self, unsigned int index, int typecode, object o):
         if typecode == GA_BUFFER:
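Worth noting: `gs` and `ls` are keyword-only parameters of `__call__` (they follow `*args`), so Python callers that pass `gs=...` and `ls=...` by keyword are unaffected by this reordering; only positional callers of the C and Cython interfaces need to be updated.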
2 changes: 1 addition & 1 deletion pygpu/reduction.py
@@ -282,7 +282,7 @@ def __call__(self, *args, **kwargs):
             kargs.append(offsets[i])
             kargs.extend(strs[i])

-        k(*kargs, ls=ls, gs=gs)
+        k(*kargs, gs=gs, ls=ls)

         return out
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
 MAJOR = 0
 MINOR = 6
 PATCH = 0
-SUFFIX = 'rc1'
+SUFFIX = 'rc2'
 FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)

 try:
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
   # This is the shared library version
-  VERSION 1.0
+  VERSION 2.0
   )

 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})
4 changes: 2 additions & 2 deletions src/gpuarray/buffer.h
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
  *
  * \param k kernel
  * \param n number of dimensions of grid/block
- * \param bs block sizes for this call (also known as local size)
  * \param gs grid sizes for this call (also known as global size)
+ * \param ls block sizes for this call (also known as local size)
  * \param shared amount of dynamic shared memory to reserve
  * \param args table of pointers to each argument (optional).
  *
  * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);

 /**
2 changes: 1 addition & 1 deletion src/gpuarray/config.h
@@ -3,7 +3,7 @@

 /* The following included file should have been generated by CMake. */
 #include <gpuarray/abi_version.h>
-#define GPUARRAY_API_VERSION 0
+#define GPUARRAY_API_VERSION 1

 #ifdef GPUARRAY_SHARED
 #ifdef _WIN32
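Taken together with the shared-library `VERSION` bump from 1.0 to 2.0 above, this `GPUARRAY_API_VERSION` increment marks the argument reordering as a deliberate breaking change at both the ABI and the API level.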
8 changes: 4 additions & 4 deletions src/gpuarray/kernel.h
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
  *
  * \param k the kernel to schedule for
  * \param n number of elements to handle
- * \param ls local size (in/out)
  * \param gs grid size (in/out)
+ * \param ls local size (in/out)
  */
 GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
-                                    size_t *ls, size_t *gs);
+                                    size_t *gs, size_t *ls);

 /**
  * Launch the execution of a kernel.
  *
  * \param k the kernel to launch
  * \param n dimensionality of the grid/blocks
- * \param ls sizes of launch blocks
  * \param gs sizes of launch grid
+ * \param ls sizes of launch blocks
  * \param amount of dynamic shared memory to allocate
  * \param args table of pointers to arguments
  */
 GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);

 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
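The header comments above are the authoritative statement of the new order for callers that bypass `GpuKernel_sched` and pick their own launch geometry. A hedged sketch of an explicit 2-D launch under the new convention (the sizes are made-up example values, not from this diff):

```c
#include <gpuarray/kernel.h>

/* Sketch: explicit 2-D launch.  The grid size (gs) now comes
 * before the block size (ls) in the call. */
void launch_2d_example(GpuKernel *k, void **args) {
  size_t gs[2] = {256, 4};  /* launch grid, gs first */
  size_t ls[2] = {32, 8};   /* threads per block, ls second */
  GpuKernel_call(k, 2, gs, ls, 0, args);
}
```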
4 changes: 2 additions & 2 deletions src/gpuarray_array.c
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;

-  err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
+  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
   if (err != GA_NO_ERROR)
     goto out;

@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   GpuKernel_setarg(&k, argp++, &n[1]);
   GpuKernel_setarg(&k, argp++, errbuf);

-  err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
+  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
   if (check_error && err == GA_NO_ERROR) {
     err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
     if (err == GA_NO_ERROR && kerr != 0) {
12 changes: 6 additions & 6 deletions src/gpuarray_blas_cuda_cublas.c
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;

   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }

   cuda_ops.buffer_release(Aa);

@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;

   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }

   cuda_ops.buffer_release(Aa);

@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   args[8] = &M;
   args[9] = &N;

-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);

   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);

@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   args[8] = &M;
   args[9] = &N;

-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);

   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
6 changes: 3 additions & 3 deletions src/gpuarray_buffer.c
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
   return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
 }

-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
-                   const size_t *gs, size_t shared, void **args) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
+                   const size_t *ls, size_t shared, void **args) {
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
                                                          shared, args);
 }
8 changes: 4 additions & 4 deletions src/gpuarray_buffer_cuda.c
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
 }

 static int cuda_callkernel(gpukernel *k, unsigned int n,
-                           const size_t *bs, const size_t *gs,
+                           const size_t *gs, const size_t *ls,
                            size_t shared, void **args) {
   cuda_context *ctx = k->ctx;
   unsigned int i;

@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
   switch (n) {
   case 1:
-    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
                               ctx->s, args, NULL);
     break;
   case 2:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
                               ctx->s, args, NULL);
     break;
   case 3:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
                               shared, ctx->s, args, NULL);
     break;
   default:
6 changes: 3 additions & 3 deletions src/gpuarray_buffer_opencl.c
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }

 static void cl_releasekernel(gpukernel *k);
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *bs, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args);

 static const char CL_PREAMBLE[] =

@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   if (res != GA_NO_ERROR) goto fail;
   gs = ((n-1) / ls) + 1;
   args[0] = dst;
-  res = cl_callkernel(m, 1, &ls, &gs, 0, args);
+  res = cl_callkernel(m, 1, &gs, &ls, 0, args);

 fail:
   cl_releasekernel(m);

@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
 }

 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *ls, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args) {
   cl_ctx *ctx = k->ctx;
   size_t _gs[3];
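A plausible motivation for the swap is visible in the two backends: CUDA's `cuLaunchKernel` takes the grid dimensions before the block dimensions, and OpenCL's `clEnqueueNDRangeKernel` takes the global work size before the local work size, so `gs` before `ls` now mirrors both native launch calls. A minimal sketch of the OpenCL side, assuming a 1-D launch; the helper name is hypothetical, and the `gs * ls` expansion is inferred from the `_gs` temporary in `cl_callkernel` above rather than spelled out in this diff:

```c
#include <CL/cl.h>

/* Sketch: forwarding the wrapper's (gs, ls) pair to OpenCL.
 * gpuarray counts gs in blocks, while OpenCL counts total work
 * items, hence the assumed gs * ls expansion. */
static cl_int launch_cl_1d(cl_command_queue q, cl_kernel k,
                           size_t gs, size_t ls) {
  size_t global = gs * ls;  /* assumed expansion, per _gs above */
  return clEnqueueNDRangeKernel(q, k, 1, NULL, &global, &ls,
                                0, NULL, NULL);
}
```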
8 changes: 4 additions & 4 deletions src/gpuarray_elemwise.c
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
     }
   }

-  err = GpuKernel_sched(k, n, &ls, &gs);
+  err = GpuKernel_sched(k, n, &gs, &ls);
   if (err != GA_NO_ERROR) goto error;

-  err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
+  err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
 error:
   return err;
 }

@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
       if (err != GA_NO_ERROR) return err;
     }
   }
-  err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
+  err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
   if (err != GA_NO_ERROR) return err;
-  return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
+  return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
 }

 GpuElemwise *GpuElemwise_new(gpucontext *ctx,
6 changes: 3 additions & 3 deletions src/gpuarray_kernel.c
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
   return gpukernel_context(k->k);
 }

-int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
+int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
   size_t min_l;
   size_t max_l;
   size_t target_l;

@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
 }

 int GpuKernel_call(GpuKernel *k, unsigned int n,
-                   const size_t *bs, const size_t *gs,
+                   const size_t *gs, const size_t *ls,
                    size_t shared, void **args) {
-  return gpukernel_call(k->k, n, bs, gs, shared, args);
+  return gpukernel_call(k->k, n, gs, ls, shared, args);
 }

 int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {
2 changes: 1 addition & 1 deletion src/gpuarray_reduction.c
@@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
       ctx->dstArgmaxStepsGD){
     ctx->ret = GpuKernel_call(&ctx->kernel,
                               ctx->ndh>0 ? ctx->ndh : 1,
-                              ctx->blockSize,
                               ctx->gridSize,
+                              ctx->blockSize,
                               0,
                               args);
   }else{
2 changes: 1 addition & 1 deletion src/private.h
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
   void (*kernel_release)(gpukernel *k);
   int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
   int (*kernel_call)(gpukernel *k, unsigned int n,
-                     const size_t *bs, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args);

   int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);
