From 6d449061c3ce0369474e6b39cbe14dc4d070cab9 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Thu, 12 Jan 2017 13:32:32 -0500
Subject: [PATCH 1/3] Switch the order of gs and ls to conform to what the
 underlying APIs use.

This matches cuLaunchKernel and clEnqueueNDRangeKernel, which both take
the grid/global size before the block/local size.

---
 src/CMakeLists.txt              |  2 +-
 src/gpuarray/buffer.h           |  4 ++--
 src/gpuarray/config.h           |  2 +-
 src/gpuarray/kernel.h           |  8 ++++----
 src/gpuarray_array.c            |  4 ++--
 src/gpuarray_blas_cuda_cublas.c | 12 ++++++------
 src/gpuarray_buffer.c           |  6 +++---
 src/gpuarray_buffer_cuda.c      |  8 ++++----
 src/gpuarray_buffer_opencl.c    |  6 +++---
 src/gpuarray_elemwise.c         |  8 ++++----
 src/gpuarray_kernel.c           |  6 +++---
 src/gpuarray_reduction.c        |  2 +-
 src/private.h                   |  2 +-
 13 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 029148050d..02e32eccd4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
   # This is the shared library version
-  VERSION 1.0
+  VERSION 2.0
   )
 
 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})
diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index fb8970781a..d6d3dd8a09 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
  *
  * \param k kernel
  * \param n number of dimensions of grid/block
- * \param bs block sizes for this call (also known as local size)
  * \param gs grid sizes for this call (also known as global size)
+ * \param ls block sizes for this call (also known as local size)
  * \param shared amount of dynamic shared memory to reserve
  * \param args table of pointers to each argument (optional).
  *
  * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h
index 4eb58b401d..f8fc86a01d 100644
--- a/src/gpuarray/config.h
+++ b/src/gpuarray/config.h
@@ -3,7 +3,7 @@
 /* The following included file should have been generated by CMake. */
 #include
 
-#define GPUARRAY_API_VERSION 0
+#define GPUARRAY_API_VERSION 1
 
 #ifdef GPUARRAY_SHARED
 #ifdef _WIN32
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
index 82d4f74edf..f88d74ffc6 100644
--- a/src/gpuarray/kernel.h
+++ b/src/gpuarray/kernel.h
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
  *
  * \param k the kernel to schedule for
  * \param n number of elements to handle
- * \param ls local size (in/out)
  * \param gs grid size (in/out)
+ * \param ls local size (in/out)
  */
 GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
-                                    size_t *ls, size_t *gs);
+                                    size_t *gs, size_t *ls);
 
 /**
  * Launch the execution of a kernel.
  *
  * \param k the kernel to launch
  * \param n dimensionality of the grid/blocks
- * \param ls sizes of launch blocks
  * \param gs sizes of launch grid
+ * \param ls sizes of launch blocks
  * \param shared amount of dynamic shared memory to allocate
  * \param args table of pointers to arguments
  */
 GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
index eef077e6d5..434c641ae2 100644
--- a/src/gpuarray_array.c
+++ b/src/gpuarray_array.c
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;
 
-  err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
+  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
   if (err != GA_NO_ERROR)
     goto out;
 
@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   GpuKernel_setarg(&k, argp++, &n[1]);
   GpuKernel_setarg(&k, argp++, errbuf);
 
-  err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
+  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
   if (check_error && err == GA_NO_ERROR) {
     err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
     if (err == GA_NO_ERROR && kerr != 0) {
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
index 9354d057a1..6d4648e232 100644
--- a/src/gpuarray_blas_cuda_cublas.c
+++ b/src/gpuarray_blas_cuda_cublas.c
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index 14f792e453..a4dfd3329b 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
   return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
 }
 
-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
-                   const size_t *gs, size_t shared, void **args) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
+                   const size_t *ls, size_t shared, void **args) {
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
                                                          shared, args);
 }
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 500eb147c1..120919c72a 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
 }
 
 static int cuda_callkernel(gpukernel *k, unsigned int n,
-                           const size_t *bs, const size_t *gs,
+                           const size_t *gs, const size_t *ls,
                            size_t shared, void **args) {
   cuda_context *ctx = k->ctx;
   unsigned int i;
@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
 
   switch (n) {
   case 1:
-    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
                               ctx->s, args, NULL);
     break;
   case 2:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
                               ctx->s, args, NULL);
     break;
   case 3:
-    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
+    ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
                               shared, ctx->s, args, NULL);
     break;
   default:
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index f744084af3..7b8d684c7c 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
 static void cl_releasekernel(gpukernel *k);
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *bs, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args);
 
 static const char CL_PREAMBLE[] =
@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   if (res != GA_NO_ERROR) goto fail;
   gs = ((n-1) / ls) + 1;
   args[0] = dst;
-  res = cl_callkernel(m, 1, &ls, &gs, 0, args);
+  res = cl_callkernel(m, 1, &gs, &ls, 0, args);
 
 fail:
   cl_releasekernel(m);
@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
 }
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *ls, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args) {
   cl_ctx *ctx = k->ctx;
   size_t _gs[3];
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
index 1bb05bbb7e..f3ce7ee261 100644
--- a/src/gpuarray_elemwise.c
+++ b/src/gpuarray_elemwise.c
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
     }
   }
 
-  err = GpuKernel_sched(k, n, &ls, &gs);
+  err = GpuKernel_sched(k, n, &gs, &ls);
   if (err != GA_NO_ERROR) goto error;
 
-  err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
+  err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
 
 error:
   return err;
 }
@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
       if (err != GA_NO_ERROR) return err;
     }
   }
-  err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
+  err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
   if (err != GA_NO_ERROR) return err;
-  return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
+  return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
 }
 
 GpuElemwise *GpuElemwise_new(gpucontext *ctx,
diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c
index 8beea94150..58311c86bb 100644
--- a/src/gpuarray_kernel.c
+++ b/src/gpuarray_kernel.c
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
   return gpukernel_context(k->k);
 }
 
-int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
+int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
   size_t min_l;
   size_t max_l;
   size_t target_l;
@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
 }
 
 int GpuKernel_call(GpuKernel *k, unsigned int n,
-                   const size_t *bs, const size_t *gs,
+                   const size_t *gs, const size_t *ls,
                    size_t shared, void **args) {
-  return gpukernel_call(k->k, n, bs, gs, shared, args);
+  return gpukernel_call(k->k, n, gs, ls, shared, args);
 }
 
 int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {
diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
index 15391bad69..12eedb24a9 100644
--- a/src/gpuarray_reduction.c
+++ b/src/gpuarray_reduction.c
@@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
 	    ctx->dstArgmaxStepsGD){
 		ctx->ret = GpuKernel_call(&ctx->kernel,
 		                          ctx->ndh>0 ? ctx->ndh : 1,
-		                          ctx->blockSize,
 		                          ctx->gridSize,
+		                          ctx->blockSize,
 		                          0,
 		                          args);
 	}else{
diff --git a/src/private.h b/src/private.h
index 57d919be88..0513df8605 100644
--- a/src/private.h
+++ b/src/private.h
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
   void (*kernel_release)(gpukernel *k);
   int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
   int (*kernel_call)(gpukernel *k, unsigned int n,
-                     const size_t *bs, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
   int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);

From 67cb9fccf4c6899d2d0d3194b7dc84ad302c0e00 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Thu, 12 Jan 2017 13:40:43 -0500
Subject: [PATCH 2/3] Adapt pygpu to the order change.
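
The user-visible entry point is GpuKernel.__call__, which takes gs and
ls as keyword arguments; keywords are order-independent, so existing
keyword-style callers keep working, and only the documented order and
the internal cdef helpers change to match the C API. A minimal sketch
of a call site under the new convention, assuming an already-built 1-D
GpuKernel `k`; the names `k`, `dst`, `src` and `N` are illustrative
placeholders, not part of this diff:

    # grid size now comes first in the documented order, matching
    # GpuKernel_call(k, n, gs, ls, shared, args) on the C side
    k(dst, src, N, gs=1024, ls=256)
    # or pass a total element count and let GpuKernel_sched pick gs/ls
    k(dst, src, N, n=int(N))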
---
 pygpu/gpuarray.pxd | 10 +++++-----
 pygpu/gpuarray.pyx | 24 ++++++++++++------------
 pygpu/reduction.py |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd
index c305d1dcfe..a4c83b0e2e 100644
--- a/pygpu/gpuarray.pxd
+++ b/pygpu/gpuarray.pxd
@@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h":
                          unsigned int argcount, const int *types, int flags,
                          char **err_str)
     void GpuKernel_clear(_GpuKernel *k)
     gpucontext *GpuKernel_context(_GpuKernel *k)
-    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
+    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
     int GpuKernel_call(_GpuKernel *k, unsigned int n,
-                       const size_t *ls, const size_t *gs,
+                       const size_t *gs, const size_t *ls,
                        size_t shared, void **args)
     int GpuKernel_binary(_GpuKernel *, size_t *, void **)
 
@@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
                      int flags) except -1
 cdef int kernel_clear(GpuKernel k) except -1
 cdef gpucontext *kernel_context(GpuKernel k) except NULL
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
 cdef int kernel_call(GpuKernel k, unsigned int n,
-                     const size_t *ls, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args) except -1
 cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
@@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
     cdef void **callbuf
     cdef object __weakref__
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
     cdef _setarg(self, unsigned int index, int typecode, object o)
diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx
index ef2fb818e7..ca19ed907a 100644
--- a/pygpu/gpuarray.pyx
+++ b/pygpu/gpuarray.pyx
@@ -446,16 +446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
         raise GpuArrayException, "Invalid kernel or destroyed context"
     return res
 
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
    cdef int err
-    err = GpuKernel_sched(&k.k, n, ls, gs)
+    err = GpuKernel_sched(&k.k, n, gs, ls)
    if err != GA_NO_ERROR:
        raise get_exc(err), kernel_error(k, err)
 
-cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
-                     const size_t *gs, size_t shared, void **args) except -1:
+cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
+                     const size_t *ls, size_t shared, void **args) except -1:
    cdef int err
-    err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
+    err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
    if err != GA_NO_ERROR:
        raise get_exc(err), kernel_error(k, err)
 
@@ -2105,10 +2105,10 @@ cdef class GpuKernel:
     sure to test against the size of your data.
 
     If you want more control over thread allocation you can use the
-    `ls` and `gs` parameters like so::
+    `gs` and `ls` parameters like so::
 
         k = GpuKernel(...)
-        k(param1, param2, ls=ls, gs=gs)
+        k(param1, param2, gs=gs, ls=ls)
 
     If you choose to use this interface, make sure to stay within the
     limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
@@ -2192,12 +2192,12 @@ cdef class GpuKernel:
         finally:
             free(_types)
 
-    def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
+    def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
         if n == None and (ls == None or gs == None):
             raise ValueError, "Must specify size (n) or both gs and ls"
-        self.do_call(n, ls, gs, args, shared)
+        self.do_call(n, gs, ls, args, shared)
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
         cdef size_t n
         cdef size_t gs[3]
         cdef size_t ls[3]
@@ -2264,8 +2264,8 @@ cdef class GpuKernel:
             if nd != 1:
                 raise ValueError, "n is specified and nd != 1"
             n = py_n
-        kernel_sched(self, n, &ls[0], &gs[0])
-        kernel_call(self, nd, ls, gs, shared, self.callbuf)
+        kernel_sched(self, n, &gs[0], &ls[0])
+        kernel_call(self, nd, gs, ls, shared, self.callbuf)
 
     cdef _setarg(self, unsigned int index, int typecode, object o):
         if typecode == GA_BUFFER:
diff --git a/pygpu/reduction.py b/pygpu/reduction.py
index 22f5a9c927..441380dbb5 100644
--- a/pygpu/reduction.py
+++ b/pygpu/reduction.py
@@ -277,7 +277,7 @@ def __call__(self, *args, **kwargs):
             kargs.append(offsets[i])
             kargs.extend(strs[i])
 
-        k(*kargs, ls=ls, gs=gs)
+        k(*kargs, gs=gs, ls=ls)
 
         return out

From 7c1b19819cf8a133f358a58e38333729080d9f30 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 16 Jan 2017 13:12:00 -0500
Subject: [PATCH 3/3] Bump the rc version.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 54b391c71a..c6c8139c9f 100755
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 MAJOR = 0
 MINOR = 6
 PATCH = 0
-SUFFIX = 'rc1'
+SUFFIX = 'rc2'
 
 FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)
 try:
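
For reference, a complete call site under the new convention, as a
minimal sketch. The kernel source uses libgpuarray's CLUDA macros
(KERNEL, GLOBAL_MEM, GID_0, LDIM_0, LID_0); the GpuKernel constructor
arguments (the types list and the `context`/`cluda` keywords) are
written from memory of the 0.6-era pygpu API and should be treated as
assumptions, not as something these patches define:

    import numpy as np
    import pygpu
    from pygpu import gpuarray

    ctx = pygpu.init('cuda0')  # assumes a usable CUDA device 0

    src = """
    KERNEL void scale(GLOBAL_MEM ga_float *a, ga_float f, ga_uint n) {
        unsigned int i = GID_0 * LDIM_0 + LID_0;
        if (i < n) a[i] = a[i] * f;
    }
    """
    # assumption: buffer arguments are declared with the GpuArray class
    # and scalar arguments with numpy dtypes
    k = gpuarray.GpuKernel(src, "scale",
                           [gpuarray.GpuArray, np.float32, np.uint32],
                           context=ctx, cluda=True)

    a = gpuarray.zeros((1024,), dtype='float32', context=ctx)
    # new order: gs (grid/global) before ls (block/local), as in
    # cuLaunchKernel and clEnqueueNDRangeKernel
    k(a, np.float32(2.0), np.uint32(a.size), gs=4, ls=256)
    # or let GpuKernel_sched derive both from an element count
    k(a, np.float32(2.0), np.uint32(a.size), n=a.size)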