Skip to content

Commit

Permalink
Use set_scalar_arg_dtypes on python kernels
Browse files Browse the repository at this point in the history
This involves pulling the kernel out of the program object and
running this method, specifying the scalar argument types (memory
arguments are listed as None).
This means that clCreateKernel is only called once, when we pull
out the kernel. Previously, it was called once every time
program.kernel() was invoked.
  • Loading branch information
tomdeakin committed Sep 26, 2013
1 parent f8819ef commit 6d34daa
Show file tree
Hide file tree
Showing 12 changed files with 73 additions and 43 deletions.
4 changes: 3 additions & 1 deletion Exercises/Exercise03/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise04/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise05/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise06/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)

program = cl.Program(context, C_elem_KernelSource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -98,7 +100,7 @@
globalrange = (Ndim, Mdim)
localrange = None

program.mmul(queue, globalrange, localrange, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, globalrange, localrange, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
9 changes: 6 additions & 3 deletions Solutions/Exercise04/Python/vadd_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,18 @@
d_d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_d.nbytes)
d_f = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_f.nbytes)

vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Enqueue the kernel again, but with different arguments
program.vadd(queue, h_e.shape, None, d_e, d_c, d_d, numpy.uint32(LENGTH))
vadd(queue, h_e.shape, None, d_e, d_c, d_d, LENGTH)

# Enqueue the kernel a third time, again with different arguments
program.vadd(queue, h_g.shape, None, d_g, d_d, d_f, numpy.uint32(LENGTH))
vadd(queue, h_g.shape, None, d_g, d_d, d_f, LENGTH)


# Read back the results from the compute device
Expand Down
4 changes: 3 additions & 1 deletion Solutions/Exercise05/Python/vadd_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, LENGTH)

# Read back the results from the compute device
cl.enqueue_copy(queue, h_r, d_r)
Expand Down
4 changes: 3 additions & 1 deletion Solutions/Exercise06/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)

program = cl.Program(context, C_elem_KernelSource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -108,7 +110,7 @@
globalrange = (Ndim, Mdim)
localrange = None

program.mmul(queue, globalrange, localrange, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, globalrange, localrange, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
12 changes: 9 additions & 3 deletions Solutions/Exercise07/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@

kernelsource = open("../C_elem.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -88,7 +90,7 @@
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim, Mdim), None, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim, Mdim), None, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -102,13 +104,15 @@

kernelsource = open("../C_row.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row per work item, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -122,13 +126,15 @@

kernelsource = open("../C_row_priv.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row, A row in priv mem, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
21 changes: 15 additions & 6 deletions Solutions/Exercise08/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,16 @@

kernelsource = open("../C_elem.cl").read()
program = cl.Program(context, kernelsource).build()

mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim, Mdim), None, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim, Mdim), None, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -102,13 +103,15 @@

kernelsource = open("../C_row.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row per work item, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -122,13 +125,15 @@

kernelsource = open("../C_row_priv.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row, A row in priv mem, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -142,14 +147,16 @@

kernelsource = open("../C_row_priv_bloc.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None, None])
print "\n===== OpenCL, mat mult, C row, priv A, B cols loc, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * Pdim)
program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim,
d_a, d_b, d_c, localmem)
queue.finish()

Expand All @@ -164,6 +171,8 @@

kernelsource = open("../C_block_form.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None, None, None])
print "\n===== OpenCL, A and B in block form in local memory, order", Ndim, "======\n"
blockSize = 16
# Do the multiplication COUNT times
Expand All @@ -173,7 +182,7 @@

localmem1 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
localmem2 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
program.mmul(queue, (Ndim, Mdim), (blockSize, blockSize),
mmul(queue, (Ndim, Mdim), (blockSize, blockSize),
numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
d_a, d_b, d_c, localmem1, localmem2)
queue.finish()
Expand Down
12 changes: 7 additions & 5 deletions Solutions/Exercise09/Python/pi_ocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
queue = cl.CommandQueue(context)
kernelsource = open("../pi_ocl.cl").read()
program = cl.Program(context, kernelsource).build()
pi = program.pi
pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None])

# Get the max work group size for the kernel pi on our device
device = context.devices[0]
Expand Down Expand Up @@ -63,11 +65,11 @@
# Set the global and local size as tuples
global_size = ((nwork_groups * work_group_size),)
local_size = ((work_group_size),)
program.pi(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size)

pi(queue, global_size, local_size,
niters, step_size,
localmem, d_partial_sums)

cl.enqueue_copy(queue, h_psum, d_partial_sums)

Expand Down
6 changes: 4 additions & 2 deletions Solutions/Exercise13/Python/gameoflife.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def main():
with open('../gameoflife.cl', 'r') as f:
kernelsource = f.read()
program = cl.Program(context, kernelsource).build()
accelerate_life = program.accelerate_life
accelerate_life.set_scalar_arg_dtypes([None, None, numpy.uint32, numpy.uint32, None])

# Allocate memory for boards
h_board = numpy.zeros(nx * ny).astype(numpy.int8)
Expand All @@ -71,9 +73,9 @@ def main():
for i in xrange(iterations):
# Apply the rules of Life
# Enqueue the kernel
program.accelerate_life(queue, global_size, local_size,
accelerate_life(queue, global_size, local_size,
d_board_tick, d_board_tock,
numpy.uint32(nx), numpy.uint32(ny),
nx, ny,
localmem)

# Swap the boards over
Expand Down
32 changes: 14 additions & 18 deletions Solutions/ExerciseA/Python/pi_vocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,14 @@
queue = cl.CommandQueue(context)
kernelsource = open("../pi_vocl.cl").read()
program = cl.Program(context, kernelsource).build()
if vector_size == 1:
pi = program.pi
elif vector_size == 4:
pi = program.pi_vec4
elif vector_size == 8:
pi = program.pi_vec8

pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None])

# Now that we know the size of the work_groups, we can set the number of work
# groups, the actual number of steps, and the step size
Expand Down Expand Up @@ -98,24 +105,13 @@
global_size = ((nwork_groups * work_group_size),)
local_size = ((work_group_size),)

if vector_size == 1:
program.pi(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
elif vector_size == 4:
program.pi_vec4(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
elif vector_size == 8:
program.pi_vec8(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size)

pi(queue, global_size, local_size,
niters,
step_size,
localmem,
d_partial_sums)

cl.enqueue_copy(queue, h_psum, d_partial_sums)

Expand Down

0 comments on commit 6d34daa

Please sign in to comment.