Skip to content

Commit

Permalink
Use set_scalar_arg_dtypes on python kernels
Browse files Browse the repository at this point in the history
This involves pulling the kernel out of the program object and
running this method, specifying the scalar argument types (memory
arguments are listed as None).
This means that clCreateKernel is only called once, when we pull
out the kernel. Previously, it was called once every time
program.kernel() was invoked.
  • Loading branch information
tomdeakin committed Sep 26, 2013
1 parent f8819ef commit 6d34daa
Show file tree
Hide file tree
Showing 12 changed files with 73 additions and 43 deletions.
4 changes: 3 additions & 1 deletion Exercises/Exercise03/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise04/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise05/Python/vadd.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Wait for the commands to finish before reading back
queue.finish()
Expand Down
4 changes: 3 additions & 1 deletion Exercises/Exercise06/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)

program = cl.Program(context, C_elem_KernelSource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -98,7 +100,7 @@
globalrange = (Ndim, Mdim)
localrange = None

program.mmul(queue, globalrange, localrange, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, globalrange, localrange, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
9 changes: 6 additions & 3 deletions Solutions/Exercise04/Python/vadd_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,18 @@
d_d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_d.nbytes)
d_f = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_f.nbytes)

vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32])

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, numpy.uint32(LENGTH))
vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH)

# Enqueue the kernel again, but with different arguments
program.vadd(queue, h_e.shape, None, d_e, d_c, d_d, numpy.uint32(LENGTH))
vadd(queue, h_e.shape, None, d_e, d_c, d_d, LENGTH)

# Enqueue the kernel a third time, again with different arguments
program.vadd(queue, h_g.shape, None, d_g, d_d, d_f, numpy.uint32(LENGTH))
vadd(queue, h_g.shape, None, d_g, d_d, d_f, LENGTH)


# Read back the results from the compute device
Expand Down
4 changes: 3 additions & 1 deletion Solutions/Exercise05/Python/vadd_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@

# Execute the kernel over the entire range of our 1d input
# allowing OpenCL runtime to select the work group items for the device
program.vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, numpy.uint32(LENGTH))
vadd = program.vadd
vadd.set_scalar_arg_dtypes([None, None, None, None, numpy.uint32])
vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, LENGTH)

# Read back the results from the compute device
cl.enqueue_copy(queue, h_r, d_r)
Expand Down
4 changes: 3 additions & 1 deletion Solutions/Exercise06/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)

program = cl.Program(context, C_elem_KernelSource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -108,7 +110,7 @@
globalrange = (Ndim, Mdim)
localrange = None

program.mmul(queue, globalrange, localrange, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, globalrange, localrange, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
12 changes: 9 additions & 3 deletions Solutions/Exercise07/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@

kernelsource = open("../C_elem.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])

print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

Expand All @@ -88,7 +90,7 @@
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim, Mdim), None, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim, Mdim), None, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -102,13 +104,15 @@

kernelsource = open("../C_row.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row per work item, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -122,13 +126,15 @@

kernelsource = open("../C_row_priv.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row, A row in priv mem, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand Down
21 changes: 15 additions & 6 deletions Solutions/Exercise08/Python/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,16 @@

kernelsource = open("../C_elem.cl").read()
program = cl.Program(context, kernelsource).build()

mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"

# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim, Mdim), None, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim, Mdim), None, Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -102,13 +103,15 @@

kernelsource = open("../C_row.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row per work item, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -122,13 +125,15 @@

kernelsource = open("../C_row_priv.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None])
print "\n===== OpenCL, matrix mult, C row, A row in priv mem, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim, d_a, d_b, d_c)
queue.finish()

run_time = time() - start_time
Expand All @@ -142,14 +147,16 @@

kernelsource = open("../C_row_priv_bloc.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None, None])
print "\n===== OpenCL, mat mult, C row, priv A, B cols loc, order", Ndim, "======\n"
# Do the multiplication COUNT times
for i in range(COUNT):
h_C.fill(0.0)
start_time = time()

localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * Pdim)
program.mmul(queue, (Ndim,), (ORDER/16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
mmul(queue, (Ndim,), (ORDER/16,), Mdim, Ndim, Pdim,
d_a, d_b, d_c, localmem)
queue.finish()

Expand All @@ -164,6 +171,8 @@

kernelsource = open("../C_block_form.cl").read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, numpy.int32, numpy.int32, None, None, None, None, None])
print "\n===== OpenCL, A and B in block form in local memory, order", Ndim, "======\n"
blockSize = 16
# Do the multiplication COUNT times
Expand All @@ -173,7 +182,7 @@

localmem1 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
localmem2 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
program.mmul(queue, (Ndim, Mdim), (blockSize, blockSize),
mmul(queue, (Ndim, Mdim), (blockSize, blockSize),
numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
d_a, d_b, d_c, localmem1, localmem2)
queue.finish()
Expand Down
12 changes: 7 additions & 5 deletions Solutions/Exercise09/Python/pi_ocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
queue = cl.CommandQueue(context)
kernelsource = open("../pi_ocl.cl").read()
program = cl.Program(context, kernelsource).build()
pi = program.pi
pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None])

# Get the max work group size for the kernel pi on our device
device = context.devices[0]
Expand Down Expand Up @@ -63,11 +65,11 @@
# Set the global and local size as tuples
global_size = ((nwork_groups * work_group_size),)
local_size = ((work_group_size),)
program.pi(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size)

pi(queue, global_size, local_size,
niters, step_size,
localmem, d_partial_sums)

cl.enqueue_copy(queue, h_psum, d_partial_sums)

Expand Down
6 changes: 4 additions & 2 deletions Solutions/Exercise13/Python/gameoflife.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def main():
with open('../gameoflife.cl', 'r') as f:
kernelsource = f.read()
program = cl.Program(context, kernelsource).build()
accelerate_life = program.accelerate_life
accelerate_life.set_scalar_arg_dtypes([None, None, numpy.uint32, numpy.uint32, None])

# Allocate memory for boards
h_board = numpy.zeros(nx * ny).astype(numpy.int8)
Expand All @@ -71,9 +73,9 @@ def main():
for i in xrange(iterations):
# Apply the rules of Life
# Enqueue the kernel
program.accelerate_life(queue, global_size, local_size,
accelerate_life(queue, global_size, local_size,
d_board_tick, d_board_tock,
numpy.uint32(nx), numpy.uint32(ny),
nx, ny,
localmem)

# Swap the boards over
Expand Down
32 changes: 14 additions & 18 deletions Solutions/ExerciseA/Python/pi_vocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,14 @@
queue = cl.CommandQueue(context)
kernelsource = open("../pi_vocl.cl").read()
program = cl.Program(context, kernelsource).build()
if vector_size == 1:
pi = program.pi
elif vector_size == 4:
pi = program.pi_vec4
elif vector_size == 8:
pi = program.pi_vec8

pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None])

# Now that we know the size of the work_groups, we can set the number of work
# groups, the actual number of steps, and the step size
Expand Down Expand Up @@ -98,24 +105,13 @@
global_size = ((nwork_groups * work_group_size),)
local_size = ((work_group_size),)

if vector_size == 1:
program.pi(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
elif vector_size == 4:
program.pi_vec4(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
elif vector_size == 8:
program.pi_vec8(queue, global_size, local_size,
numpy.int32(niters),
numpy.float32(step_size),
cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size),
d_partial_sums)
localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size)

pi(queue, global_size, local_size,
niters,
step_size,
localmem,
d_partial_sums)

cl.enqueue_copy(queue, h_psum, d_partial_sums)

Expand Down

0 comments on commit 6d34daa

Please sign in to comment.