Python code for solution 7 added

This is the long matrix multiply example with different kernels
HandsOnOpenCL · Jul 23, 2013 · de8abc7 · de8abc7
1 parent a7acfba
commit de8abc7
Show file tree

Hide file tree

Showing 3 changed files with 232 additions and 0 deletions.
diff --git a/Solutions/Exercise07/Python/definitions.py b/Solutions/Exercise07/Python/definitions.py
@@ -0,0 +1,18 @@
+
+# Order of the square matrices A, B and C
+ORDER = 1024
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
diff --git a/Solutions/Exercise07/Python/helper.py b/Solutions/Exercise07/Python/helper.py
@@ -0,0 +1,30 @@
+
+from definitions import *
+
+#  Function to compute the matrix product (sequential algorithm, dot prod)
+def seq_mat_mul_sdot(Mdim, Ndim, Pdim, A, B, C):
+    """Compute C = A * B sequentially, one dot product per element of C.
+
+    All three matrices are flat, row-major sequences: A holds Ndim x Pdim
+    elements, B holds Pdim x Mdim elements and C holds Ndim x Mdim elements.
+    C is overwritten in place; nothing is returned.
+    """
+    for i in range(Ndim):
+        for j in range(Mdim):
+            tmp = 0.0
+            for k in range(Pdim):
+                # Row stride of A is Pdim and row stride of B is Mdim; the
+                # strides only coincide with Ndim/Pdim for square matrices.
+                tmp += A[i*Pdim+k] * B[k*Mdim+j]
+            # C has Mdim columns, so its row stride is Mdim.
+            C[i*Mdim+j] = tmp
+
+#  Function to compute errors of the product matrix
+def error(Mdim, Ndim, Pdim, C):
+    """Return the sum of squared errors of C against the expected product.
+
+    Every element of A equals AVAL and every element of B equals BVAL, so
+    each element of C = A * B is expected to equal Pdim * AVAL * BVAL.
+    C is a flat, row-major sequence of Ndim x Mdim elements.
+    """
+    cval = float(Pdim) * AVAL * BVAL
+    errsq = 0.0
+    for i in range(Ndim):
+        for j in range(Mdim):
+            # C has Mdim columns, so its row stride is Mdim (not Ndim).
+            err = C[i*Mdim+j] - cval
+            errsq += err * err
+    return errsq
+
+
+# Function to analyze and output results
+def results(Mdim, Ndim, Pdim, C, run_time):
+    """Print the run time and MFLOPS rate, then report any error in C.
+
+    Mdim, Ndim and Pdim are the matrix dimensions, C is the flat product
+    matrix to verify and run_time is the elapsed time in seconds.  An error
+    line is printed only when the summed squared error exceeds TOL.
+    """
+    # The dot-product algorithm does one multiply and one add per inner
+    # iteration, i.e. 2 * M * N * P floating point operations in total.
+    flops = 2.0 * Mdim * Ndim * Pdim
+    # A coarse timer can report a zero interval; avoid dividing by zero.
+    mflops = flops / (1000000.0 * run_time) if run_time > 0 else float("inf")
+    print run_time, "seconds at", mflops, "MFLOPS"
+    errsq = error(Mdim, Ndim, Pdim, C)
+    # Written as "not <=" so that a NaN error sum is reported too
+    # (any ordered comparison with NaN is False).
+    if not (errsq <= TOL):
+        print "Errors in multiplication:", errsq
diff --git a/Solutions/Exercise07/Python/matmul.py b/Solutions/Exercise07/Python/matmul.py
@@ -0,0 +1,184 @@
+#
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#                 C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see definitions.py). This is so
+# we can make a quick test of the multiplication result.
+#
+# History:   C++ version written by Tim Mattson, August 2010 
+#            Modified by Simon McIntosh-Smith, September 2011
+#            Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#            Ported to Python by Tom Deakin, July 2013
+#
+
+from helper import *
+from definitions import *
+
+import pyopencl as cl
+import numpy
+from time import time
+
+# A[N][P], B[P][M], C[N][M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# Number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+
+# A matrix: flat float32 array with every element equal to AVAL.
+# Allocated directly as float32 so no temporary float64 array is created.
+h_A = numpy.empty(sizeA, dtype=numpy.float32)
+h_A.fill(AVAL)
+
+# B matrix: flat float32 array with every element equal to BVAL
+h_B = numpy.empty(sizeB, dtype=numpy.float32)
+h_B.fill(BVAL)
+
+# C matrix: left uninitialised here, it is zero-filled before each run
+h_C = numpy.empty(sizeC, dtype=numpy.float32)
+
+print "\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n"
+
+# The sequential reference run is disabled: only the skip message is printed.
+# To enable it, uncomment the seq_mat_mul_sdot and results calls below.
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    print "Skipping as this takes a long time to run!"
+    #seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+    # While the call above stays commented out, this only times the print
+    run_time = time() - start_time
+    #results(Mdim, Ndim, Pdim, h_C, run_time)
+
+
+# Set up OpenCL: let pyopencl choose a context and create a command queue on it
+context = cl.create_some_context()
+queue = cl.CommandQueue(context)
+
+# Reset host buffers - just to play it safe.
+# Allocated directly as float32 so no temporary float64 array is created.
+h_A = numpy.empty(sizeA, dtype=numpy.float32)
+h_A.fill(AVAL)
+h_B = numpy.empty(sizeB, dtype=numpy.float32)
+h_B.fill(BVAL)
+h_C = numpy.empty(sizeC, dtype=numpy.float32)
+
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... Naive
+#--------------------------------------------------------------------------------
+
+# Create OpenCL buffers: A and B are read-only and initialised from the host
+# arrays; C is write-only and sized in bytes to match h_C.
+d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_A)
+d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_B)
+d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)
+
+# Read the kernel source; the with-block closes the file once it is read.
+# The path is relative to the current working directory.
+with open("../C_elem.cl") as kernel_file:
+    kernelsource = kernel_file.read()
+program = cl.Program(context, kernelsource).build()
+
+print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", Ndim, "======\n"
+
+# Do the multiplication COUNT times
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    # 2D launch with one work-item per element of C; a local size of None
+    # leaves the work-group size to the OpenCL implementation.
+    program.mmul(queue, (Ndim, Mdim), None, numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
+    # Wait for the kernel to complete so the interval covers its execution
+    queue.finish()
+
+    run_time = time() - start_time
+
+    # Copy the result back to the host (outside the timed region) and check it
+    cl.enqueue_copy(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item
+#--------------------------------------------------------------------------------
+
+# Read the kernel source; the with-block closes the file once it is read.
+# The device buffers d_a, d_b and d_c created for the naive kernel are reused.
+with open("../C_row.cl") as kernel_file:
+    kernelsource = kernel_file.read()
+program = cl.Program(context, kernelsource).build()
+print "\n===== OpenCL, matrix mult, C row per work item, order", Ndim, "======\n"
+# Do the multiplication COUNT times
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    # 1D launch of Ndim work-items in work-groups of ORDER // 16 work-items.
+    # "//" keeps the work-group size an integer even under true division.
+    program.mmul(queue, (Ndim,), (ORDER // 16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
+    # Wait for the kernel to complete so the interval covers its execution
+    queue.finish()
+
+    run_time = time() - start_time
+
+    # Copy the result back to the host (outside the timed region) and check it
+    cl.enqueue_copy(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row in private memory
+#--------------------------------------------------------------------------------
+
+# Read the kernel source; the with-block closes the file once it is read.
+# The device buffers d_a, d_b and d_c created for the naive kernel are reused.
+with open("../C_row_priv.cl") as kernel_file:
+    kernelsource = kernel_file.read()
+program = cl.Program(context, kernelsource).build()
+print "\n===== OpenCL, matrix mult, C row, A row in priv mem, order", Ndim, "======\n"
+# Do the multiplication COUNT times
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    # 1D launch of Ndim work-items in work-groups of ORDER // 16 work-items.
+    # "//" keeps the work-group size an integer even under true division.
+    program.mmul(queue, (Ndim,), (ORDER // 16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim), d_a, d_b, d_c)
+    # Wait for the kernel to complete so the interval covers its execution
+    queue.finish()
+
+    run_time = time() - start_time
+
+    # Copy the result back to the host (outside the timed region) and check it
+    cl.enqueue_copy(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row private, B col local
+#--------------------------------------------------------------------------------
+
+# Read the kernel source; the with-block closes the file once it is read.
+# The device buffers d_a, d_b and d_c created for the naive kernel are reused.
+with open("../C_row_priv_bloc.cl") as kernel_file:
+    kernelsource = kernel_file.read()
+program = cl.Program(context, kernelsource).build()
+print "\n===== OpenCL, mat mult, C row, priv A, B cols loc, order", Ndim, "======\n"
+
+# Local-memory argument of Pdim float32 values (presumably one column of B;
+# confirm against the kernel).  It is loop-invariant, so it is created once
+# here rather than inside the timed region.
+localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * Pdim)
+
+# Do the multiplication COUNT times
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    # 1D launch of Ndim work-items in work-groups of ORDER // 16 work-items.
+    # "//" keeps the work-group size an integer even under true division.
+    program.mmul(queue, (Ndim,), (ORDER // 16,), numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
+        d_a, d_b, d_c, localmem)
+    # Wait for the kernel to complete so the interval covers its execution
+    queue.finish()
+
+    run_time = time() - start_time
+
+    # Copy the result back to the host (outside the timed region) and check it
+    cl.enqueue_copy(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ...  A and B in block form in local memory
+#--------------------------------------------------------------------------------
+
+# Read the kernel source; the with-block closes the file once it is read.
+# The device buffers d_a, d_b and d_c created for the naive kernel are reused.
+with open("../C_block_form.cl") as kernel_file:
+    kernelsource = kernel_file.read()
+program = cl.Program(context, kernelsource).build()
+print "\n===== OpenCL, A and B in block form in local memory, order", Ndim, "======\n"
+blockSize = 16
+
+# Two local-memory arguments, each sized for a blockSize x blockSize tile of
+# float32 values.  They are loop-invariant, so they are created once here
+# rather than inside the timed region.
+localmem1 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
+localmem2 = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * blockSize * blockSize)
+
+# Do the multiplication COUNT times
+for i in range(COUNT):
+    h_C.fill(0.0)
+    start_time = time()
+
+    # 2D launch over all of C in work-groups of blockSize x blockSize
+    program.mmul(queue, (Ndim, Mdim), (blockSize, blockSize),
+        numpy.int32(Mdim), numpy.int32(Ndim), numpy.int32(Pdim),
+        d_a, d_b, d_c, localmem1, localmem2)
+    # Wait for the kernel to complete so the interval covers its execution
+    queue.finish()
+
+    run_time = time() - start_time
+
+    # Copy the result back to the host (outside the timed region) and check it
+    cl.enqueue_copy(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)