diff --git a/Exercises/Exercise01/Julia/device_info.jl b/Exercises/Exercise01/Julia/device_info.jl
new file mode 100644
index 0000000..261b922
--- /dev/null
+++ b/Exercises/Exercise01/Julia/device_info.jl
@@ -0,0 +1,51 @@
+#
+# Display Device Information
+#
+# Script to print out some information about the OpenCL devices
+# and platforms available on your system
+#
+# History: C++ version written by Tom Deakin, 2012
+#          Ported to Python by Tom Deakin, July 2013
+#
+
+import OpenCL
+const cl = OpenCL
+
+# create a list of all the platform ids
+platforms = cl.platforms()
+
+println("\nNumber of OpenCL platforms: $(length(platforms))")
+println("\n-----------------------------\n")
+
+# info for each platform
+for p in platforms
+
+    # print out some info
+    @printf("Platform: %s\n", p[:name])
+    @printf("Vendor: %s\n", p[:vendor])
+    @printf("Version: %s\n", p[:version])
+
+    # discover all the devices
+    devices = cl.devices(p)
+    @printf("Number of devices: %s\n", length(devices))
+
+    for d in devices
+        println("\t-----------------------------")
+        # print out some information about the devices
+        @printf("\t\tName: %s\n", d[:name])
+        @printf("\t\tVersion: %s\n", d[:version])
+        @printf("\t\tMax. Compute Units: %s\n", d[:max_compute_units])
+        @printf("\t\tLocal Memory Size: %i KB\n", d[:local_mem_size] / 1024)
+        @printf("\t\tGlobal Memory Size: %i MB\n", d[:global_mem_size] / (1024^2))
+        @printf("\t\tMax Alloc Size: %i MB\n", d[:max_mem_alloc_size] / (1024^2))
+        @printf("\t\tMax Work-group Size: %s\n", d[:max_work_group_size])
+
+        # find the maximum dimensions of the work-groups
+        dim = d[:max_work_item_size]
+        @printf("\t\tMax Work-item Dims: %s\n", dim)
+        println("\t-----------------------------")
+    end
+
+    print("\n-------------------------")
+end
+
diff --git a/Exercises/Exercise03/Julia/deviceinfo.jl b/Exercises/Exercise03/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise03/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise03/Julia/vadd.jl b/Exercises/Exercise03/Julia/vadd.jl
new file mode 100644
index 0000000..15b0359
--- /dev/null
+++ b/Exercises/Exercise03/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
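+    // guard against indices past the end of the vectors: the enqueued
+    // global size may be rounded up to a multiple of the work-group size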
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
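+# for example, a read-write buffer that reuses the host array's memory
+# could hypothetically be created with
+#     cl.Buffer(Float32, ctx, (:rw, :use), hostbuf=h_a)
+# (not used here -- below we copy into read-only buffers instead)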
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise04/Julia/deviceinfo.jl b/Exercises/Exercise04/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise04/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise04/Julia/vadd.jl b/Exercises/Exercise04/Julia/vadd.jl
new file mode 100644
index 0000000..55d77cc
--- /dev/null
+++ b/Exercises/Exercise04/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    unsigned int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
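+# note: size(h_a) is the tuple (LENGTH,), so the global range launches
+# one work-item per vector element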
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise05/Julia/deviceinfo.jl b/Exercises/Exercise05/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise05/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise05/Julia/vadd.jl b/Exercises/Exercise05/Julia/vadd.jl
new file mode 100644
index 0000000..15b0359
--- /dev/null
+++ b/Exercises/Exercise05/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
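+# (cl.read blocks until the copy completes and returns a new host Array)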
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise06/Julia/helper.jl b/Exercises/Exercise06/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Exercises/Exercise06/Julia/helper.jl
@@ -0,0 +1,20 @@
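+# `error` returns the sum of squared deviations of C from the expected
+# value Pdim * AVAL * BVAL (every element of the exact product equals
+# that constant, because A and B are constant-valued)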
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Exercises/Exercise06/Julia/matmul.jl b/Exercises/Exercise06/Julia/matmul.jl
new file mode 100644
index 0000000..8f4a530
--- /dev/null
+++ b/Exercises/Exercise06/Julia/matmul.jl
@@ -0,0 +1,126 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+const kernel_source = "
+__kernel void mmul(
+    const int Mdim,
+    const int Ndim,
+    const int Pdim,
+    __global float* A,
+    __global float* B,
+    __global float* C)
+{
+
+}
+"
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
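+# the matrices are stored as flat 1d arrays: element (i, j) of a matrix
+# with ncol columns lives at index (i-1)*ncol + j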
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    #seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    global_range = (Ndim, Mdim)
+    local_range = nothing
+    evt = cl.call(queue, mmul, global_range, local_range,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Exercises/Exercise09/Julia/pi.jl b/Exercises/Exercise09/Julia/pi.jl
new file mode 100644
index 0000000..4077ec0
--- /dev/null
+++ b/Exercises/Exercise09/Julia/pi.jl
@@ -0,0 +1,37 @@
+#
+# This program will numerically compute the integral of
+#
+#     4/(1+x*x)
+#
+# from 0 to 1. The value of this integral is pi -- which
+# is great since it gives us an easy way to check the answer.
+#
+# This is the original sequential program.
+#
+# History: Written in C by Tim Mattson, 11/99
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+const num_steps = 100000000
+
+println("Doing $num_steps steps")
+
+const step = 1.0/num_steps
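+# midpoint rule: pi ~= step * sum(4 / (1 + x_i^2)) with x_i = (i - 0.5) * step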
+
+start_time = time()
+
+# global variables are slow in julia, so we wrap the loop in a function
+pi_sum() = begin
+    integral_sum = 0.0
+    for i in 1:num_steps
+        x = (i - 0.5) * step
+        integral_sum += 4.0 / (1.0 + x * x)
+    end
+    integral_sum
+end
+
+est_pi = step * pi_sum()
+
+run_time = time() - start_time
+
+println("\npi with $num_steps steps is $est_pi in $run_time seconds\n")
diff --git a/Solutions/Exercise04/Julia/vadd_chain.jl b/Solutions/Exercise04/Julia/vadd_chain.jl
new file mode 100644
index 0000000..6e1d5cf
--- /dev/null
+++ b/Solutions/Exercise04/Julia/vadd_chain.jl
@@ -0,0 +1,114 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors at a time in a chain (C=A+B; D=C+E; F=D+G)
+# Asks the user to select a device at runtime
+#
+# History: Initial version based on vadd.c, written by Tim Mattson, June 2011
+#          Ported to C++ Wrapper API by Benedict Gaster, September 2011
+#          Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a, b, e, and g vectors and fill with random float values
+# create empty vectors for c, d, and f
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+h_c = Array(Float32, LENGTH)
+h_d = Array(Float32, LENGTH)
+h_e = rand(Float32, LENGTH)
+h_f = Array(Float32, LENGTH)
+h_g = rand(Float32, LENGTH)
+
+# create the input (a, b, e, g) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+d_e = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_e)
+d_g = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_g)
+# create the output (c, d, f) arrays in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_d = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_f = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# call the kernel again with different arguments
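+# each call consumes the previous output: d_c feeds the second add and
+# d_d feeds the third, so d_f ends up holding a + b + e + g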
+cl.call(queue, vadd, size(h_e), nothing, d_e, d_c, d_d, uint32(LENGTH))
+cl.call(queue, vadd, size(h_g), nothing, d_g, d_d, d_f, uint32(LENGTH))
+
+# copy back the results from the compute device
+# copy!(queue, dst, src) follows the same interface as julia's built-in copy!
+cl.copy!(queue, h_f, d_f)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]
+    tmp -= h_f[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
+                "h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])")
+    end
+end
+
+# summarize results
+println("3 vector adds to find F = A + B + E + G: $correct out of $LENGTH results were correct")
diff --git a/Solutions/Exercise05/Julia/vadd_abc.jl b/Solutions/Exercise05/Julia/vadd_abc.jl
new file mode 100644
index 0000000..6bfcb63
--- /dev/null
+++ b/Solutions/Exercise05/Julia/vadd_abc.jl
@@ -0,0 +1,87 @@
+#
+# Vadd
+#
+# Element-wise addition of three vectors at a time (R = A + B + C)
+# Asks the user to select a device at runtime
+#
+# History: Initial version based on vadd.c, written by Tim Mattson, June 2011
+#          Ported to C++ Wrapper API by Benedict Gaster, September 2011
+#          Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum r = a + b + c
+#
+# Input:  a, b and c, float vectors of length count
+# Output: r, a float vector of length count holding the sum a + b + c
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    __global float* r,
+    const unsigned int count)
+{
+    unsigned int i = get_global_id(0);
+    if (i < count)
+        r[i] = a[i] + b[i] + c[i];
+}"
+
+# create a compute context
+device, ctx, queue = cl.create_compute_context()
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a, b and c vectors and fill with random float values
+# (the result array will be created when reading back from the device)
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+h_c = rand(Float32, LENGTH)
+
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+d_c = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_c)
+
+# create the output (r) buffer in device memory
+d_r = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the input
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, d_r, uint32(LENGTH))
+
+# read the results back from the compute device
+# by convention:
+#   cl.(action) calls are blocking
+#   cl.enqueue_(action) calls are async/non-blocking
+h_r = cl.read(queue, d_r)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i] + h_c[i]
+    # compute the deviation of expected and output result
+    tmp -= h_r[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i]) h_r $(h_r[i])")
+    end
+end
+
+# summarize results
+println("R = A + B + C: $correct out of $LENGTH results were correct")
diff --git a/Solutions/Exercise06/Julia/helper.jl b/Solutions/Exercise06/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise06/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise06/Julia/matmul.jl b/Solutions/Exercise06/Julia/matmul.jl
new file mode 100644
index 0000000..8cdaaca
--- /dev/null
+++ b/Solutions/Exercise06/Julia/matmul.jl
@@ -0,0 +1,136 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+const kernel_source = "
+__kernel void mmul(
+    const int Mdim,
+    const int Ndim,
+    const int Pdim,
+    __global float* A,
+    __global float* B,
+    __global float* C)
+{
+    int k;
+    int i = get_global_id(0);
+    int j = get_global_id(1);
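+    // one work-item per element: work-item (i, j) computes C[i][j]
+    // as the dot product of row i of A with column j of B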
+    float tmp;
+    if ((i < Ndim) && (j < Mdim))
+    {
+        tmp = 0.0;
+        for (k = 0; k < Pdim; k++)
+            tmp += A[i*Ndim+k] * B[k*Pdim+j];
+        C[i*Ndim+j] = tmp;
+    }
+}
+"
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    #seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
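+# (the :profile flag is what makes evt[:profile_duration] available on
+# the events returned by kernel calls below)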
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    global_range = (Ndim, Mdim)
+    local_range = nothing
+    evt = cl.call(queue, mmul, global_range, local_range,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise07/Julia/helper.jl b/Solutions/Exercise07/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise07/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise07/Julia/matmul.jl b/Solutions/Exercise07/Julia/matmul.jl
new file mode 100644
index 0000000..5f48368
--- /dev/null
+++ b/Solutions/Exercise07/Julia/matmul.jl
@@ -0,0 +1,161 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... Naive
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_elem.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim, Mdim), nothing,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_row.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
+
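+# one work-item per row of C: global size Ndim, local size ORDER/16 = 32
+# work-items per work-group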
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row in private memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_row_priv.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise08/Julia/helper.jl b/Solutions/Exercise08/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise08/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise08/Julia/matmul.jl b/Solutions/Exercise08/Julia/matmul.jl
new file mode 100644
index 0000000..30069a2
--- /dev/null
+++ b/Solutions/Exercise08/Julia/matmul.jl
@@ -0,0 +1,185 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... Naive
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_elem.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim, Mdim), nothing,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_row.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row in private memory,
+# B columns in local memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_row_priv_bloc.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row, priv A, B, cols loc, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    localmem = cl.LocalMem(Float32, Pdim)
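+    # cl.LocalMem reserves room for Pdim floats of __local memory per
+    # work-group; the kernel stages columns of B there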
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c, localmem)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... A and B in block form in local memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_block_form.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, A and B in block form in local memory, order $Ndim ====")
+blocksize = 16
+
+for i in 1:COUNT
+    fill!(h_C, float32(0.0))
+    localmem1 = cl.LocalMem(Float32, blocksize^2)
+    localmem2 = cl.LocalMem(Float32, blocksize^2)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c, localmem1, localmem2)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise09/Julia/pi_ocl.jl b/Solutions/Exercise09/Julia/pi_ocl.jl
new file mode 100644
index 0000000..d6d9f0e
--- /dev/null
+++ b/Solutions/Exercise09/Julia/pi_ocl.jl
@@ -0,0 +1,88 @@
+#
+# Pi reduction
+#
+# Numeric integration to estimate pi
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, May 2010
+#          Ported to the C++ Wrapper API by Benedict R. Gaster, September 2011
+#          C++ version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#
+# Some constant values
+const INSTEPS = 512*512*512
+const ITERS = 262144
+
+# Set some default values:
+# Default number of steps (updated later to a device-preferable value)
+const in_nsteps = INSTEPS
+
+# Default number of iterations
+const niters = ITERS
+
+# create context, queue and build program
+device, ctx, queue = cl.create_compute_context()
+
+kernelsource = open(readall, joinpath(src_dir, "../pi_ocl.cl"))
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# pi is already defined in julia
+pi_kernel = cl.Kernel(program, "pi")
+
+# get the max work group size for the kernel pi on the device
+work_group_size = device[:max_work_group_size]
+
+# now that we know the size of the work_groups, we can set the number
+# of work groups, the actual number of steps, and the step size
+nwork_groups = int(in_nsteps / (work_group_size * niters))
+
+if nwork_groups < 1
+    # you can get opencl object info through the obj[:symbol] syntax
+    # or cl.info(obj, :symbol)
+    nwork_groups = device[:max_compute_units]
+    work_group_size = int(in_nsteps / (nwork_groups * niters))
+end
+
+nsteps = work_group_size * niters * nwork_groups
+step_size = 1.0 / nsteps
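+# each work-group folds its work-items' contributions into a single
+# partial sum, so the host only has to add up nwork_groups values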
+
+# vector to hold partial sum
+h_psum = Array(Float32, nwork_groups)
+
+println("$nwork_groups work groups of size $work_group_size.")
+println("$nsteps integration steps")
+
+d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+
+# start timer
+rtime = time()
+
+# Execute the kernel over the entire range of our 1d input data set
+# using the maximum number of work group items for this device
+# Set the global and local size as tuples
+global_size = (nwork_groups * work_group_size,)
+local_size = (work_group_size,)
+localmem = cl.LocalMem(Float32, work_group_size)
+
+cl.call(queue, pi_kernel, global_size, local_size,
+        int32(niters), float32(step_size), localmem, d_partial_sums)
+
+cl.copy!(queue, h_psum, d_partial_sums)
+
+# complete the sum and compute the final integral value
+pi_res = sum(h_psum) * step_size
+
+# stop the timer
+rtime = time() - rtime
+
+println("The calculation ran in $rtime secs")
+println("pi=$pi_res for $nsteps steps")
diff --git a/Solutions/ExerciseA/Julia/pi_vocl.jl b/Solutions/ExerciseA/Julia/pi_vocl.jl
new file mode 100644
index 0000000..917cd99
--- /dev/null
+++ b/Solutions/ExerciseA/Julia/pi_vocl.jl
@@ -0,0 +1,131 @@
+# Pi reduction - vectorized
+#
+# Numeric integration to estimate pi
+# Asks the user to select a device at runtime
+# Vector size must be present as a CLI argument
+#
+# History: C version written by Tim Mattson, May 2010
+#          Ported to the C++ Wrapper API by Benedict R. Gaster, September 2011
+#          C++ version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+# Some constant values
+INSTEPS = 512 * 512 * 512
+ITERS = -1
+WGS = -1
+NAME = ""
+
+if length(ARGS) < 1
+    info("Usage: julia pi_vocl.jl [num] (where num = 1, 4, or 8)")
+    exit(1)
+end
+vector_size = int(ARGS[1])
+
+if vector_size == 1
+    ITERS = 262144
+    WGS = 8
+elseif vector_size == 4
+    ITERS = 65536   # (262144/4)
+    WGS = 32
+elseif vector_size == 8
+    ITERS = 32768   # (262144/8)
+    WGS = 64
+else
+    warn("Invalid vector size")
+    exit(1)
+end
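+# wider vectors do proportionally fewer iterations per work-item
+# (262144 / vector width) but use larger work-groups (8, 32, 64)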
+
+# Set some default values:
+# Default number of steps (updated later to a device-preferable value)
+in_nsteps = INSTEPS
+
+# Default number of iterations
+niters = ITERS
+work_group_size = WGS
+
+# Create context, queue and build program
+
+#---------------------------------------------
+# Uncomment to switch between gpu/cpu devices
+#---------------------------------------------
+#device = first(cl.devices(:gpu))
+#device = first(cl.devices(:cpu))
+device, ctx, queue = cl.create_compute_context()
+
+kernelsource = open(readall, joinpath(src_dir, "../pi_vocl.cl"))
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+if vector_size == 1
+    pi_kernel = cl.Kernel(program, "pi")
+elseif vector_size == 4
+    pi_kernel = cl.Kernel(program, "pi_vec4")
+elseif vector_size == 8
+    pi_kernel = cl.Kernel(program, "pi_vec8")
+end
+
+# Now that we know the size of the work_groups, we can set the number of work
+# groups, the actual number of steps, and the step size
+nwork_groups = int(in_nsteps / (work_group_size * niters))
+
+# get the max work group size for the kernel on our device
+if vector_size == 1
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+elseif vector_size == 4
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+elseif vector_size == 8
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+end
+
+if max_size > work_group_size
+    work_group_size = max_size
+    nwork_groups = int(in_nsteps / (work_group_size * niters))
+end
+
+if nwork_groups < 1
+    nwork_groups = device[:max_compute_units]
+    work_group_size = int(in_nsteps / (nwork_groups * niters))
+end
+
+nsteps = work_group_size * niters * nwork_groups
+step_size = 1.0 / nsteps
+
+# vector to hold partial sum
+h_psum = Array(Float32, nwork_groups)
+
+println("$nwork_groups work groups of size $work_group_size.")
+println("$nsteps integration steps")
+
+d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+
+# start timer
+rtime = time()
+
+# Execute the kernel over the entire range of our 1d input data set
+# using the maximum number of work group items for this device
+# Set the global and local size as tuples
+global_size = (nwork_groups * work_group_size,)
+local_size = (work_group_size,)
+localmem = cl.LocalMem(Float32, work_group_size)
+
+cl.call(queue, pi_kernel, global_size, local_size,
+        int32(niters), float32(step_size), localmem, d_partial_sums)
+
+cl.copy!(queue, h_psum, d_partial_sums)
+
+# complete the sum and compute the final integral value
+pi_res = sum(h_psum) * step_size
+
+# stop the timer
+rtime = time() - rtime
+
+println("The calculation ran in $rtime secs")
+println("pi=$pi_res for $nsteps steps")