diff --git a/Exercises/Exercise01/Julia/device_info.jl b/Exercises/Exercise01/Julia/device_info.jl
new file mode 100644
index 0000000..261b922
--- /dev/null
+++ b/Exercises/Exercise01/Julia/device_info.jl
@@ -0,0 +1,51 @@
+#
+# Display Device Information
+#
+# Script to print out some information about the OpenCL devices
+# and platforms available on your system
+#
+# History: C++ version written by Tom Deakin, 2012
+#          Ported to Python by Tom Deakin, July 2013
+#
+
+import OpenCL
+const cl = OpenCL
+
+# create a list of all the platform ids
+platforms = cl.platforms()
+
+println("\nNumber of OpenCL platforms: $(length(platforms))")
+println("\n-----------------------------\n")
+
+# info for each platform
+for p in platforms
+
+    # print out some info
+    @printf("Platform: %s\n", p[:name])
+    @printf("Vendor: %s\n", p[:vendor])
+    @printf("Version: %s\n", p[:version])
+
+    # discover all the devices
+    devices = cl.devices(p)
+    @printf("Number of devices: %s\n", length(devices))
+
+    for d in devices
+        println("\t-----------------------------")
+        # print out some information about the devices
+        @printf("\t\tName: %s\n", d[:name])
+        @printf("\t\tVersion: %s\n", d[:version])
+        @printf("\t\tMax. Compute Units: %s\n", d[:max_compute_units])
+        @printf("\t\tLocal Memory Size: %i KB\n", d[:local_mem_size] / 1024)
+        @printf("\t\tGlobal Memory Size: %i MB\n", d[:global_mem_size] / (1024^2))
+        @printf("\t\tMax Alloc Size: %i MB\n", d[:max_mem_alloc_size] / (1024^2))
+        @printf("\t\tMax Work-group Size: %s\n", d[:max_work_group_size])
+
+        # find the maximum dimensions of the work-groups
+        dim = d[:max_work_item_size]
+        @printf("\t\tMax Work-item Dims: %s\n", dim)
+        println("\t-----------------------------")
+    end
+
+    print("\n-------------------------")
+end
+
diff --git a/Exercises/Exercise03/Julia/deviceinfo.jl b/Exercises/Exercise03/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise03/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise03/Julia/vadd.jl b/Exercises/Exercise03/Julia/vadd.jl
new file mode 100644
index 0000000..15b0359
--- /dev/null
+++ b/Exercises/Exercise03/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
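+    // guard against indices past the end of the vectors: the enqueued
+    // global size may be rounded up to a multiple of the work-group size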
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
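+# for example, a read-write buffer that reuses the host array's memory
+# could hypothetically be created with
+#     cl.Buffer(Float32, ctx, (:rw, :use), hostbuf=h_a)
+# (not used here -- below we copy into read-only buffers instead)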
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise04/Julia/deviceinfo.jl b/Exercises/Exercise04/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise04/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise04/Julia/vadd.jl b/Exercises/Exercise04/Julia/vadd.jl
new file mode 100644
index 0000000..55d77cc
--- /dev/null
+++ b/Exercises/Exercise04/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    unsigned int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
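+# note: size(h_a) is the tuple (LENGTH,), so the global range launches
+# one work-item per vector element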
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise05/Julia/deviceinfo.jl b/Exercises/Exercise05/Julia/deviceinfo.jl
new file mode 100644
index 0000000..c391c6f
--- /dev/null
+++ b/Exercises/Exercise05/Julia/deviceinfo.jl
@@ -0,0 +1,20 @@
+#
+# Device Info
+#
+# Function to output key parameters about the input OpenCL device
+#
+# History: C version written by Tim Mattson, June 2010
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+
+function output_device_info(d::OpenCL.Device)
+    n  = d[:name]
+    dt = d[:device_type]
+    v  = d[:platform][:vendor]
+    mc = d[:max_compute_units]
+    str = "Device is $n $dt from $v with a max of $mc compute units"
+    println(str)
+end
+
diff --git a/Exercises/Exercise05/Julia/vadd.jl b/Exercises/Exercise05/Julia/vadd.jl
new file mode 100644
index 0000000..15b0359
--- /dev/null
+++ b/Exercises/Exercise05/Julia/vadd.jl
@@ -0,0 +1,101 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors (c = a + b)
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, December 2009
+#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+include("deviceinfo.jl")
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+output_device_info(device)
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a and b vectors and fill with random float values
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+
+# create the input (a, b) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+# create the read-only input buffers a and b
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+# create the output (c) array in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# read back the results from the compute device
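+# (cl.read blocks until the copy completes and returns a new host Array)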
+h_c = cl.read(queue, d_c)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i]
+    tmp -= h_c[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
+    end
+end
+
+# summarize results
+println("C = A + B: $correct out of $LENGTH results were correct")
diff --git a/Exercises/Exercise06/Julia/helper.jl b/Exercises/Exercise06/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Exercises/Exercise06/Julia/helper.jl
@@ -0,0 +1,20 @@
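+# `error` returns the sum of squared deviations of C from the expected
+# value Pdim * AVAL * BVAL (every element of the exact product equals
+# that constant, because A and B are constant-valued)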
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Exercises/Exercise06/Julia/matmul.jl b/Exercises/Exercise06/Julia/matmul.jl
new file mode 100644
index 0000000..8f4a530
--- /dev/null
+++ b/Exercises/Exercise06/Julia/matmul.jl
@@ -0,0 +1,126 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+const kernel_source = "
+__kernel void mmul(
+    const int Mdim,
+    const int Ndim,
+    const int Pdim,
+    __global float* A,
+    __global float* B,
+    __global float* C)
+{
+
+}
+"
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
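+# the matrices are stored as flat 1d arrays: element (i, j) of a matrix
+# with ncol columns lives at index (i-1)*ncol + j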
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    #seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    global_range = (Ndim, Mdim)
+    local_range = nothing
+    evt = cl.call(queue, mmul, global_range, local_range,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Exercises/Exercise09/Julia/pi.jl b/Exercises/Exercise09/Julia/pi.jl
new file mode 100644
index 0000000..4077ec0
--- /dev/null
+++ b/Exercises/Exercise09/Julia/pi.jl
@@ -0,0 +1,37 @@
+#
+# This program will numerically compute the integral of
+#
+#     4/(1+x*x)
+#
+# from 0 to 1. The value of this integral is pi -- which
+# is great since it gives us an easy way to check the answer.
+#
+# This is the original sequential program.
+#
+# History: Written in C by Tim Mattson, 11/99
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+const num_steps = 100000000
+
+println("Doing $num_steps steps")
+
+const step = 1.0/num_steps
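+# midpoint rule: pi ~= step * sum(4 / (1 + x_i^2)) with x_i = (i - 0.5) * step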
+
+start_time = time()
+
+# global variables are slow in julia, so we wrap the loop in a function
+pi_sum() = begin
+    integral_sum = 0.0
+    for i in 1:num_steps
+        x = (i - 0.5) * step
+        integral_sum += 4.0 / (1.0 + x * x)
+    end
+    integral_sum
+end
+
+est_pi = step * pi_sum()
+
+run_time = time() - start_time
+
+println("\npi with $num_steps steps is $est_pi in $run_time seconds\n")
diff --git a/Solutions/Exercise04/Julia/vadd_chain.jl b/Solutions/Exercise04/Julia/vadd_chain.jl
new file mode 100644
index 0000000..6e1d5cf
--- /dev/null
+++ b/Solutions/Exercise04/Julia/vadd_chain.jl
@@ -0,0 +1,114 @@
+#
+# Vadd
+#
+# Element-wise addition of two vectors at a time in a chain (C=A+B; D=C+E; F=D+G)
+# Asks the user to select a device at runtime
+#
+# History: Initial version based on vadd.c, written by Tim Mattson, June 2011
+#          Ported to C++ Wrapper API by Benedict Gaster, September 2011
+#          Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum c = a + b
+#
+# Input:  a and b, float vectors of length count
+# Output: c, a float vector of length count holding the sum a + b
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    const unsigned int count)
+{
+    int i = get_global_id(0);
+    if (i < count)
+        c[i] = a[i] + b[i];
+}
+"
+
+# create a compute context
+
+# this selects the fastest opencl device available
+# and creates a context and queue for using
+# the selected device
+device, ctx, queue = cl.create_compute_context()
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a, b, e, and g vectors and fill with random float values
+# create empty vectors for c, d, and f
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+h_c = Array(Float32, LENGTH)
+h_d = Array(Float32, LENGTH)
+h_e = rand(Float32, LENGTH)
+h_f = Array(Float32, LENGTH)
+h_g = rand(Float32, LENGTH)
+
+# create the input (a, b, e, g) arrays in device memory and copy data from the host
+
+# buffers can be passed memory flags:
+# {:r = readonly, :w = writeonly, :rw = read_write (default)}
+
+# buffers can also be passed flags for allocation:
+# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
+
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+d_e = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_e)
+d_g = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_g)
+# create the output (c, d, f) arrays in device memory
+d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_d = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_f = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the 1d input
+# cl.call is blocking: it accepts a queue, the kernel, global / local work sizes,
+# and the kernel's arguments.
+
+# here we call the kernel with the work size set to the number of elements and a
+# local work size of nothing. This enables the opencl runtime to optimize the
+# local size for simple kernels
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))
+
+# call the kernel again with different arguments
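+# each call consumes the previous output: d_c feeds the second add and
+# d_d feeds the third, so d_f ends up holding a + b + e + g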
+cl.call(queue, vadd, size(h_e), nothing, d_e, d_c, d_d, uint32(LENGTH))
+cl.call(queue, vadd, size(h_g), nothing, d_g, d_d, d_f, uint32(LENGTH))
+
+# copy back the results from the compute device
+# copy!(queue, dst, src) follows the same interface as julia's built-in copy!
+cl.copy!(queue, h_f, d_f)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]
+    tmp -= h_f[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
+                "h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])")
+    end
+end
+
+# summarize results
+println("3 vector adds to find F = A + B + E + G: $correct out of $LENGTH results were correct")
diff --git a/Solutions/Exercise05/Julia/vadd_abc.jl b/Solutions/Exercise05/Julia/vadd_abc.jl
new file mode 100644
index 0000000..6bfcb63
--- /dev/null
+++ b/Solutions/Exercise05/Julia/vadd_abc.jl
@@ -0,0 +1,87 @@
+#
+# Vadd
+#
+# Element-wise addition of three vectors at a time (R = A + B + C)
+# Asks the user to select a device at runtime
+#
+# History: Initial version based on vadd.c, written by Tim Mattson, June 2011
+#          Ported to C++ Wrapper API by Benedict Gaster, September 2011
+#          Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# tolerance used in floating point comparisons
+TOL = 1e-3
+
+# length of vectors a, b, c
+LENGTH = 1024
+
+# Kernel: vadd
+#
+# To compute the elementwise sum r = a + b + c
+#
+# Input:  a, b and c, float vectors of length count
+# Output: r, a float vector of length count holding the sum a + b + c
+
+kernelsource = "
+__kernel void vadd(
+    __global float* a,
+    __global float* b,
+    __global float* c,
+    __global float* r,
+    const unsigned int count)
+{
+    unsigned int i = get_global_id(0);
+    if (i < count)
+        r[i] = a[i] + b[i] + c[i];
+}"
+
+# create a compute context
+device, ctx, queue = cl.create_compute_context()
+
+# create the compute program and build it
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# create a, b and c vectors and fill with random float values
+# (the result array will be created when reading back from the device)
+h_a = rand(Float32, LENGTH)
+h_b = rand(Float32, LENGTH)
+h_c = rand(Float32, LENGTH)
+
+d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
+d_c = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_c)
+
+# create the output (r) buffer in device memory
+d_r = cl.Buffer(Float32, ctx, :w, LENGTH)
+
+# create the kernel
+vadd = cl.Kernel(program, "vadd")
+
+# execute the kernel over the entire range of the input
+cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, d_r, uint32(LENGTH))
+
+# read the results back from the compute device
+# by convention:
+#   cl.(action) calls are blocking
+#   cl.enqueue_(action) calls are async/non-blocking
+h_r = cl.read(queue, d_r)
+
+# test the results
+correct = 0
+for i in 1:LENGTH
+    tmp = h_a[i] + h_b[i] + h_c[i]
+    # compute the deviation of expected and output result
+    tmp -= h_r[i]
+    if tmp^2 < TOL^2
+        correct += 1
+    else
+        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i]) h_r $(h_r[i])")
+    end
+end
+
+# summarize results
+println("R = A + B + C: $correct out of $LENGTH results were correct")
diff --git a/Solutions/Exercise06/Julia/helper.jl b/Solutions/Exercise06/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise06/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise06/Julia/matmul.jl b/Solutions/Exercise06/Julia/matmul.jl
new file mode 100644
index 0000000..8cdaaca
--- /dev/null
+++ b/Solutions/Exercise06/Julia/matmul.jl
@@ -0,0 +1,136 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+const kernel_source = "
+__kernel void mmul(
+    const int Mdim,
+    const int Ndim,
+    const int Pdim,
+    __global float* A,
+    __global float* B,
+    __global float* C)
+{
+    int k;
+    int i = get_global_id(0);
+    int j = get_global_id(1);
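+    // one work-item per element: work-item (i, j) computes C[i][j]
+    // as the dot product of row i of A with column j of B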
+    float tmp;
+    if ((i < Ndim) && (j < Mdim))
+    {
+        tmp = 0.0;
+        for (k = 0; k < Pdim; k++)
+            tmp += A[i*Ndim+k] * B[k*Pdim+j];
+        C[i*Ndim+j] = tmp;
+    }
+}
+"
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    #seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
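+# (the :profile flag is what makes evt[:profile_duration] available on
+# the events returned by kernel calls below)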
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    global_range = (Ndim, Mdim)
+    local_range = nothing
+    evt = cl.call(queue, mmul, global_range, local_range,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise07/Julia/helper.jl b/Solutions/Exercise07/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise07/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise07/Julia/matmul.jl b/Solutions/Exercise07/Julia/matmul.jl
new file mode 100644
index 0000000..5f48368
--- /dev/null
+++ b/Solutions/Exercise07/Julia/matmul.jl
@@ -0,0 +1,161 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... Naive
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_elem.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim, Mdim), nothing,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_row.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
+
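+# one work-item per row of C: global size Ndim, local size ORDER/16 = 32
+# work-items per work-group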
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row in private memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_row_priv.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise08/Julia/helper.jl b/Solutions/Exercise08/Julia/helper.jl
new file mode 100644
index 0000000..eec59ae
--- /dev/null
+++ b/Solutions/Exercise08/Julia/helper.jl
@@ -0,0 +1,20 @@
+function error{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T})
+    cval = float32(Pdim * AVAL * BVAL)
+    errsq = float32(0.0)
+    for i in 1:Ndim
+        for j in 1:Mdim
+            err = C[(i-1)*Ndim+j] - cval
+            errsq += err^2
+        end
+    end
+    return errsq
+end
+
+function results{T}(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time)
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
+    println("$run_time seconds at $mflops MFLOPS")
+    errsq = error(Mdim, Ndim, Pdim, C)
+    if isnan(errsq) || errsq > TOL
+        println("Errors in multiplication: $errsq")
+    end
+end
diff --git a/Solutions/Exercise08/Julia/matmul.jl b/Solutions/Exercise08/Julia/matmul.jl
new file mode 100644
index 0000000..30069a2
--- /dev/null
+++ b/Solutions/Exercise08/Julia/matmul.jl
@@ -0,0 +1,185 @@
+# Matrix Multiplication Driver
+#
+# This is a driver program to test various ways of computing
+# the product:
+#     C = A * B
+#
+# A and B are constant matrices, square and the order is
+# set as a constant, ORDER (see the definitions below). This is so
+# we can make a quick test of the multiplication result.
+#
+# History: C++ version written by Tim Mattson, August 2010
+#          Modified by Simon McIntosh-Smith, September 2011
+#          Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#### Definitions ###
+
+# Order of the square matrices A, B and C
+ORDER = 512
+
+# A elements are constant and equal to AVAL
+AVAL = 3.0
+
+# B elements are constant and equal to BVAL
+BVAL = 5.0
+
+# tolerance used in floating point comparisons
+TOL = 0.001
+
+# Max dim for NDRange
+DIM = 2
+
+# number of times to do each multiplication
+COUNT = 1
+
+# Helper functions
+include("helper.jl")
+
+# A[N,P], B[P,M], C[N,M]
+Ndim = ORDER
+Pdim = ORDER
+Mdim = ORDER
+
+# number of elements in each matrix
+sizeA = Ndim * Pdim
+sizeB = Pdim * Mdim
+sizeC = Ndim * Mdim
+
+# fill the host matrices with constant values
+h_A = fill(float32(AVAL), sizeA)
+h_B = fill(float32(BVAL), sizeB)
+h_C = Array(Float32, sizeC)
+
+# 20% improvement using @inbounds
+function seq_mat_mul_sdot{T}(Mdim::Int, Ndim::Int, Pdim::Int,
+                             A::Array{T}, B::Array{T}, C::Array{T})
+    for i in 1:Ndim
+        for j in 1:Mdim
+            tmp = zero(Float32)
+            for k in 1:Pdim
+                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+            end
+            @inbounds C[(i-1)*Ndim+j] = tmp
+        end
+    end
+end
+
+info("=== Julia, matrix mult (dot prod), order $ORDER ===")
+
+# force compilation
+seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    t1 = time()
+    seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C)
+    t2 = time()
+    results(Mdim, Ndim, Pdim, h_C, t2 - t1)
+end
+
+# set up OpenCL
+ctx = cl.create_some_context()
+
+# You can enable profiling events on the queue
+# by calling the constructor with the :profile flag
+queue = cl.CmdQueue(ctx, :profile)
+
+# create OpenCL Buffers
+d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... Naive
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_elem.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim, Mdim), nothing,
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item
+#--------------------------------------------------------------------------------
+
+kernel_source = open(readall, joinpath(src_dir, "../C_row.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... C row per work item, A row in private memory,
+# B columns in local memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_row_priv_bloc.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, C row, priv A, B, cols loc, order $Ndim ====")
+
+for i in 1:COUNT
+    fill!(h_C, 0.0)
+    localmem = cl.LocalMem(Float32, Pdim)
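+    # cl.LocalMem reserves room for Pdim floats of __local memory per
+    # work-group; the kernel stages columns of B there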
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c, localmem)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
+
+#--------------------------------------------------------------------------------
+# OpenCL matrix multiplication ... A and B in block form in local memory
+#--------------------------------------------------------------------------------
+kernel_source = open(readall, joinpath(src_dir, "../C_block_form.cl"))
+prg = cl.Program(ctx, source=kernel_source) |> cl.build!
+mmul = cl.Kernel(prg, "mmul")
+
+info("=== OpenCL, matrix mult, A and B in block form in local memory, order $Ndim ====")
+blocksize = 16
+
+for i in 1:COUNT
+    fill!(h_C, float32(0.0))
+    localmem1 = cl.LocalMem(Float32, blocksize^2)
+    localmem2 = cl.LocalMem(Float32, blocksize^2)
+    evt = cl.call(queue, mmul, (Ndim,), (int(ORDER/16),),
+                  int32(Mdim), int32(Ndim), int32(Pdim),
+                  d_a, d_b, d_c, localmem1, localmem2)
+    # profiling events are measured in ns
+    run_time = evt[:profile_duration] / 1e9
+    cl.copy!(queue, h_C, d_c)
+    results(Mdim, Ndim, Pdim, h_C, run_time)
+end
diff --git a/Solutions/Exercise09/Julia/pi_ocl.jl b/Solutions/Exercise09/Julia/pi_ocl.jl
new file mode 100644
index 0000000..d6d9f0e
--- /dev/null
+++ b/Solutions/Exercise09/Julia/pi_ocl.jl
@@ -0,0 +1,88 @@
+#
+# Pi reduction
+#
+# Numeric integration to estimate pi
+# Asks the user to select a device at runtime
+#
+# History: C version written by Tim Mattson, May 2010
+#          Ported to the C++ Wrapper API by Benedict R. Gaster, September 2011
+#          C++ version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+#
+# Some constant values
+const INSTEPS = 512*512*512
+const ITERS = 262144
+
+# Set some default values:
+# Default number of steps (updated later to a device-preferable value)
+const in_nsteps = INSTEPS
+
+# Default number of iterations
+const niters = ITERS
+
+# create context, queue and build program
+device, ctx, queue = cl.create_compute_context()
+
+kernelsource = open(readall, joinpath(src_dir, "../pi_ocl.cl"))
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+# pi is already defined in julia
+pi_kernel = cl.Kernel(program, "pi")
+
+# get the max work group size for the kernel pi on the device
+work_group_size = device[:max_work_group_size]
+
+# now that we know the size of the work_groups, we can set the number
+# of work groups, the actual number of steps, and the step size
+nwork_groups = int(in_nsteps / (work_group_size * niters))
+
+if nwork_groups < 1
+    # you can get opencl object info through the obj[:symbol] syntax
+    # or cl.info(obj, :symbol)
+    nwork_groups = device[:max_compute_units]
+    work_group_size = int(in_nsteps / (nwork_groups * niters))
+end
+
+nsteps = work_group_size * niters * nwork_groups
+step_size = 1.0 / nsteps
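+# each work-group folds its work-items' contributions into a single
+# partial sum, so the host only has to add up nwork_groups values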
+
+# vector to hold partial sum
+h_psum = Array(Float32, nwork_groups)
+
+println("$nwork_groups work groups of size $work_group_size.")
+println("$nsteps integration steps")
+
+d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+
+# start timer
+rtime = time()
+
+# Execute the kernel over the entire range of our 1d input data set
+# using the maximum number of work group items for this device
+# Set the global and local size as tuples
+global_size = (nwork_groups * work_group_size,)
+local_size = (work_group_size,)
+localmem = cl.LocalMem(Float32, work_group_size)
+
+cl.call(queue, pi_kernel, global_size, local_size,
+        int32(niters), float32(step_size), localmem, d_partial_sums)
+
+cl.copy!(queue, h_psum, d_partial_sums)
+
+# complete the sum and compute the final integral value
+pi_res = sum(h_psum) * step_size
+
+# stop the timer
+rtime = time() - rtime
+
+println("The calculation ran in $rtime secs")
+println("pi=$pi_res for $nsteps steps")
diff --git a/Solutions/ExerciseA/Julia/pi_vocl.jl b/Solutions/ExerciseA/Julia/pi_vocl.jl
new file mode 100644
index 0000000..917cd99
--- /dev/null
+++ b/Solutions/ExerciseA/Julia/pi_vocl.jl
@@ -0,0 +1,131 @@
+# Pi reduction - vectorized
+#
+# Numeric integration to estimate pi
+# Asks the user to select a device at runtime
+# Vector size must be present as a CLI argument
+#
+# History: C version written by Tim Mattson, May 2010
+#          Ported to the C++ Wrapper API by Benedict R. Gaster, September 2011
+#          C++ version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
+#          Ported to Python by Tom Deakin, July 2013
+#          Ported to Julia by Jake Bolewski, Nov 2013
+
+import OpenCL
+const cl = OpenCL
+
+# get the directory of this file
+# (used for test runner)
+src_dir = dirname(Base.source_path())
+
+# Some constant values
+INSTEPS = 512 * 512 * 512
+ITERS = -1
+WGS = -1
+NAME = ""
+
+if length(ARGS) < 1
+    info("Usage: julia pi_vocl.jl [num] (where num = 1, 4, or 8)")
+    exit(1)
+end
+vector_size = int(ARGS[1])
+
+if vector_size == 1
+    ITERS = 262144
+    WGS = 8
+elseif vector_size == 4
+    ITERS = 65536   # (262144/4)
+    WGS = 32
+elseif vector_size == 8
+    ITERS = 32768   # (262144/8)
+    WGS = 64
+else
+    warn("Invalid vector size")
+    exit(1)
+end
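+# wider vectors do proportionally fewer iterations per work-item
+# (262144 / vector width) but use larger work-groups (8, 32, 64)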
+
+# Set some default values:
+# Default number of steps (updated later to a device-preferable value)
+in_nsteps = INSTEPS
+
+# Default number of iterations
+niters = ITERS
+work_group_size = WGS
+
+# Create context, queue and build program
+
+#---------------------------------------------
+# Uncomment to switch between gpu/cpu devices
+#---------------------------------------------
+#device = first(cl.devices(:gpu))
+#device = first(cl.devices(:cpu))
+device, ctx, queue = cl.create_compute_context()
+
+kernelsource = open(readall, joinpath(src_dir, "../pi_vocl.cl"))
+program = cl.Program(ctx, source=kernelsource) |> cl.build!
+
+if vector_size == 1
+    pi_kernel = cl.Kernel(program, "pi")
+elseif vector_size == 4
+    pi_kernel = cl.Kernel(program, "pi_vec4")
+elseif vector_size == 8
+    pi_kernel = cl.Kernel(program, "pi_vec8")
+end
+
+# Now that we know the size of the work_groups, we can set the number of work
+# groups, the actual number of steps, and the step size
+nwork_groups = int(in_nsteps / (work_group_size * niters))
+
+# get the max work group size for the kernel on our device
+if vector_size == 1
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+elseif vector_size == 4
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+elseif vector_size == 8
+    max_size = cl.work_group_info(pi_kernel, :size, device)
+end
+
+if max_size > work_group_size
+    work_group_size = max_size
+    nwork_groups = int(in_nsteps / (work_group_size * niters))
+end
+
+if nwork_groups < 1
+    nwork_groups = device[:max_compute_units]
+    work_group_size = int(in_nsteps / (nwork_groups * niters))
+end
+
+nsteps = work_group_size * niters * nwork_groups
+step_size = 1.0 / nsteps
+
+# vector to hold partial sum
+h_psum = Array(Float32, nwork_groups)
+
+println("$nwork_groups work groups of size $work_group_size.")
+println("$nsteps integration steps")
+
+d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+
+# start timer
+rtime = time()
+
+# Execute the kernel over the entire range of our 1d input data set
+# using the maximum number of work group items for this device
+# Set the global and local size as tuples
+global_size = (nwork_groups * work_group_size,)
+local_size = (work_group_size,)
+localmem = cl.LocalMem(Float32, work_group_size)
+
+cl.call(queue, pi_kernel, global_size, local_size,
+        int32(niters), float32(step_size), localmem, d_partial_sums)
+
+cl.copy!(queue, h_psum, d_partial_sums)
+
+# complete the sum and compute the final integral value
+pi_res = sum(h_psum) * step_size
+
+# stop the timer
+rtime = time() - rtime
+
+println("The calculation ran in $rtime secs")
+println("pi=$pi_res for $nsteps steps")