Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Julia OpenCL Examples #47

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions Exercises/Exercise01/Julia/device_info.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#
# Display Device Information
#
# Script to print out some information about the OpenCL devices
# and platforms available on your system
#
# History: C++ version written by Tom Deakin, 2012
#          Ported to Python by Tom Deakin, July 2013
#          Ported to Julia by Jake Bolewski, Nov 2013
#

import OpenCL
const cl = OpenCL

# query all of the OpenCL platforms available on this system
platform_list = cl.platforms()

println("\nNumber of OpenCL platforms: $(length(platform_list))")
println("\n-----------------------------\n")

# report every platform and each of its devices
for platform in platform_list

    # basic platform identification
    @printf("Platform: %s\n", platform[:name])
    @printf("Vendor: %s\n", platform[:vendor])
    @printf("Version: %s\n", platform[:version])

    # enumerate the devices belonging to this platform
    device_list = cl.devices(platform)
    @printf("Number of devices: %s\n", length(device_list))

    for dev in device_list
        println("\t-----------------------------")
        # key properties of this device (sizes converted to KB / MB)
        @printf("\t\tName: %s\n", dev[:name])
        @printf("\t\tVersion: %s\n", dev[:version])
        @printf("\t\tMax. Compute Units: %s\n", dev[:max_compute_units])
        @printf("\t\tLocal Memory Size: %i KB\n", dev[:local_mem_size] / 1024)
        @printf("\t\tGlobal Memory Size: %i MB\n", dev[:global_mem_size] / (1024^2))
        @printf("\t\tMax Alloc Size: %i MB\n", dev[:max_mem_alloc_size] / (1024^2))
        @printf("\t\tMax Work-group Size: %s\n", dev[:max_work_group_size])

        # the maximum dimensions of the work-groups
        work_item_dims = dev[:max_work_item_size]
        @printf("\t\tMax Work-item Dims: %s\n", work_item_dims)
        println("\t-----------------------------")
    end

    print("\n-------------------------")
end

20 changes: 20 additions & 0 deletions Exercises/Exercise03/Julia/deviceinfo.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Device Info
#
# Function to output key parameters about the input OpenCL device
#
# History: C version written by Tim Mattson, June 2010
# Ported to Python by Tom Deakin, July 2013
# Ported to Julia by Jake Bolewski, Nov 2013

import OpenCL

# Print a one-line summary of an OpenCL device: its name, device type,
# platform vendor, and maximum number of compute units.
function output_device_info(d::OpenCL.Device)
    name = d[:name]
    devtype = d[:device_type]
    vendor = d[:platform][:vendor]
    units = d[:max_compute_units]
    println("Device is $name $devtype from $vendor with a max of $units compute units")
end

101 changes: 101 additions & 0 deletions Exercises/Exercise03/Julia/vadd.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Vadd
#
# Element wise addition of two vectors (c = a + b)
# Asks the user to select a device at runtime
#
# History: C version written by Tim Mattson, December 2009
#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
#          Ported to Python by Tom Deakin, July 2013
#          Ported to Julia by Jake Bolewski, Nov 2013

import OpenCL
const cl = OpenCL

include("deviceinfo.jl")

# tolerance used in floating point comparisons
TOL = 1e-3

# length of vectors a, b and c
LENGTH = 1024

# Kernel: vadd
#
# To compute the elementwise sum c = a + b
#
# Input:  a and b float vectors of length count
# Output: c float vector of length count holding the sum a + b
kernelsource = "
__kernel void vadd(
    __global float* a,
    __global float* b,
    __global float* c,
    const unsigned int count)
{
    int i = get_global_id(0);
    if (i < count)
        c[i] = a[i] + b[i];
}
"

# create a compute context: this selects the fastest OpenCL device
# available and creates a context and queue for using the selected device
device, ctx, queue = cl.create_compute_context()

output_device_info(device)

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!

# create a and b vectors and fill with random float values
h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)

# buffers can be passed memory flags:
# {:r = readonly, :w = writeonly, :rw = read_write (default)}
#
# buffers can also be passed flags for allocation:
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# create the input (a, b) arrays in device memory and copy data from host
d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
# create the output (c) array in device memory
d_c = cl.Buffer(Float32, ctx, :w, LENGTH)

# create the kernel
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of the 1d input.
# cl.call is blocking; it accepts a queue, the kernel, global / local work
# sizes, and the kernel's arguments.
#
# here we call the kernel with the global work size set to the number of
# elements and a local work size of nothing.  This enables the OpenCL
# runtime to optimize the local size for simple kernels.
cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))

# read back the results from the compute device
h_c = cl.read(queue, d_c)

# test the results against a host-side reference sum
correct = 0
for i in 1:LENGTH
    tmp = h_a[i] + h_b[i]   # expected value
    tmp -= h_c[i]           # deviation from the device result
    if tmp^2 < TOL^2        # correct if squared deviation is within tolerance
        correct += 1
    else
        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
    end
end

# summarize results (this exercise computes a single vector add C = A + B;
# the previous message was copied from a later multi-add exercise)
println("C = A+B: $correct out of $LENGTH results were correct")
20 changes: 20 additions & 0 deletions Exercises/Exercise04/Julia/deviceinfo.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Device Info
#
# Function to output key parameters about the input OpenCL device
#
# History: C version written by Tim Mattson, June 2010
# Ported to Python by Tom Deakin, July 2013
# Ported to Julia by Jake Bolewski, Nov 2013

import OpenCL

# Print a one-line summary of an OpenCL device: its name, device type,
# platform vendor, and maximum number of compute units.
function output_device_info(d::OpenCL.Device)
    name = d[:name]
    devtype = d[:device_type]
    vendor = d[:platform][:vendor]
    units = d[:max_compute_units]
    println("Device is $name $devtype from $vendor with a max of $units compute units")
end

101 changes: 101 additions & 0 deletions Exercises/Exercise04/Julia/vadd.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Vadd
#
# Element wise addition of two vectors (c = a + b)
# Asks the user to select a device at runtime
#
# History: C version written by Tim Mattson, December 2009
#          C version updated by Tom Deakin and Simon McIntosh-Smith, October 2012
#          Ported to Python by Tom Deakin, July 2013
#          Ported to Julia by Jake Bolewski, Nov 2013

import OpenCL
const cl = OpenCL

include("deviceinfo.jl")

# tolerance used in floating point comparisons
TOL = 1e-3

# length of vectors a, b and c
LENGTH = 1024

# Kernel: vadd
#
# To compute the elementwise sum c = a + b
#
# Input:  a and b float vectors of length count
# Output: c float vector of length count holding the sum a + b
kernelsource = "
__kernel void vadd(
    __global float* a,
    __global float* b,
    __global float* c,
    const unsigned int count)
{
    unsigned int i = get_global_id(0);
    if (i < count)
        c[i] = a[i] + b[i];
}
"

# create a compute context: this selects the fastest OpenCL device
# available and creates a context and queue for using the selected device
device, ctx, queue = cl.create_compute_context()

output_device_info(device)

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!

# create a and b vectors and fill with random float values
h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)

# buffers can be passed memory flags:
# {:r = readonly, :w = writeonly, :rw = read_write (default)}
#
# buffers can also be passed flags for allocation:
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# create the input (a, b) arrays in device memory and copy data from host
d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
# create the output (c) array in device memory
d_c = cl.Buffer(Float32, ctx, :w, LENGTH)

# create the kernel
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of the 1d input.
# cl.call is blocking; it accepts a queue, the kernel, global / local work
# sizes, and the kernel's arguments.
#
# here we call the kernel with the global work size set to the number of
# elements and a local work size of nothing.  This enables the OpenCL
# runtime to optimize the local size for simple kernels.
cl.call(queue, vadd, size(h_a), nothing, d_a, d_b, d_c, uint32(LENGTH))

# read back the results from the compute device
h_c = cl.read(queue, d_c)

# test the results against a host-side reference sum
correct = 0
for i in 1:LENGTH
    tmp = h_a[i] + h_b[i]   # expected value
    tmp -= h_c[i]           # deviation from the device result
    if tmp^2 < TOL^2        # correct if squared deviation is within tolerance
        correct += 1
    else
        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) h_c $(h_c[i])")
    end
end

# summarize results (this exercise computes a single vector add C = A + B;
# the previous message was copied from a later multi-add exercise)
println("C = A+B: $correct out of $LENGTH results were correct")
20 changes: 20 additions & 0 deletions Exercises/Exercise05/Julia/deviceinfo.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Device Info
#
# Function to output key parameters about the input OpenCL device
#
# History: C version written by Tim Mattson, June 2010
# Ported to Python by Tom Deakin, July 2013
# Ported to Julia by Jake Bolewski, Nov 2013

import OpenCL

# Print a one-line summary of an OpenCL device: its name, device type,
# platform vendor, and maximum number of compute units.
function output_device_info(d::OpenCL.Device)
    name = d[:name]
    devtype = d[:device_type]
    vendor = d[:platform][:vendor]
    units = d[:max_compute_units]
    println("Device is $name $devtype from $vendor with a max of $units compute units")
end

Loading