reduction.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <cutil_inline.h>
void reduction_gold(float* odata, float* idata, const unsigned int len)
{
  *odata = 0;
  for (int i = 0; i < len; i++) *odata += idata[i];
}
__global__ void reduction(float *g_odata, float *g_idata)
{
  // dynamically allocated shared memory
  extern __shared__ float temp[];

  int tid = threadIdx.x;

  // first, each thread loads data into shared memory
  temp[tid] = g_idata[tid];

  // next, we perform binary tree reduction
  for (int d = blockDim.x >> 1; d > 0; d >>= 1) {
    __syncthreads();  // ensure previous step completed
    if (tid < d) temp[tid] += temp[tid + d];
  }

  // finally, first thread puts result into global memory
  if (tid == 0) g_odata[0] = temp[0];
}
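
// illustrative trace of the reduction loop in the kernel above: with
// blockDim.x = 8 and temp[] = {a0,...,a7}, the iterations are
//   d=4: temp[0..3] += temp[4..7]
//   d=2: temp[0..1] += temp[2..3]
//   d=1: temp[0]    += temp[1]
// so after log2(blockDim.x) steps temp[0] holds the sum of all eight values;
// note this scheme assumes blockDim.x is a power of two, as it is in main()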
int main(int argc, char** argv)
{
  int    num_elements, num_threads, mem_size, shared_mem_size;
  float *h_data, *reference, sum;
  float *d_idata, *d_odata;

  cutilDeviceInit(argc, argv);

  num_elements = 512;
  num_threads  = num_elements;
  mem_size     = sizeof(float) * num_elements;

  // allocate host memory to store the input data
  // and initialize to integer values between 0 and 1000
  h_data = (float*) malloc(mem_size);
  for (int i = 0; i < num_elements; i++)
    h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));

  // compute reference solution on the host
  reference = (float*) malloc(mem_size);
  reduction_gold(&sum, h_data, num_elements);

  // allocate device memory for input and output arrays
  cudaSafeCall(cudaMalloc((void**)&d_idata, mem_size));
  cudaSafeCall(cudaMalloc((void**)&d_odata, sizeof(float)));

  // copy host memory to device input array
  cudaSafeCall(cudaMemcpy(d_idata, h_data, mem_size, cudaMemcpyHostToDevice));

  // execute the kernel
  shared_mem_size = sizeof(float) * num_elements;
  reduction<<<1,num_threads,shared_mem_size>>>(d_odata, d_idata);
  cudaCheckMsg("reduction kernel execution failed");
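
  // note: the third launch parameter is the number of bytes of dynamic shared
  // memory per block; it backs the "extern __shared__ float temp[]" array in
  // the kernel, so it must be at least num_threads*sizeof(float) here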
  // copy result from device to host
  cudaSafeCall(cudaMemcpy(h_data, d_odata, sizeof(float),
                          cudaMemcpyDeviceToHost));

  // check results
  printf("reduction error = %f\n", h_data[0]-sum);

  // cleanup memory
  free(h_data);
  free(reference);
  cudaSafeCall(cudaFree(d_idata));
  cudaSafeCall(cudaFree(d_odata));

  // CUDA exit -- needed to flush printf write buffer
  cudaDeviceReset();
}
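
// a possible build and run sequence, assuming a cutil_inline.h providing
// cutilDeviceInit, cudaSafeCall and cudaCheckMsg is on the include path
// (these helpers come from legacy SDK / course support code, not from this
// file; the include path below is a placeholder, and some cutil
// distributions also require linking a cutil library):
//
//   nvcc reduction.cu -I$CUTIL_INC -o reduction
//   ./reduction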