-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnumber_points.cu
176 lines (131 loc) · 4.88 KB
/
number_points.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#include <iostream>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#define N 1000000
#define SEED 72
#define BLOCKSIZE 1024
struct Point
{
double x;
double y;
};
void generatePoints(struct Point *data);
__global__ void number_points(struct Point *data, double *epsilonSqr, unsigned int *total);
int main(int argc, char *argv[])
{
// Read epsilon distance from command line
if (argc != 2)
{
printf("\nIncorrect number of input parameters. Please input an epsilon distance.\n");
return 0;
}
char inputEpsilon[20];
strcpy(inputEpsilon, argv[1]);
double epsilon = atof(inputEpsilon);
// Generate random points:
struct Point *data;
data = (struct Point *) malloc(sizeof(struct Point) * N);
generatePoints(data);
omp_set_num_threads(1);
cudaError_t errCode = cudaSuccess;
if (errCode != cudaSuccess)
std::cout << "\nLast error: " << errCode << std::endl;
double *epsilonSqr = (double *) malloc(sizeof(double));
unsigned int *total = (unsigned int *) malloc(sizeof(unsigned int));
struct Point *dev_data;
double *dev_epsilonSqr;
unsigned int *dev_total;
*epsilonSqr = epsilon * epsilon;
*total = 0;
// Allocate on the device
errCode = cudaMalloc((struct Point **) &dev_data, sizeof(struct Point) * N);
if(errCode != cudaSuccess)
std::cout << "\nError: dev_data alloc error with code " << errCode << std::endl;
errCode = cudaMalloc((unsigned int **) &dev_total, sizeof(unsigned int));
if (errCode != cudaSuccess)
std::cout << "\nError: dev_total alloc error with code " << errCode << std::endl;
errCode = cudaMalloc((double **) &dev_epsilonSqr, sizeof(double));
if (errCode != cudaSuccess)
std::cout << "\nError: dev_epsilon alloc error with code " << errCode << std::endl;
// Copy data over to device
errCode = cudaMemcpy(dev_data, data, sizeof(struct Point) * N, cudaMemcpyHostToDevice);
if (errCode != cudaSuccess)
std::cout << "\nError: dev_data memcpy error with code " << errCode << std::endl;
errCode = cudaMemcpy(dev_total, total, sizeof(unsigned int), cudaMemcpyHostToDevice);
if (errCode != cudaSuccess)
std::cout << "\nError: dev_total memcpy error with code " << errCode << std::endl;
errCode = cudaMemcpy(dev_epsilonSqr, epsilonSqr, sizeof(double), cudaMemcpyHostToDevice);
if (errCode != cudaSuccess)
std::cout << "\nError: dev_epsilonSqr memcpy error with code " << errCode << std::endl;
// Calculate total blocks
const unsigned int totalBlocks = ceil(N * 1.0 / BLOCKSIZE);
double tstart=omp_get_wtime();
// Execute kernel
number_points <<< totalBlocks, BLOCKSIZE >>> (dev_data, dev_epsilonSqr, dev_total);
cudaDeviceSynchronize();
double tend = omp_get_wtime();
if (errCode != cudaSuccess)
std::cout << "\nError after kernel launch " << errCode << std::endl;
printf("\nKernel time (s): %f", tend - tstart);
// Copy data from device to host
errCode = cudaMemcpy(total, dev_total, sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (errCode != cudaSuccess)
fprintf(stderr, "\nError: \"%s\": %s", cudaGetErrorName(errCode), cudaGetErrorString(errCode));
else
printf("\nNumber of points within epsilon: %llu", (*total * 2) + N);
cudaFree(dev_data);
cudaFree(dev_epsilonSqr);
cudaFree(dev_total);
free(data);
printf("\n");
return 0;
}
/*
* Kernel - Find the total number of points within epsilon distance of each point
*/
__global__ void number_points(struct Point *data, double *epsilonSqr, unsigned int *total)
{
unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x);
if (i >= N - 1)
return;
__shared__ unsigned int totalInBlock[BLOCKSIZE];
unsigned int totalInThread = 0;
unsigned int tmpTotal;
struct Point p1 = data[i];
for (unsigned int j = i + 1; j < N; j++)
{
struct Point p2 = data[j];
double xDiff = p1.x - p2.x;
double yDiff = p1.y - p2.y;
double distance = (xDiff * xDiff) + (yDiff * yDiff);
// Increment thread total
if (distance <= *epsilonSqr)
totalInThread++;
}
// Put thread total into block total array
totalInBlock[threadIdx.x] = totalInThread;
__syncthreads();
tmpTotal = 0;
// Sum all block totals into final total
if (threadIdx.x == 0)
{
for (unsigned int index = 0; index < BLOCKSIZE; index++)
tmpTotal += totalInBlock[index];
atomicAdd(total, tmpTotal);
}
return;
}
void generatePoints(struct Point *data)
{
srand(time(0));
for (unsigned int i = 0; i < N; i++)
{
data[i].x = 1000.0 * ((double)(rand()) / RAND_MAX);
data[i].y = 1000.0 * ((double)(rand()) / RAND_MAX);
}
}