// 11_csdn_add_example: element-wise addition of two float arrays on the host and on the GPU.
#include "cuda_runtime.h"
#include "device_launch_parameters.h" // threadIdx
#include <stdio.h> // io
#include <time.h> // time_t
#include <stdlib.h> // rand
#include <memory.h> //memset
// CHECK(call): evaluates a CUDA runtime call exactly once and terminates the
// program if the call did not return cudaSuccess. On failure it reports the
// source file, the line, the numeric error code and the cudaGetErrorString()
// text on stderr, then calls exit(1).
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement (safe inside an unbraced if/else), and `call` is
// parenthesized so that any expression can be passed as the argument.
#define CHECK(call)                                                   \
    do                                                                \
    {                                                                 \
        const cudaError_t error_code = (call);                        \
        if (error_code != cudaSuccess)                                \
        {                                                             \
            fprintf(stderr, "CUDA Error:\n");                         \
            fprintf(stderr, " File: %s\n", __FILE__);                 \
            fprintf(stderr, " Line: %d\n", __LINE__);                 \
            fprintf(stderr, " Error code: %d\n", (int)error_code);    \
            fprintf(stderr, " Error text: %s\n",                      \
                    cudaGetErrorString(error_code));                  \
            exit(1);                                                  \
        }                                                             \
    } while (0)
/**
 * Compares a host-computed reference array against the array produced by the
 * device, element by element, and prints the verdict to stdout.
 *
 * @param hostRef   reference values computed on the host (N elements)
 * @param deviceRef values copied back from the device (N elements)
 * @param N         number of elements to compare
 *
 * Prints "Arrays match!" when every pair differs by at most eps in absolute
 * value; otherwise reports the first mismatching index and stops.
 */
void checkResult(float* hostRef, float* deviceRef, const int N)
{
    const double eps = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++)
    {
        const double diff = (double)hostRef[i] - (double)deviceRef[i];
        // Two-sided test: a device value that is LARGER than the host value
        // yields a negative difference and must be reported as well.
        if (diff > eps || diff < -eps)
        {
            match = 0;
            printf("\nArrays do not match\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], deviceRef[i], i);
            break;
        }
    }
    if (match)
        printf("Arrays match!\n");
}
/**
 * Fills an array with pseudo-random values in the range [0.0, 25.5].
 *
 * @param p destination array (at least N floats)
 * @param N number of elements to fill
 *
 * The generator is seeded from the wall clock only on the first call.
 * Re-seeding on every call would restart rand() from the same seed whenever
 * two calls land in the same second, filling both arrays with identical data.
 */
void initialData(float* p, const int N)
{
    static int seeded = 0;
    if (!seeded)
    {
        srand((unsigned int)time(NULL)); // seed once per process
        seeded = 1;
    }
    for (int i = 0; i < N; i++)
    {
        // Low 8 bits of rand() -> 0..255, scaled to 0.0..25.5.
        p[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
// Debug helper: prints the calling thread's block and thread coordinates
// together with the grid and block dimensions, using device-side printf.
__device__ void checkIndex(void) {
    const uint3 blk = blockIdx;
    const uint3 thr = threadIdx;
    const dim3 grid = gridDim;
    const dim3 block = blockDim;
    printf("blockIdx: (%d, %d, %d) threadIdx: (%d, %d, %d) \n"
           "gridDim: (%d, %d, %d) blockDim: (%d, %d, %d) \n ==========================\n",
           blk.x, blk.y, blk.z,
           thr.x, thr.y, thr.z,
           grid.x, grid.y, grid.z,
           block.x, block.y, block.z);
}
// Host (CPU) reference implementation: writes a[k] + b[k] into c[k] for every
// k in [0, N). Does nothing when N <= 0.
void sumArraysOnHost(float* a, float* b, float* c, const int N)
{
    int idx = 0;
    while (idx < N)
    {
        const float lhs = a[idx];
        const float rhs = b[idx];
        c[idx] = lhs + rhs;
        ++idx;
    }
}
// Device kernel: the host-side loop is replaced by one thread per element.
// Each thread prints its indices via checkIndex() and then computes
// c[i] = a[i] + b[i] for its own global index i.
//
// Launch layout: 1D grid of 1D blocks. The global index is built from
// blockIdx.x, blockDim.x and threadIdx.x, so more than one block may be used.
// Threads whose index is >= N do no memory access, so the total thread count
// may exceed N.
//
// a, b : input arrays, each holding at least N floats
// c    : output array, holding at least N floats
// N    : number of elements to add
__global__ void sumArraysOnDevice(float* a, float* b, float* c, const int N)
{
    checkIndex();
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
    {
        c[i] = a[i] + b[i];
    }
}
// Entry point: adds two random float arrays on the GPU and verifies the
// result against a CPU reference.
//
// Steps: select device, allocate and initialize host buffers, allocate device
// buffers, copy inputs to the device, launch the kernel, copy the result
// back, compare with the host result, release all memory.
// Returns 0 on success and 1 if host allocation fails; any failing CUDA call
// terminates the program through CHECK.
int main(void)
{
    const int device = 0;
    CHECK(cudaSetDevice(device)); // select the GPU to use

    // 1. Allocate memory
    // host memory
    const int nElem = 32;
    // The buffers hold floats, so size them by the element type rather than
    // by the type of the counter variable.
    const size_t nBytes = nElem * sizeof(float);
    float* h_a = (float*)malloc(nBytes);
    float* h_b = (float*)malloc(nBytes);
    float* hostRef = (float*)malloc(nBytes); // result computed on the host
    float* gpuRef = (float*)malloc(nBytes);  // result copied back from the device
    if (h_a == NULL || h_b == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_a);
        free(h_b);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    // Initialize inputs and clear both result buffers.
    initialData(h_a, nElem);
    initialData(h_b, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // device memory
    float* d_a = NULL;
    float* d_b = NULL;
    float* d_c = NULL;
    CHECK(cudaMalloc((void**)&d_a, nBytes));
    CHECK(cudaMalloc((void**)&d_b, nBytes));
    CHECK(cudaMalloc((void**)&d_c, nBytes));

    // 2. Transfer data from host to device
    CHECK(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice));

    // 3. Launch the device kernel from the host: one block of nElem threads.
    dim3 block(nElem);
    dim3 grid(nElem / block.x);
    sumArraysOnDevice<<<grid, block>>>(d_a, d_b, d_c, nElem);
    CHECK(cudaGetLastError());      // reports an invalid launch configuration
    CHECK(cudaDeviceSynchronize()); // reports errors raised while the kernel ran

    // 4. Transfer data from device to host
    CHECK(cudaMemcpy(gpuRef, d_c, nBytes, cudaMemcpyDeviceToHost));

    // Verify the device result against the host computation.
    sumArraysOnHost(h_a, h_b, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    // 5. Release memory
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(hostRef);
    free(gpuRef);
    return 0;
}