-
Notifications
You must be signed in to change notification settings - Fork 12
/
cudaMallocPitch_performance.cu
109 lines (90 loc) · 4.59 KB
/
cudaMallocPitch_performance.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
/******************/
/* TEST KERNEL 2D */
/******************/
__global__ void test_kernel_2D(float * __restrict__ devPtrA, float * __restrict__ devPtrB, float * __restrict__ devPtrC, const int Nrows, const int Ncols)
{
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if ((tidx < Ncols) && (tidy < Nrows))
devPtrA[tidy * Ncols + tidx] = devPtrA[tidy * Ncols + tidx] + devPtrB[tidy * Ncols + tidx] + devPtrC[tidy * Ncols + tidx];
}
/**************************/
/* TEST KERNEL PITCHED 2D */
/**************************/
__global__ void test_kernel_Pitched_2D(float * __restrict__ devPtrA, float * __restrict__ devPtrB, float * __restrict__ devPtrC, const size_t pitchA, const size_t pitchB, const size_t pitchC, const int Nrows, const int Ncols)
{
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if ((tidx < Ncols) && (tidy < Nrows))
{
float *row_a = (float *)((char*)devPtrA + tidy * pitchA);
float *row_b = (float *)((char*)devPtrB + tidy * pitchB);
float *row_c = (float *)((char*)devPtrC + tidy * pitchC);
row_a[tidx] = row_a[tidx] + row_b[tidx] + row_c[tidx];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int Nrows = 7100;
const int Ncols = 2300;
TimingGPU timerGPU;
float *hostPtrA = (float *)malloc(Nrows * Ncols * sizeof(float));
float *hostPtrB = (float *)malloc(Nrows * Ncols * sizeof(float));
float *hostPtrC = (float *)malloc(Nrows * Ncols * sizeof(float));
float *devPtrA, *devPtrPitchedA;
float *devPtrB, *devPtrPitchedB;
float *devPtrC, *devPtrPitchedC;
size_t pitchA, pitchB, pitchC;
for (int i = 0; i < Nrows; i++)
for (int j = 0; j < Ncols; j++) {
hostPtrA[i * Ncols + j] = 1.f;
hostPtrB[i * Ncols + j] = 2.f;
hostPtrC[i * Ncols + j] = 3.f;
//printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);
}
// --- 2D non-pitched allocation and host->device memcopy
gpuErrchk(cudaMalloc(&devPtrA, Nrows * Ncols * sizeof(float)));
gpuErrchk(cudaMalloc(&devPtrB, Nrows * Ncols * sizeof(float)));
gpuErrchk(cudaMalloc(&devPtrC, Nrows * Ncols * sizeof(float)));
gpuErrchk(cudaMemcpy(devPtrA, hostPtrA, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(devPtrB, hostPtrB, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(devPtrC, hostPtrC, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
// --- 2D pitched allocation and host->device memcopy
gpuErrchk(cudaMallocPitch(&devPtrPitchedA, &pitchA, Ncols * sizeof(float), Nrows));
gpuErrchk(cudaMallocPitch(&devPtrPitchedB, &pitchB, Ncols * sizeof(float), Nrows));
gpuErrchk(cudaMallocPitch(&devPtrPitchedC, &pitchC, Ncols * sizeof(float), Nrows));
gpuErrchk(cudaMemcpy2D(devPtrPitchedA, pitchA, hostPtrA, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy2D(devPtrPitchedB, pitchB, hostPtrB, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy2D(devPtrPitchedC, pitchC, hostPtrC, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));
dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);
timerGPU.StartCounter();
test_kernel_2D << <gridSize, blockSize >> >(devPtrA, devPtrB, devPtrC, Nrows, Ncols);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
printf("Non-pitched - Time = %f; Memory = %i bytes \n", timerGPU.GetCounter(), Nrows * Ncols * sizeof(float));
timerGPU.StartCounter();
test_kernel_Pitched_2D << <gridSize, blockSize >> >(devPtrPitchedA, devPtrPitchedB, devPtrPitchedC, pitchA, pitchB, pitchC, Nrows, Ncols);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
printf("Pitched - Time = %f; Memory = %i bytes \n", timerGPU.GetCounter(), Nrows * pitchA);
//gpuErrchk(cudaMemcpy2D(hostPtr, Ncols * sizeof(float), devPtrPitched, pitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(hostPtrA, devPtrA, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(hostPtrB, devPtrB, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(hostPtrC, devPtrC, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
//for (int i = 0; i < Nrows; i++)
// for (int j = 0; j < Ncols; j++)
// printf("row %i column %i value %f \n", i, j, hostPtr[i * Ncols + j]);
return 0;
}