Put pad into pyramid
Richard Zhao committed May 4, 2017
1 parent 5ac4469 commit 238b44e
Showing 3 changed files with 115 additions and 44 deletions.
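In short, this commit moves the padding step into constructImgPyramids itself: each pyramid level is padded on the GPU with NPP border-copy routines before it is copied back to the host, and the separate cu::pad pass in oflow.cpp is commented out. Below is a rough sketch of the buffer sizing the new code relies on; the helper name paddedLevelBytes is mine, and elemSize is assumed to be the byte size of one CV_32FC3 pixel.

#include <cstddef>
#include <npp.h>  // Npp32f

// Sketch only: bytes needed for one padded CV_32FC3 pyramid level.
// The diff allocates the padded device buffers once, at the finest level's
// size, and reuses them for coarser levels, whose padded ROIs are smaller.
static size_t paddedLevelBytes(int width, int height, int padding) {
  const int elemSize  = 3 * sizeof(Npp32f);  // assumed bytes per 3-channel float pixel
  const int padWidth  = width  + 2 * padding;
  const int padHeight = height + 2 * padding;
  return static_cast<size_t>(padWidth) * padHeight * elemSize;
}

Allocating pDevicePaddedI, pDevicePaddedIx, and pDevicePaddedIy at this size for level 0 is therefore enough for every subsequent level.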
129 changes: 100 additions & 29 deletions src/kernels/pyramid.cpp
@@ -32,7 +32,7 @@ namespace cu {
void constructImgPyramids(
const cv::Mat& I,
cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
int nLevels) {
int padding, int nLevels) {

// Timing
auto start_total = now();
@@ -69,16 +69,28 @@ namespace cu {
double shiftX = 0.0;
double shiftY = 0.0;

// Padding params
int padWidth = 2 * padding + width;
int padHeight = 2 * padding + height;
const Npp32f PAD_VAL[3] = { 0.0, 0.0, 0.0 };

std::cout << "[start] constructImgPyramids: processing "
<< width << "x" << height << " image" << std::endl;

// Allocate device memory
// Allocate device memory (to account for padding, too)
auto start_cuda_malloc = now();
Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy, *pDeviceTmp, *pDeviceKernel;
Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy;
Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
Npp32f *pDeviceTmp, *pDeviceKernel;

checkCudaErrors( cudaMalloc((void**) &pDeviceI, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );

checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI, padWidth * padHeight * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) );

checkCudaErrors( cudaMalloc((void**) &pDeviceI, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );

Expand Down Expand Up @@ -112,13 +124,6 @@ namespace cu {
);
compute_time += calc_print_elapsed("sobel: Ixs[0]", start_dx);

auto start_cp_dx = now();
Ixs[0].create(height, width, CV_32FC3);
checkCudaErrors(
cudaMemcpy(Ixs[0].data, pDeviceIx,
width * height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);

// dy's
auto start_dy = now();
NPP_CHECK_NPP(
@@ -133,11 +138,53 @@
);
compute_time += calc_print_elapsed("sobel: Iys[0]", start_dy);

////////////////////////////////////////////////////////////////////////////////////////////////
// Pad level 0: I, dx, dy
////////////////////////////////////////////////////////////////////////////////////////////////


NppiSize oPadSize = { padWidth, padHeight };
int nDstStep = oPadSize.width * elemSize;

// Pad original
NPP_CHECK_NPP(
nppiCopyReplicateBorder_32f_C3R (
pDeviceI, nSrcStep, oSize,
pDevicePaddedI, nDstStep, oPadSize, padding, padding) );

// Pad dx, dy
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIx, nSrcStep, oSize,
pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIy, nSrcStep, oSize,
pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );

////////////////////////////////////////////////////////////////////////////////////////////////
// Copy padded level 0 (I, dx, dy) to host
////////////////////////////////////////////////////////////////////////////////////////////////

auto start_cp_Is0 = now();
Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
checkCudaErrors(
cudaMemcpy(Is[0].data, pDevicePaddedI,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);

auto start_cp_dx = now();
Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
checkCudaErrors(
cudaMemcpy(Ixs[0].data, pDevicePaddedIx,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);

auto start_cp_dy = now();
Iys[0].create(height, width, CV_32FC3);
Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
checkCudaErrors(
cudaMemcpy(Iys[0].data, pDeviceIy,
width * height * elemSize, cudaMemcpyDeviceToHost) );
cudaMemcpy(Iys[0].data, pDevicePaddedIy,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);


@@ -203,28 +250,49 @@ namespace cu {
);
compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);

// Allocate host destinations
auto start_host_alloc = now();
Is[i].create(dstRect.height, dstRect.width, CV_32FC3);
Ixs[i].create(dstRect.height, dstRect.width, CV_32FC3);
Iys[i].create(dstRect.height, dstRect.width, CV_32FC3);
//////////////////////////////////////////////////////////////////////////////////////////////
// Pad I, dx, dy
//////////////////////////////////////////////////////////////////////////////////////////////

padWidth = width + 2 * padding;
padHeight = height + 2 * padding;
NppiSize oPadSize = { padWidth, padHeight };
nDstStep = oPadSize.width * elemSize;

// Pad original
NPP_CHECK_NPP(
nppiCopyReplicateBorder_32f_C3R (
pDeviceI, nSrcStep, oSize,
pDevicePaddedI, nDstStep, oPadSize, padding, padding) );

std::cout << "Is[" << i << "]: " << Is[i].size() << " channels: " << Is[i].channels()
<< " type: " << Is[i].type() << std::endl;
// Pad dx, dy
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIx, nSrcStep, oSize,
pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIy, nSrcStep, oSize,
pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );

// Allocate host destinations
auto start_host_alloc = now();
Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
compute_time += calc_print_elapsed("host alloc", start_host_alloc);

// Copy over data
auto start_cp = now();
checkCudaErrors(
cudaMemcpy(Is[i].data, pDeviceI,
dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
cudaMemcpy(Is[i].data, pDevicePaddedI,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(Ixs[i].data, pDeviceIx,
dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
cudaMemcpy(Ixs[i].data, pDevicePaddedIx,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(Iys[i].data, pDeviceIy,
dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
cudaMemcpy(Iys[i].data, pDevicePaddedIy,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);

}
@@ -233,6 +301,9 @@ namespace cu {
cudaFree(pDeviceI);
cudaFree(pDeviceIx);
cudaFree(pDeviceIy);
cudaFree(pDevicePaddedI);
cudaFree(pDevicePaddedIx);
cudaFree(pDevicePaddedIy);
cudaFree(pDeviceTmp);
cudaFree(pDeviceKernel);

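In short, this file now pads each pyramid level on the GPU before downloading it: the image gets a replicated border, the gradients get a zero border, and the host cv::Mat destinations are created at the padded size. Condensed, the per-level border handling looks roughly like the hypothetical helper below. This is a sketch only: padLevel and its parameter list are my own framing, NPP_CHECK_NPP error checking is omitted, and elemSize is assumed to be the byte size of one CV_32FC3 pixel; the two NPP calls, the replicate-vs-zero padding choice, and the step arithmetic mirror the diff above.

#include <npp.h>

// Sketch of the padding step added per pyramid level: the image gets a
// replicated border, the gradients get a zero (constant) border.
static void padLevel(const Npp32f* pDeviceI,
                     const Npp32f* pDeviceIx, const Npp32f* pDeviceIy,
                     Npp32f* pDevicePaddedI,
                     Npp32f* pDevicePaddedIx, Npp32f* pDevicePaddedIy,
                     NppiSize oSize, int nSrcStep, int padding, int elemSize) {
  const NppiSize oPadSize = { oSize.width + 2 * padding,
                              oSize.height + 2 * padding };
  const int nDstStep = oPadSize.width * elemSize;   // destination step in bytes
  const Npp32f PAD_VAL[3] = { 0.0f, 0.0f, 0.0f };

  // Image: replicate edge pixels into the border.
  nppiCopyReplicateBorder_32f_C3R(pDeviceI, nSrcStep, oSize,
                                  pDevicePaddedI, nDstStep, oPadSize,
                                  padding, padding);

  // Gradients: fill the border with zeros.
  nppiCopyConstBorder_32f_C3R(pDeviceIx, nSrcStep, oSize,
                              pDevicePaddedIx, nDstStep, oPadSize,
                              padding, padding, PAD_VAL);
  nppiCopyConstBorder_32f_C3R(pDeviceIy, nSrcStep, oSize,
                              pDevicePaddedIy, nDstStep, oPadSize,
                              padding, padding, PAD_VAL);
}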
2 changes: 1 addition & 1 deletion src/kernels/pyramid.h
@@ -31,7 +31,7 @@ namespace cu {
void constructImgPyramids(
const cv::Mat& I,
cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
int nLevels);
int padding, int nLevels);

}

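With the extra padding parameter in the prototype, callers pass the desired border width alongside the number of levels. A minimal call sketch, assuming I0, the option struct op, and the cv::Mat pyramid arrays are set up as in oflow.cpp below:

// Each side of every pyramid level is padded by op.patch_size pixels on the
// device before download, so the separate cu::pad pass is no longer needed.
cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats,
                         op.patch_size,           // border width per side
                         op.coarsest_scale + 1);  // number of pyramid levels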
28 changes: 14 additions & 14 deletions src/oflow.cpp
@@ -78,31 +78,31 @@ namespace OFC {
gettimeofday(&start_time, NULL);

// Construct image and gradient pyramides
cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.coarsest_scale + 1);
cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.coarsest_scale + 1);
cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1);
cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1);

auto start_pad = now();

// Pad images
for (int i = 0; i <= op.coarsest_scale; ++i) {

// Replicate padding for images
cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, true);
cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, true);
// cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, true);
// cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, true);
I0s[i] = (float*) I0_mats[i].data;
I1s[i] = (float*) I1_mats[i].data;

// Zero pad for gradients
cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
// cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);

I0xs[i] = (float*) I0x_mats[i].data;
I0ys[i] = (float*) I0y_mats[i].data;
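Since the pyramids now arrive from the GPU already padded, the host-side loop above shrinks to pointer bookkeeping. Roughly (a sketch; the I1 gradient assignments are assumed to follow the same pattern further down the file):

for (int i = 0; i <= op.coarsest_scale; ++i) {
  // Images (replicate-padded on the device).
  I0s[i]  = (float*) I0_mats[i].data;
  I1s[i]  = (float*) I1_mats[i].data;
  // Gradients (zero-padded on the device).
  I0xs[i] = (float*) I0x_mats[i].data;
  I0ys[i] = (float*) I0y_mats[i].data;
  // I1xs[i] / I1ys[i] are assumed to be assigned the same way.
}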
