Skip to content

Commit

Permalink
Remove cv::Mat's from pyramid kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Zhao committed May 4, 2017
1 parent 238b44e commit a353098
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 143 deletions.
59 changes: 13 additions & 46 deletions src/kernels/pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ using namespace timer;
namespace cu {

/**
* Perform border padding with constant (0) or replication on src and store it in dest.
* Accepts 3-channel 32-bit float matrices.
* Perform border padding with constant (0) or replication on src.
* Accepts 3-channel 32-bit float matrices. Returns pointer to device dest.
*
* Params:
* src input image.
Expand All @@ -42,13 +42,10 @@ namespace cu {
* right right padding
* replicate whether to replicate or constant
*/
void pad(
const cv::Mat& src, cv::Mat& dest, int top,
int bottom, int left, int right, bool replicate) {

if (src.type() != CV_32FC3) {
throw std::invalid_argument("pad: invalid input matrix type");
}
Npp32f* pad(
Npp32f* src,
int width, int height,
int top, int bottom, int left, int right, bool replicate) {

// Compute time of relevant kernel
double compute_time = 0.0;
Expand All @@ -57,17 +54,11 @@ namespace cu {
int channels = 3;
size_t elemSize = 3 * sizeof(float);

cv::Size sz = src.size();
int width = sz.width;
int height = sz.height;
int destWidth = left + width + right;
int destHeight = top + height + bottom;

std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl;

// pSrc pointer to image data
Npp32f* pHostSrc = (float*) src.data;

// The width, in bytes, of the image, sometimes referred to as pitch
unsigned int nSrcStep = width * elemSize;
unsigned int nDstStep = destWidth * elemSize;
Expand All @@ -77,26 +68,15 @@ namespace cu {
NppiSize oDstSizeROI = { destWidth, destHeight };
const Npp32f padVal[3] = {0.0, 0.0, 0.0};

auto start_cuda_malloc = now();

// Allocate device memory
Npp32f* pDeviceSrc, *pDeviceDst;

checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) );
auto start_cuda_malloc = now();
Npp32f* pDeviceDst;
checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight * elemSize) );
checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );

if (!replicate)
checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
calc_print_elapsed("cudaMalloc", start_cuda_malloc);


auto start_memcpy_hd = now();

// Copy image to device
checkCudaErrors(
cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) );

calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd);

Npp32f* pDeviceSrc = src;

auto start_pad = now();

Expand All @@ -112,22 +92,9 @@ namespace cu {

compute_time += calc_print_elapsed("pad", start_pad);


auto start_memcpy_dh = now();

// Copy result to host
dest.create(destHeight, destWidth, CV_32FC3);

checkCudaErrors(
cudaMemcpy(dest.data, pDeviceDst,
destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) );

calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh);

cudaFree((void*) pDeviceSrc);
cudaFree((void*) pDeviceDst);

std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl;

return pDeviceDst;
}

}
11 changes: 6 additions & 5 deletions src/kernels/pad.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
namespace cu {

/**
* Perform border padding with constant (0) or replication on src and store it in dest.
* Accepts 3-channel 32-bit float matrices.
* Perform border padding with constant (0) or replication on src.
* Accepts 3-channel 32-bit float matrices. Returns pointer to dest
*
* Params:
* src input image.
Expand All @@ -43,9 +43,10 @@ namespace cu {
* right right padding
* replicate whether to replicate or constant pad
*/
void pad(
const cv::Mat& src, cv::Mat& dest, int top,
int bottom, int left, int right, bool replicate);
Npp32f *pad(
Npp32f* src,
int width, int height,
int top, int bottom, int left, int right, bool replicate);

}

Expand Down
49 changes: 22 additions & 27 deletions src/kernels/pyramid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ using namespace timer;
namespace cu {

void constructImgPyramids(
const cv::Mat& I,
cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
Npp32f* src, float** Is, float** Ixs, float** Iys,
int width, int height,
int padding, int nLevels) {

// Timing
Expand All @@ -42,13 +42,6 @@ namespace cu {
int channels = 3;
size_t elemSize = channels * sizeof(float);

// Setup
Is[0] = I.clone();

cv::Size sz = I.size();
int width = sz.width;
int height = sz.height;

unsigned int nSrcStep = width * elemSize;

// Gradient params
Expand Down Expand Up @@ -77,13 +70,14 @@ namespace cu {
std::cout << "[start] constructImgPyramids: processing "
<< width << "x" << height << " image" << std::endl;

Npp32f* pDeviceI = src;

// Allocate device memory (to account for padding too
auto start_cuda_malloc = now();
Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy;
Npp32f *pDeviceIx, *pDeviceIy;
Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
Npp32f *pDeviceTmp, *pDeviceKernel;

checkCudaErrors( cudaMalloc((void**) &pDeviceI, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );

Expand All @@ -99,12 +93,10 @@ namespace cu {
// Copy over initial image and kernel
auto start_memcpy_hd = now();

checkCudaErrors(
cudaMemcpy(pDeviceI, (float*) Is[0].data, width * height * elemSize, cudaMemcpyHostToDevice) );
checkCudaErrors(
cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );

calc_print_elapsed("cudaMemcpy I[0] H->D", start_memcpy_hd);
calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd);

////////////////////////////////////////////////////////////////////////////////////////////////
// Apply first gradients to Is[0]
Expand Down Expand Up @@ -153,6 +145,8 @@ namespace cu {
pDevicePaddedI, nDstStep, oPadSize, padding, padding) );

// Pad dx, dy
checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIx, nSrcStep, oSize,
Expand All @@ -167,23 +161,23 @@ namespace cu {
////////////////////////////////////////////////////////////////////////////////////////////////

auto start_cp_Is0 = now();
Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
Is[0] = new float[oPadSize.height * oPadSize.width * channels];
checkCudaErrors(
cudaMemcpy(Is[0].data, pDevicePaddedI,
cudaMemcpy(Is[0], pDevicePaddedI,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);

auto start_cp_dx = now();
Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
checkCudaErrors(
cudaMemcpy(Ixs[0].data, pDevicePaddedIx,
cudaMemcpy(Ixs[0], pDevicePaddedIx,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);

auto start_cp_dy = now();
Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
checkCudaErrors(
cudaMemcpy(Iys[0].data, pDevicePaddedIy,
cudaMemcpy(Iys[0], pDevicePaddedIy,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);

Expand Down Expand Up @@ -266,6 +260,8 @@ namespace cu {
pDevicePaddedI, nDstStep, oPadSize, padding, padding) );

// Pad dx, dy
checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
NPP_CHECK_NPP(
nppiCopyConstBorder_32f_C3R (
pDeviceIx, nSrcStep, oSize,
Expand All @@ -277,28 +273,27 @@ namespace cu {

// Allocate host destinations
auto start_host_alloc = now();
Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
Is[i] = new float[oPadSize.width * oPadSize.height * channels];
Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
compute_time += calc_print_elapsed("host alloc", start_host_alloc);

// Copy over data
auto start_cp = now();
checkCudaErrors(
cudaMemcpy(Is[i].data, pDevicePaddedI,
cudaMemcpy(Is[i], pDevicePaddedI,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(Ixs[i].data, pDevicePaddedIx,
cudaMemcpy(Ixs[i], pDevicePaddedIx,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(Iys[i].data, pDevicePaddedIy,
cudaMemcpy(Iys[i], pDevicePaddedIy,
oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);

}

// Clean up
cudaFree(pDeviceI);
cudaFree(pDeviceIx);
cudaFree(pDeviceIy);
cudaFree(pDevicePaddedI);
Expand Down
4 changes: 2 additions & 2 deletions src/kernels/pyramid.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
namespace cu {

void constructImgPyramids(
const cv::Mat& I,
cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
Npp32f* I, float** Is, float** Ixs, float** Iys,
int width, int height,
int padding, int nLevels);

}
Expand Down
57 changes: 10 additions & 47 deletions src/oflow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <opencv2/highgui/highgui.hpp> // needed for verbosity >= 3, DISVISUAL
#include <opencv2/imgproc/imgproc.hpp> // needed for verbosity >= 3, DISVISUAL

#include <nppi.h>

#include <sys/time.h> // timeof day
#include <stdio.h>

Expand Down Expand Up @@ -60,58 +62,21 @@ namespace OFC {
I1xs = new float*[op.coarsest_scale+1];
I1ys = new float*[op.coarsest_scale+1];

I0_mats = new cv::Mat[op.coarsest_scale+1];
I1_mats = new cv::Mat[op.coarsest_scale+1];
I0x_mats = new cv::Mat[op.coarsest_scale+1];
I0y_mats = new cv::Mat[op.coarsest_scale+1];
I1x_mats = new cv::Mat[op.coarsest_scale+1];
I1y_mats = new cv::Mat[op.coarsest_scale+1];

}



void OFClass::ConstructImgPyramids() {
void OFClass::ConstructImgPyramids(img_params iparams) {

// Timing structures
struct timeval start_time, end_time;
gettimeofday(&start_time, NULL);

// Construct image and gradient pyramides
cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1);
cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1);

auto start_pad = now();

// Pad images
for (int i = 0; i <= op.coarsest_scale; ++i) {

// Replicate padding for images
// cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, true);
// cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, true);
I0s[i] = (float*) I0_mats[i].data;
I1s[i] = (float*) I1_mats[i].data;

// Zero pad for gradients
// cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);
// cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
// op.patch_size, op.patch_size, false);

I0xs[i] = (float*) I0x_mats[i].data;
I0ys[i] = (float*) I0y_mats[i].data;
I1xs[i] = (float*) I1x_mats[i].data;
I1ys[i] = (float*) I1y_mats[i].data;

}

calc_print_elapsed("pad images", start_pad);
cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height,
op.patch_size, op.coarsest_scale + 1);
cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height,
op.patch_size, op.coarsest_scale + 1);

// Timing, image gradients and pyramid
if (op.verbosity > 1) {
Expand All @@ -126,16 +91,15 @@ namespace OFC {



void OFClass::calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow) {
void OFClass::calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow) {

I0 = _I0;
I1 = _I1;

std::cout << "I0 " << I0.size() << " channels: " << I0.channels()
<< " type: " << I0.type() << std::endl;
std::cout << "I0 " << _iparams.height << "x" << _iparams.width << std::endl;

printf("Constructing pyramids\n");
ConstructImgPyramids();
ConstructImgPyramids(_iparams);

if (op.verbosity > 1) cout << ", cflow " << endl;

Expand Down Expand Up @@ -185,7 +149,6 @@ namespace OFC {

}


// Timing, Grid memory allocation
if (op.verbosity>1) {

Expand Down
Loading

0 comments on commit a353098

Please sign in to comment.