From a35309887d2fd99e3a89db49536be5330bd6003d Mon Sep 17 00:00:00 2001 From: Richard Zhao Date: Thu, 4 May 2017 18:16:10 -0400 Subject: [PATCH] Remove cv::Mat's from pyramid kernel --- src/kernels/pad.cpp | 59 ++++++++----------------------------- src/kernels/pad.h | 11 +++---- src/kernels/pyramid.cpp | 49 ++++++++++++++---------------- src/kernels/pyramid.h | 4 +-- src/oflow.cpp | 57 +++++++---------------------------- src/oflow.h | 11 ++++--- src/run_dense.cpp | 49 ++++++++++++++++++++++++------ src/sandbox/process_pad.cpp | 3 +- 8 files changed, 100 insertions(+), 143 deletions(-) diff --git a/src/kernels/pad.cpp b/src/kernels/pad.cpp index cc7f30d..6b2f098 100644 --- a/src/kernels/pad.cpp +++ b/src/kernels/pad.cpp @@ -29,8 +29,8 @@ using namespace timer; namespace cu { /** - * Perform border padding with constant (0) or replication on src and store it in dest. - * Accepts 3-channel 32-bit float matrices. + * Perform border padding with constant (0) or replication on src. + * Accepts 3-channel 32-bit float matrices. Returns pointer to device dest. * * Params: * src input image. @@ -42,13 +42,10 @@ namespace cu { * right right padding * replicate whether to replicate or constant */ - void pad( - const cv::Mat& src, cv::Mat& dest, int top, - int bottom, int left, int right, bool replicate) { - - if (src.type() != CV_32FC3) { - throw std::invalid_argument("pad: invalid input matrix type"); - } + Npp32f* pad( + Npp32f* src, + int width, int height, + int top, int bottom, int left, int right, bool replicate) { // Compute time of relevant kernel double compute_time = 0.0; @@ -57,17 +54,11 @@ namespace cu { int channels = 3; size_t elemSize = 3 * sizeof(float); - cv::Size sz = src.size(); - int width = sz.width; - int height = sz.height; int destWidth = left + width + right; int destHeight = top + height + bottom; std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl; - // pSrc pointer to image data - Npp32f* pHostSrc = (float*) src.data; - // The width, in bytes, of the image, sometimes referred to as pitch unsigned int nSrcStep = width * elemSize; unsigned int nDstStep = destWidth * elemSize; @@ -77,26 +68,15 @@ namespace cu { NppiSize oDstSizeROI = { destWidth, destHeight }; const Npp32f padVal[3] = {0.0, 0.0, 0.0}; - auto start_cuda_malloc = now(); - // Allocate device memory - Npp32f* pDeviceSrc, *pDeviceDst; - - checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) ); + auto start_cuda_malloc = now(); + Npp32f* pDeviceDst; checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight * elemSize) ); - checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) ); - + if (!replicate) + checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) ); calc_print_elapsed("cudaMalloc", start_cuda_malloc); - - auto start_memcpy_hd = now(); - - // Copy image to device - checkCudaErrors( - cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) ); - - calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd); - + Npp32f* pDeviceSrc = src; auto start_pad = now(); @@ -112,22 +92,9 @@ namespace cu { compute_time += calc_print_elapsed("pad", start_pad); - - auto start_memcpy_dh = now(); - - // Copy result to host - dest.create(destHeight, destWidth, CV_32FC3); - - checkCudaErrors( - cudaMemcpy(dest.data, pDeviceDst, - destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) ); - - calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh); - - cudaFree((void*) pDeviceSrc); - cudaFree((void*) pDeviceDst); - std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl; + + return pDeviceDst; } } diff --git a/src/kernels/pad.h b/src/kernels/pad.h index 662a30a..0c49e6c 100644 --- a/src/kernels/pad.h +++ b/src/kernels/pad.h @@ -30,8 +30,8 @@ namespace cu { /** - * Perform border padding with constant (0) or replication on src and store it in dest. - * Accepts 3-channel 32-bit float matrices. + * Perform border padding with constant (0) or replication on src. + * Accepts 3-channel 32-bit float matrices. Returns pointer to dest * * Params: * src input image. @@ -43,9 +43,10 @@ namespace cu { * right right padding * replicate whether to replicate or constant pad */ - void pad( - const cv::Mat& src, cv::Mat& dest, int top, - int bottom, int left, int right, bool replicate); + Npp32f *pad( + Npp32f* src, + int width, int height, + int top, int bottom, int left, int right, bool replicate); } diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp index 79d9a98..311935c 100644 --- a/src/kernels/pyramid.cpp +++ b/src/kernels/pyramid.cpp @@ -30,8 +30,8 @@ using namespace timer; namespace cu { void constructImgPyramids( - const cv::Mat& I, - cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys, + Npp32f* src, float** Is, float** Ixs, float** Iys, + int width, int height, int padding, int nLevels) { // Timing @@ -42,13 +42,6 @@ namespace cu { int channels = 3; size_t elemSize = channels * sizeof(float); - // Setup - Is[0] = I.clone(); - - cv::Size sz = I.size(); - int width = sz.width; - int height = sz.height; - unsigned int nSrcStep = width * elemSize; // Gradient params @@ -77,13 +70,14 @@ namespace cu { std::cout << "[start] constructImgPyramids: processing " << width << "x" << height << " image" << std::endl; + Npp32f* pDeviceI = src; + // Allocate device memory (to account for padding too auto start_cuda_malloc = now(); - Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy; + Npp32f *pDeviceIx, *pDeviceIy; Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy; Npp32f *pDeviceTmp, *pDeviceKernel; - checkCudaErrors( cudaMalloc((void**) &pDeviceI, width * height * elemSize) ); checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) ); checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) ); @@ -99,12 +93,10 @@ namespace cu { // Copy over initial image and kernel auto start_memcpy_hd = now(); - checkCudaErrors( - cudaMemcpy(pDeviceI, (float*) Is[0].data, width * height * elemSize, cudaMemcpyHostToDevice) ); checkCudaErrors( cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) ); - calc_print_elapsed("cudaMemcpy I[0] H->D", start_memcpy_hd); + calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd); //////////////////////////////////////////////////////////////////////////////////////////////// // Apply first gradients to Is[0] @@ -153,6 +145,8 @@ namespace cu { pDevicePaddedI, nDstStep, oPadSize, padding, padding) ); // Pad dx, dy + checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) ); + checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIx, nSrcStep, oSize, @@ -167,23 +161,23 @@ namespace cu { //////////////////////////////////////////////////////////////////////////////////////////////// auto start_cp_Is0 = now(); - Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3); + Is[0] = new float[oPadSize.height * oPadSize.width * channels]; checkCudaErrors( - cudaMemcpy(Is[0].data, pDevicePaddedI, + cudaMemcpy(Is[0], pDevicePaddedI, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0); auto start_cp_dx = now(); - Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3); + Ixs[0] = new float[oPadSize.height * oPadSize.width * channels]; checkCudaErrors( - cudaMemcpy(Ixs[0].data, pDevicePaddedIx, + cudaMemcpy(Ixs[0], pDevicePaddedIx, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx); auto start_cp_dy = now(); - Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3); + Iys[0] = new float[oPadSize.height * oPadSize.width * channels]; checkCudaErrors( - cudaMemcpy(Iys[0].data, pDevicePaddedIy, + cudaMemcpy(Iys[0], pDevicePaddedIy, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy); @@ -266,6 +260,8 @@ namespace cu { pDevicePaddedI, nDstStep, oPadSize, padding, padding) ); // Pad dx, dy + checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) ); + checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIx, nSrcStep, oSize, @@ -277,28 +273,27 @@ namespace cu { // Allocate host destinations auto start_host_alloc = now(); - Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3); - Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3); - Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3); + Is[i] = new float[oPadSize.width * oPadSize.height * channels]; + Ixs[i] = new float[oPadSize.width * oPadSize.height * channels]; + Iys[i] = new float[oPadSize.width * oPadSize.height * channels]; compute_time += calc_print_elapsed("host alloc", start_host_alloc); // Copy over data auto start_cp = now(); checkCudaErrors( - cudaMemcpy(Is[i].data, pDevicePaddedI, + cudaMemcpy(Is[i], pDevicePaddedI, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); checkCudaErrors( - cudaMemcpy(Ixs[i].data, pDevicePaddedIx, + cudaMemcpy(Ixs[i], pDevicePaddedIx, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); checkCudaErrors( - cudaMemcpy(Iys[i].data, pDevicePaddedIy, + cudaMemcpy(Iys[i], pDevicePaddedIy, oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp); } // Clean up - cudaFree(pDeviceI); cudaFree(pDeviceIx); cudaFree(pDeviceIy); cudaFree(pDevicePaddedI); diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h index 3d529eb..c46d6ad 100644 --- a/src/kernels/pyramid.h +++ b/src/kernels/pyramid.h @@ -29,8 +29,8 @@ namespace cu { void constructImgPyramids( - const cv::Mat& I, - cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys, + Npp32f* I, float** Is, float** Ixs, float** Iys, + int width, int height, int padding, int nLevels); } diff --git a/src/oflow.cpp b/src/oflow.cpp index 9106a8a..5f37d5b 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -12,6 +12,8 @@ #include // needed for verbosity >= 3, DISVISUAL #include // needed for verbosity >= 3, DISVISUAL +#include + #include // timeof day #include @@ -60,58 +62,21 @@ namespace OFC { I1xs = new float*[op.coarsest_scale+1]; I1ys = new float*[op.coarsest_scale+1]; - I0_mats = new cv::Mat[op.coarsest_scale+1]; - I1_mats = new cv::Mat[op.coarsest_scale+1]; - I0x_mats = new cv::Mat[op.coarsest_scale+1]; - I0y_mats = new cv::Mat[op.coarsest_scale+1]; - I1x_mats = new cv::Mat[op.coarsest_scale+1]; - I1y_mats = new cv::Mat[op.coarsest_scale+1]; - } - void OFClass::ConstructImgPyramids() { + void OFClass::ConstructImgPyramids(img_params iparams) { // Timing structures struct timeval start_time, end_time; gettimeofday(&start_time, NULL); // Construct image and gradient pyramides - cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1); - cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1); - - auto start_pad = now(); - - // Pad images - for (int i = 0; i <= op.coarsest_scale; ++i) { - - // Replicate padding for images - // cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, true); - // cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, true); - I0s[i] = (float*) I0_mats[i].data; - I1s[i] = (float*) I1_mats[i].data; - - // Zero pad for gradients - // cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, false); - // cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, false); - // cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, false); - // cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size, - // op.patch_size, op.patch_size, false); - - I0xs[i] = (float*) I0x_mats[i].data; - I0ys[i] = (float*) I0y_mats[i].data; - I1xs[i] = (float*) I1x_mats[i].data; - I1ys[i] = (float*) I1y_mats[i].data; - - } - - calc_print_elapsed("pad images", start_pad); + cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height, + op.patch_size, op.coarsest_scale + 1); + cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height, + op.patch_size, op.coarsest_scale + 1); // Timing, image gradients and pyramid if (op.verbosity > 1) { @@ -126,16 +91,15 @@ namespace OFC { - void OFClass::calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow) { + void OFClass::calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow) { I0 = _I0; I1 = _I1; - std::cout << "I0 " << I0.size() << " channels: " << I0.channels() - << " type: " << I0.type() << std::endl; + std::cout << "I0 " << _iparams.height << "x" << _iparams.width << std::endl; printf("Constructing pyramids\n"); - ConstructImgPyramids(); + ConstructImgPyramids(_iparams); if (op.verbosity > 1) cout << ", cflow " << endl; @@ -185,7 +149,6 @@ namespace OFC { } - // Timing, Grid memory allocation if (op.verbosity>1) { diff --git a/src/oflow.h b/src/oflow.h index a05855c..0c01ddb 100644 --- a/src/oflow.h +++ b/src/oflow.h @@ -3,6 +3,8 @@ #ifndef OFC_HEADER #define OFC_HEADER +#include + #include #include #include @@ -76,19 +78,16 @@ namespace OFC { public: OFClass(opt_params _op); - void calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow); + void calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow); private: - void ConstructImgPyramids(); + void ConstructImgPyramids(img_params iparams); - cv::Mat I0, I1; + Npp32f* I0, * I1; float ** I0s, ** I0xs, ** I0ys; float ** I1s, ** I1xs, ** I1ys; - cv::Mat * I0_mats, * I0x_mats, * I0y_mats; - cv::Mat * I1_mats, * I1x_mats, * I1y_mats; - opt_params op; // Struct for optimization parameters std::vector iparams; // Struct (for each scale) for image parameter diff --git a/src/run_dense.cpp b/src/run_dense.cpp index fece28f..24f3c82 100644 --- a/src/run_dense.cpp +++ b/src/run_dense.cpp @@ -9,12 +9,14 @@ #include "oflow.h" #include "kernels/warmup.h" #include "kernels/pad.h" +#include "common/timer.h" // CUDA #include using namespace std; using namespace OFC; +using namespace timer; // Save a Depth/OF/SF as .flo file void SaveFlowFile(cv::Mat& img, const char* filename) { @@ -127,9 +129,32 @@ int main( int argc, char** argv ) { // Load images I0_mat = cv::imread(I0_file, CV_LOAD_IMAGE_COLOR); // Read the file I1_mat = cv::imread(I1_file, CV_LOAD_IMAGE_COLOR); // Read the file + int width_org = I0_mat.size().width; // unpadded original image size int height_org = I0_mat.size().height; // unpadded original image size + // convert to float + I0_mat.convertTo(I0_fmat, CV_32F); + I1_mat.convertTo(I1_fmat, CV_32F); + + int channels = 3; + int elemSize = channels * sizeof(Npp32f); + + /* MEMCOPY to CUDA */ + Npp32f* I0, *I1; + auto start_cuda_malloc = now(); + checkCudaErrors( cudaMalloc((void**) &I0, width_org * height_org * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &I1, width_org * height_org * elemSize) ); + calc_print_elapsed("I0, I1 cudaMalloc", start_cuda_malloc); + + auto start_memcpy_hd = now(); + checkCudaErrors( + cudaMemcpy(I0, (float*) I0_fmat.data, width_org * height_org * elemSize, cudaMemcpyHostToDevice) ); + checkCudaErrors( + cudaMemcpy(I1, (float*) I1_fmat.data, width_org * height_org * elemSize, cudaMemcpyHostToDevice) ); + calc_print_elapsed("cudaMemcpy I0, I1 H->D", start_memcpy_hd); + + // Parse rest of parameters opt_params op; @@ -196,9 +221,6 @@ int main( int argc, char** argv ) { } - // convert to float - I0_mat.convertTo(I0_fmat, CV_32F); - I1_mat.convertTo(I1_fmat, CV_32F); // Pad image such that width and height are restless divisible on all scales (except last) int padw = 0, padh = 0; @@ -209,20 +231,29 @@ int main( int argc, char** argv ) { if (div > 0) padh = max_scale - div; if (padh > 0 || padw > 0) { - - cu::pad(I0_fmat, I0_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f), + Npp32f* I0Padded = cu::pad( + I0, width_org, height_org, floor((float) padh / 2.0f), ceil((float) padh / 2.0f), floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true); - cu::pad(I1_fmat, I1_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f), + + Npp32f* I1Padded = cu::pad( + I1, width_org, height_org, floor((float) padh / 2.0f), ceil((float) padh / 2.0f), floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true); + cudaFree(I0); + cudaFree(I1); + + I0 = I0Padded; + I1 = I1Padded; } + + // Create image paramaters img_params iparams; // padded image size, ensures divisibility by 2 on all scales (except last) - iparams.width = I0_fmat.size().width; - iparams.height = I0_fmat.size().height; + iparams.width = width_org + padw; + iparams.height = height_org + padh; iparams.padding = op.patch_size; @@ -243,7 +274,7 @@ int main( int argc, char** argv ) { float scale_fact = pow(2, op.finest_scale); cv::Mat flow_mat(iparams.height / scale_fact , iparams.width / scale_fact, CV_32FC2); // Optical Flow - ofc.calc(I0_fmat, I1_fmat, iparams, nullptr, (float*) flow_mat.data); + ofc.calc(I0, I1, iparams, nullptr, (float*) flow_mat.data); if (op.verbosity > 1) gettimeofday(&start_time, NULL); diff --git a/src/sandbox/process_pad.cpp b/src/sandbox/process_pad.cpp index e1253e4..9f21a9d 100644 --- a/src/sandbox/process_pad.cpp +++ b/src/sandbox/process_pad.cpp @@ -53,7 +53,8 @@ void process(const char* input_file, const char* output_file) { auto start_resize = now(); - cu::pad(I0_f, I1_f, 20, 50, 100, 200, true); + I1_f.data = (uchar*) cu::pad((float*) I0_f.data, I0_f.size().width, I0_f.size().height, + 20, 50, 100, 200, true); calc_print_elapsed("resize", start_resize);