Remove cv::Mat's from pyramid kernel

zhaorz · May 4, 2017 · a353098 · a353098
1 parent 238b44e
commit a353098
Show file tree

Hide file tree

Showing 8 changed files with 100 additions and 143 deletions.
diff --git a/src/kernels/pad.cpp b/src/kernels/pad.cpp
@@ -29,8 +29,8 @@ using namespace timer;
 namespace cu {
 
   /**
-   * Perform border padding with constant (0) or replication on src and store it in dest.
-   * Accepts 3-channel 32-bit float matrices.
+   * Perform border padding with constant (0) or replication on src.
+   * Accepts 3-channel 32-bit float matrices. Returns pointer to device dest.
    *
    * Params:
    *   src          input image.
@@ -42,13 +42,10 @@ namespace cu {
    *   right        right padding
    *   replicate    whether to replicate or constant
    */
-  void pad(
-      const cv::Mat& src, cv::Mat& dest, int top,
-      int bottom, int left, int right, bool replicate) {
-
-    if (src.type() != CV_32FC3) {
-      throw std::invalid_argument("pad: invalid input matrix type");
-    }
+  Npp32f* pad(
+      Npp32f* src,
+      int width, int height,
+      int top, int bottom, int left, int right, bool replicate) {
 
     // Compute time of relevant kernel
     double compute_time = 0.0;
@@ -57,17 +54,11 @@ namespace cu {
     int channels = 3;
     size_t elemSize = 3 * sizeof(float);
 
-    cv::Size sz = src.size();
-    int width   = sz.width;
-    int height  = sz.height;
     int destWidth = left + width + right;
     int destHeight = top + height + bottom;
 
     std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl;
 
-    // pSrc pointer to image data
-    Npp32f* pHostSrc = (float*) src.data;
-
     // The width, in bytes, of the image, sometimes referred to as pitch
     unsigned int nSrcStep = width * elemSize;
     unsigned int nDstStep = destWidth * elemSize;
@@ -77,26 +68,15 @@ namespace cu {
     NppiSize oDstSizeROI = { destWidth, destHeight };
     const Npp32f padVal[3] = {0.0, 0.0, 0.0};
 
-    auto start_cuda_malloc = now();
-
     // Allocate device memory
-    Npp32f* pDeviceSrc, *pDeviceDst;
-
-    checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) );
+    auto start_cuda_malloc = now();
+    Npp32f* pDeviceDst;
     checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight *  elemSize) );
-    checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
-
+    if (!replicate)
+      checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
     calc_print_elapsed("cudaMalloc", start_cuda_malloc);
 
-
-    auto start_memcpy_hd = now();
-
-    // Copy image to device
-    checkCudaErrors(
-        cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) );
-
-    calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd);
-
+    Npp32f* pDeviceSrc = src;
 
     auto start_pad = now();
 
@@ -112,22 +92,9 @@ namespace cu {
 
     compute_time += calc_print_elapsed("pad", start_pad);
 
-
-    auto start_memcpy_dh = now();
-
-    // Copy result to host
-    dest.create(destHeight, destWidth, CV_32FC3);
-
-    checkCudaErrors(
-        cudaMemcpy(dest.data, pDeviceDst,
-          destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) );
-
-    calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh);
-
-    cudaFree((void*) pDeviceSrc);
-    cudaFree((void*) pDeviceDst);
-
     std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl;
+
+    return pDeviceDst;
   }
 
 }
diff --git a/src/kernels/pad.h b/src/kernels/pad.h
@@ -30,8 +30,8 @@
 namespace cu {
 
   /**
-   * Perform border padding with constant (0) or replication on src and store it in dest.
-   * Accepts 3-channel 32-bit float matrices.
+   * Perform border padding with constant (0) or replication on src.
+   * Accepts 3-channel 32-bit float matrices. Returns pointer to dest
    *
    * Params:
    *   src        input image.
@@ -43,9 +43,10 @@ namespace cu {
    *   right      right padding
    *   replicate  whether to replicate or constant pad
    */
-  void pad(
-      const cv::Mat& src, cv::Mat& dest, int top,
-      int bottom, int left, int right, bool replicate);
+  Npp32f *pad(
+      Npp32f* src,
+      int width, int height,
+      int top, int bottom, int left, int right, bool replicate);
 
 }
 

diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp
@@ -30,8 +30,8 @@ using namespace timer;
 namespace cu {
 
   void constructImgPyramids(
-      const cv::Mat& I,
-      cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
+      Npp32f* src, float** Is, float** Ixs, float** Iys,
+      int width, int height,
       int padding, int nLevels) {
 
     // Timing
@@ -42,13 +42,6 @@ namespace cu {
     int channels = 3;
     size_t elemSize = channels * sizeof(float);
 
-    // Setup
-    Is[0] = I.clone();
-
-    cv::Size sz = I.size();
-    int width   = sz.width;
-    int height  = sz.height;
-
     unsigned int nSrcStep = width * elemSize;
 
     // Gradient params
@@ -77,13 +70,14 @@ namespace cu {
     std::cout << "[start] constructImgPyramids: processing "
       << width << "x" << height << " image" << std::endl;
 
+    Npp32f* pDeviceI = src;
+
     // Allocate device memory (to account for padding too
     auto start_cuda_malloc = now();
-    Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy;
+    Npp32f *pDeviceIx, *pDeviceIy;
     Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
     Npp32f *pDeviceTmp, *pDeviceKernel;
 
-    checkCudaErrors( cudaMalloc((void**) &pDeviceI,  width * height * elemSize) );
     checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
     checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
 
@@ -99,12 +93,10 @@ namespace cu {
     // Copy over initial image and kernel
     auto start_memcpy_hd = now();
 
-    checkCudaErrors(
-        cudaMemcpy(pDeviceI, (float*) Is[0].data, width * height * elemSize, cudaMemcpyHostToDevice) );
     checkCudaErrors(
         cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );
 
-    calc_print_elapsed("cudaMemcpy I[0] H->D", start_memcpy_hd);
+    calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////
     // Apply first gradients to Is[0]
@@ -153,6 +145,8 @@ namespace cu {
           pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
 
     // Pad dx, dy
+    checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
+    checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
     NPP_CHECK_NPP(
         nppiCopyConstBorder_32f_C3R (
           pDeviceIx, nSrcStep, oSize,
@@ -167,23 +161,23 @@ namespace cu {
     ////////////////////////////////////////////////////////////////////////////////////////////////
 
     auto start_cp_Is0 = now();
-    Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Is[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Is[0].data, pDevicePaddedI,
+        cudaMemcpy(Is[0], pDevicePaddedI,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
 
     auto start_cp_dx = now();
-    Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Ixs[0].data, pDevicePaddedIx,
+        cudaMemcpy(Ixs[0], pDevicePaddedIx,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
 
     auto start_cp_dy = now();
-    Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
     checkCudaErrors(
-        cudaMemcpy(Iys[0].data, pDevicePaddedIy,
+        cudaMemcpy(Iys[0], pDevicePaddedIy,
           oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
 
@@ -266,6 +260,8 @@ namespace cu {
             pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
 
       // Pad dx, dy
+      checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
+      checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
       NPP_CHECK_NPP(
           nppiCopyConstBorder_32f_C3R (
             pDeviceIx, nSrcStep, oSize,
@@ -277,28 +273,27 @@ namespace cu {
 
       // Allocate host destinations
       auto start_host_alloc = now();
-      Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
-      Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
-      Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
+      Is[i] = new float[oPadSize.width * oPadSize.height * channels];
+      Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
+      Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
       compute_time += calc_print_elapsed("host alloc", start_host_alloc);
 
       // Copy over data
       auto start_cp = now();
       checkCudaErrors(
-          cudaMemcpy(Is[i].data, pDevicePaddedI,
+          cudaMemcpy(Is[i], pDevicePaddedI,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Ixs[i].data, pDevicePaddedIx,
+          cudaMemcpy(Ixs[i], pDevicePaddedIx,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Iys[i].data, pDevicePaddedIy,
+          cudaMemcpy(Iys[i], pDevicePaddedIy,
             oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
 
     }
 
     // Clean up
-    cudaFree(pDeviceI);
     cudaFree(pDeviceIx);
     cudaFree(pDeviceIy);
     cudaFree(pDevicePaddedI);

diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h
@@ -29,8 +29,8 @@
 namespace cu {
 
   void constructImgPyramids(
-      const cv::Mat& I,
-      cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
+      Npp32f* I, float** Is, float** Ixs, float** Iys,
+      int width, int height,
       int padding, int nLevels);
 
 }

diff --git a/src/oflow.cpp b/src/oflow.cpp
@@ -12,6 +12,8 @@
 #include <opencv2/highgui/highgui.hpp> // needed for verbosity >= 3, DISVISUAL
 #include <opencv2/imgproc/imgproc.hpp> // needed for verbosity >= 3, DISVISUAL
 
+#include <nppi.h>
+
 #include <sys/time.h>    // timeof day
 #include <stdio.h>
 
@@ -60,58 +62,21 @@ namespace OFC {
     I1xs = new float*[op.coarsest_scale+1];
     I1ys = new float*[op.coarsest_scale+1];
 
-    I0_mats = new cv::Mat[op.coarsest_scale+1];
-    I1_mats = new cv::Mat[op.coarsest_scale+1];
-    I0x_mats = new cv::Mat[op.coarsest_scale+1];
-    I0y_mats = new cv::Mat[op.coarsest_scale+1];
-    I1x_mats = new cv::Mat[op.coarsest_scale+1];
-    I1y_mats = new cv::Mat[op.coarsest_scale+1];
-
   }
 
 
 
-  void OFClass::ConstructImgPyramids() {
+  void OFClass::ConstructImgPyramids(img_params iparams) {
 
     // Timing structures
     struct timeval start_time, end_time;
     gettimeofday(&start_time, NULL);
 
     // Construct image and gradient pyramides
-    cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1);
-    cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1);
-
-    auto start_pad = now();
-
-    // Pad images
-    for (int i = 0; i <= op.coarsest_scale; ++i) {
-
-      // Replicate padding for images
-      // cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, true);
-      // cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, true);
-      I0s[i] = (float*) I0_mats[i].data;
-      I1s[i] = (float*) I1_mats[i].data;
-
-      // Zero pad for gradients
-      // cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-      // cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
-      //     op.patch_size, op.patch_size, false);
-
-      I0xs[i] = (float*) I0x_mats[i].data;
-      I0ys[i] = (float*) I0y_mats[i].data;
-      I1xs[i] = (float*) I1x_mats[i].data;
-      I1ys[i] = (float*) I1y_mats[i].data;
-
-    }
-
-    calc_print_elapsed("pad images", start_pad);
+    cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height,
+        op.patch_size, op.coarsest_scale + 1);
+    cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height,
+        op.patch_size, op.coarsest_scale + 1);
 
     // Timing, image gradients and pyramid
     if (op.verbosity > 1) {
@@ -126,16 +91,15 @@ namespace OFC {
 
 
 
-  void OFClass::calc(cv::Mat _I0, cv::Mat _I1, img_params _iparams, const float * initflow, float * outflow) {
+  void OFClass::calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow) {
 
     I0 = _I0;
     I1 = _I1;
 
-    std::cout << "I0 " << I0.size() << " channels: " << I0.channels()
-      << " type: " << I0.type() << std::endl;
+    std::cout << "I0 " << _iparams.height << "x" << _iparams.width << std::endl;
 
     printf("Constructing pyramids\n");
-    ConstructImgPyramids();
+    ConstructImgPyramids(_iparams);
 
     if (op.verbosity > 1) cout << ", cflow " << endl;
 
@@ -185,7 +149,6 @@ namespace OFC {
 
     }
 
-
     // Timing, Grid memory allocation
     if (op.verbosity>1) {