Merge branch 'pad' into pyramid

Conflicts: src/CMakeLists.txt
zhaorz · May 4, 2017 · 5ac4469 · 5ac4469
2 parents eb3bbf2 + 90c256e
commit 5ac4469
Show file tree

Hide file tree

Showing 6 changed files with 280 additions and 22 deletions.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -46,6 +46,7 @@ set(KERNELS
   kernels/warmup.cpp
   kernels/sobel.cpp
   kernels/pyramid.cpp
+  kernels/pad.cpp
   kernels/resizeGrad.cpp
   kernels/resize.cpp)
 
@@ -70,9 +71,10 @@ target_link_libraries(flow ${OpenCV_LIBS})
 
 # CUDA sandbox
 set(SANDBOX_FILES
-  sandbox/process_sobel.cpp
+  # sandbox/process_sobel.cpp
   # sandbox/process_resize.cpp
   # sandbox/process_resizeGrad.cpp
+  sandbox/process_pad.cpp
   # sandbox/RgbMatTest.cpp
   sandbox/sandbox.cpp)
 cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})

diff --git a/src/kernels/pad.cpp b/src/kernels/pad.cpp
@@ -0,0 +1,133 @@
+/**
+ * Implements a pad kernel
+ */
+
+// System
+#include <iostream>
+#include <chrono>
+#include <string>
+#include <stdexcept>
+
+// OpenCV
+#include <opencv2/opencv.hpp>
+
+// CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// NVIDIA Perf Primitives
+#include <nppi.h>
+#include <nppi_filtering_functions.h>
+
+// Local
+#include "../common/timer.h"
+
+#include "pad.h"
+
+using namespace timer;
+
+namespace cu {
+
+  /**
+   * Perform border padding with constant (0) or replication on src and store it in dest.
+   * Accepts 3-channel 32-bit float matrices.
+   *
+   * Params:
+   *   src          input image.
+   *   dst          output image;
+   *                it has size of src + padding
+   *   top          top padding
+   *   bottom       bottom padding
+   *   left         left padding
+   *   right        right padding
+   *   replicate    whether to replicate or constant
+   */
+  void pad(
+      const cv::Mat& src, cv::Mat& dest, int top,
+      int bottom, int left, int right, bool replicate) {
+
+    if (src.type() != CV_32FC3) {
+      throw std::invalid_argument("pad: invalid input matrix type");
+    }
+
+    // Compute time of relevant kernel
+    double compute_time = 0.0;
+
+    // CV_32FC3 is made up of RGB floats
+    int channels = 3;
+    size_t elemSize = 3 * sizeof(float);
+
+    cv::Size sz = src.size();
+    int width   = sz.width;
+    int height  = sz.height;
+    int destWidth = left + width + right;
+    int destHeight = top + height + bottom;
+
+    std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl;
+
+    // pSrc pointer to image data
+    Npp32f* pHostSrc = (float*) src.data;
+
+    // The width, in bytes, of the image, sometimes referred to as pitch
+    unsigned int nSrcStep = width * elemSize;
+    unsigned int nDstStep = destWidth * elemSize;
+
+    NppiSize oSrcSizeROI = { width, height };
+    // NppiSize oDestSizeROI = { left + width + right, top + height + bottom};
+    NppiSize oDstSizeROI = { destWidth, destHeight };
+    const Npp32f padVal[3] = {0.0, 0.0, 0.0};
+
+    auto start_cuda_malloc = now();
+
+    // Allocate device memory
+    Npp32f* pDeviceSrc, *pDeviceDst;
+
+    checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight *  elemSize) );
+    checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );
+
+    calc_print_elapsed("cudaMalloc", start_cuda_malloc);
+
+
+    auto start_memcpy_hd = now();
+
+    // Copy image to device
+    checkCudaErrors(
+        cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) );
+
+    calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd);
+
+
+    auto start_pad = now();
+
+    NPP_CHECK_NPP(
+
+        (replicate)
+        ? nppiCopyReplicateBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSizeROI, pDeviceDst,
+            nDstStep, oDstSizeROI, top, left)
+        : nppiCopyConstBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSizeROI, pDeviceDst,
+            nDstStep, oDstSizeROI, top, left, padVal)
+
+        );
+
+    compute_time += calc_print_elapsed("pad", start_pad);
+
+
+    auto start_memcpy_dh = now();
+
+    // Copy result to host
+    dest.create(destHeight, destWidth, CV_32FC3);
+
+    checkCudaErrors(
+        cudaMemcpy(dest.data, pDeviceDst,
+          destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) );
+
+    calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh);
+
+    cudaFree((void*) pDeviceSrc);
+    cudaFree((void*) pDeviceDst);
+
+    std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl;
+  }
+
+}
diff --git a/src/kernels/pad.h b/src/kernels/pad.h
@@ -0,0 +1,52 @@
+/**
+ * Implements a pad kernel
+ */
+
+#ifndef __KERNEL_PAD_H__
+#define __KERNEL_PAD_H__
+
+// System
+#include <iostream>
+#include <chrono>
+#include <string>
+#include <stdexcept>
+
+// OpenCV
+#include <opencv2/opencv.hpp>
+
+// CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// NVIDIA Perf Primitives
+#include <nppi.h>
+#include <nppi_filtering_functions.h>
+
+// Local
+#include "../common/Exceptions.h"
+#include "../common/timer.h"
+#include "../sandbox/process.h"
+
+namespace cu {
+
+  /**
+   * Perform border padding with constant (0) or replication on src and store it in dest.
+   * Accepts 3-channel 32-bit float matrices.
+   *
+   * Params:
+   *   src        input image.
+   *   dst        output image;
+   *              it has size of src + padding
+   *   top        top padding
+   *   bottom     bottom padding
+   *   left       left padding
+   *   right      right padding
+   *   replicate  whether to replicate or constant pad
+   */
+  void pad(
+      const cv::Mat& src, cv::Mat& dest, int top,
+      int bottom, int left, int right, bool replicate);
+
+}
+
+#endif // end __KERNEL_PAD_H__
diff --git a/src/oflow.cpp b/src/oflow.cpp
@@ -20,6 +20,7 @@
 #include "refine_variational.h"
 
 #include "kernels/resize.h"
+#include "kernels/pad.h"
 #include "kernels/resizeGrad.h"
 #include "kernels/sobel.h"
 #include "kernels/pyramid.h"
@@ -86,22 +87,22 @@ namespace OFC {
     for (int i = 0; i <= op.coarsest_scale; ++i) {
 
       // Replicate padding for images
-      copyMakeBorder(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_REPLICATE);
-      copyMakeBorder(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_REPLICATE);
+      cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, true);
+      cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, true);
       I0s[i] = (float*) I0_mats[i].data;
       I1s[i] = (float*) I1_mats[i].data;
 
       // Zero pad for gradients
-      copyMakeBorder(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
-      copyMakeBorder(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
-      copyMakeBorder(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
-      copyMakeBorder(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
+      cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, false);
+      cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, false);
+      cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, false);
+      cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
+          op.patch_size, op.patch_size, false);
 
       I0xs[i] = (float*) I0x_mats[i].data;
       I0ys[i] = (float*) I0y_mats[i].data;

diff --git a/src/run_dense.cpp b/src/run_dense.cpp
@@ -8,6 +8,7 @@
 
 #include "oflow.h"
 #include "kernels/warmup.h"
+#include "kernels/pad.h"
 
 // CUDA
 #include <cuda_runtime.h>
@@ -195,6 +196,10 @@ int main( int argc, char** argv ) {
   }
 
 
+  // convert to float
+  I0_mat.convertTo(I0_fmat, CV_32F);
+  I1_mat.convertTo(I1_fmat, CV_32F);
+
   // Pad image such that width and height are restless divisible on all scales (except last)
   int padw = 0, padh = 0;
   int max_scale = pow(2, op.coarsest_scale); // enforce restless division by this number on coarsest scale
@@ -205,24 +210,21 @@ int main( int argc, char** argv ) {
 
   if (padh > 0 || padw > 0) {
 
-    copyMakeBorder(I0_mat, I0_mat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
-        floor((float) padw / 2.0f), ceil((float) padw / 2.0f), cv::BORDER_REPLICATE);
-    copyMakeBorder(I1_mat, I1_mat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
-        floor((float) padw / 2.0f), ceil((float) padw / 2.0f), cv::BORDER_REPLICATE);
+    cu::pad(I0_fmat, I0_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
+        floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);
+    cu::pad(I1_fmat, I1_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
+        floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);
 
   }
 
   // Create image paramaters
   img_params iparams;
 
   // padded image size, ensures divisibility by 2 on all scales (except last)
-  iparams.width = I0_mat.size().width;
-  iparams.height = I0_mat.size().height;
+  iparams.width = I0_fmat.size().width;
+  iparams.height = I0_fmat.size().height;
   iparams.padding = op.patch_size;
 
-  // convert to float
-  I0_mat.convertTo(I0_fmat, CV_32F);
-  I1_mat.convertTo(I1_fmat, CV_32F);
 
   // Timing, image loading
   if (op.verbosity > 1) {

diff --git a/src/sandbox/process_pad.cpp b/src/sandbox/process_pad.cpp
@@ -0,0 +1,68 @@
+/**
+ * Test a pad kernel
+ */
+
+// System
+#include <iostream>
+#include <chrono>
+#include <string>
+#include <stdexcept>
+
+// OpenCV
+#include <opencv2/opencv.hpp>
+
+// CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// NVIDIA Perf Primitives
+#include <nppi.h>
+#include <nppi_filtering_functions.h>
+
+// Local
+#include "../common/timer.h"
+#include "process.h"
+
+#include "../kernels/pad.h"
+
+using namespace timer;
+
+void process(const char* input_file, const char* output_file) {
+  cv::Mat I0, I0_f, I1_f;
+
+  auto start_read = now();
+
+  // Get input
+  I0 = cv::imread(input_file, CV_LOAD_IMAGE_COLOR);
+
+  calc_print_elapsed("imread", start_read);
+
+  // Check for invalid input
+  if(!I0.data) {
+    std::cout <<  "Could not open or find the image" << std::endl ;
+    exit(1);
+  }
+
+  auto start_convert = now();
+
+  // Convert to float
+  I0.convertTo(I0_f, CV_32FC3);
+
+  calc_print_elapsed("convertTo float", start_convert);
+
+
+  auto start_resize = now();
+
+  cu::pad(I0_f, I1_f, 20, 50, 100, 200, true);
+
+  calc_print_elapsed("resize", start_resize);
+
+
+  auto start_write = now();
+
+  // Write output
+  cv::imwrite(output_file, I1_f);
+
+  calc_print_elapsed("write", start_write);
+}
+