From 238b44e27b12103e3a5f7855a857172e359b2712 Mon Sep 17 00:00:00 2001
From: Richard Zhao <richardz@andrew.cmu.edu>
Date: Thu, 4 May 2017 14:39:36 -0400
Subject: [PATCH] Put pad into pyramid

---
 src/kernels/pyramid.cpp | 129 +++++++++++++++++++++++++++++++---------
 src/kernels/pyramid.h   |   2 +-
 src/oflow.cpp           |  28 ++++-----
 3 files changed, 115 insertions(+), 44 deletions(-)

diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp
index 0d8466d..79d9a98 100644
--- a/src/kernels/pyramid.cpp
+++ b/src/kernels/pyramid.cpp
@@ -32,7 +32,7 @@ namespace cu {
   void constructImgPyramids(
       const cv::Mat& I,
       cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
-      int nLevels) {
+      int padding, int nLevels) {
 
     // Timing
     auto start_total = now();
@@ -69,16 +69,28 @@ namespace cu {
     double shiftX = 0.0;
     double shiftY = 0.0;
 
+    // Padding params
+    int padWidth = 2 * padding + width;
+    int padHeight = 2 * padding + height;
+    const Npp32f PAD_VAL[3] = { 0.0, 0.0, 0.0 };
+
     std::cout << "[start] constructImgPyramids: processing "
       << width << "x" << height << " image" << std::endl;
 
-    // Allocate device memory
+    // Allocate device memory (to account for padding too
     auto start_cuda_malloc = now();
-    Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy, *pDeviceTmp, *pDeviceKernel;
+    Npp32f *pDeviceI, *pDeviceIx, *pDeviceIy;
+    Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
+    Npp32f *pDeviceTmp, *pDeviceKernel;
+
+    checkCudaErrors( cudaMalloc((void**) &pDeviceI,  width * height * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
+
+    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI,  padWidth * padHeight * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) );
+    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) );
 
-    checkCudaErrors( cudaMalloc((void**) &pDeviceI,      width * height * elemSize)  );
-    checkCudaErrors( cudaMalloc((void**) &pDeviceIx,     width * height * elemSize)  );
-    checkCudaErrors( cudaMalloc((void**) &pDeviceIy,     width * height * elemSize)  );
     checkCudaErrors( cudaMalloc((void**) &pDeviceTmp,    width * height * elemSize)  );
     checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );
 
@@ -112,13 +124,6 @@ namespace cu {
         );
     compute_time += calc_print_elapsed("sobel: Ixs[0]", start_dx);
 
-    auto start_cp_dx = now();
-    Ixs[0].create(height, width, CV_32FC3);
-    checkCudaErrors(
-        cudaMemcpy(Ixs[0].data, pDeviceIx,
-          width * height * elemSize, cudaMemcpyDeviceToHost) );
-    compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
-
     // dy's
     auto start_dy = now();
     NPP_CHECK_NPP(
@@ -133,11 +138,53 @@ namespace cu {
         );
     compute_time += calc_print_elapsed("sobel: Iys[0]", start_dy);
 
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // Pad Is[0] I, dx, dy
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+    NppiSize oPadSize = { padWidth, padHeight };
+    int nDstStep = oPadSize.width * elemSize;
+
+    // Pad original
+    NPP_CHECK_NPP(
+        nppiCopyReplicateBorder_32f_C3R (
+          pDeviceI, nSrcStep, oSize,
+          pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+
+    // Pad dx, dy
+    NPP_CHECK_NPP(
+        nppiCopyConstBorder_32f_C3R (
+          pDeviceIx, nSrcStep, oSize,
+          pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+    NPP_CHECK_NPP(
+        nppiCopyConstBorder_32f_C3R (
+          pDeviceIy, nSrcStep, oSize,
+          pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // Copy Is[0] I, dx, dy
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto start_cp_Is0 = now();
+    Is[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    checkCudaErrors(
+        cudaMemcpy(Is[0].data, pDevicePaddedI,
+          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
+    compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
+
+    auto start_cp_dx = now();
+    Ixs[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
+    checkCudaErrors(
+        cudaMemcpy(Ixs[0].data, pDevicePaddedIx,
+          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
+    compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
+
     auto start_cp_dy = now();
-    Iys[0].create(height, width, CV_32FC3);
+    Iys[0].create(oPadSize.height, oPadSize.width, CV_32FC3);
     checkCudaErrors(
-        cudaMemcpy(Iys[0].data, pDeviceIy,
-          width * height * elemSize, cudaMemcpyDeviceToHost) );
+        cudaMemcpy(Iys[0].data, pDevicePaddedIy,
+          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
     compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
 
 
@@ -203,28 +250,49 @@ namespace cu {
         );
       compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);
 
-      // Allocate host destinations
-      auto start_host_alloc = now();
-      Is[i].create(dstRect.height, dstRect.width, CV_32FC3);
-      Ixs[i].create(dstRect.height, dstRect.width, CV_32FC3);
-      Iys[i].create(dstRect.height, dstRect.width, CV_32FC3);
+      //////////////////////////////////////////////////////////////////////////////////////////////
+      // Pad I, dx, dy
+      //////////////////////////////////////////////////////////////////////////////////////////////
+
+      padWidth  = width + 2 * padding;
+      padHeight = height + 2 * padding;
+      NppiSize oPadSize = { padWidth, padHeight };
+      nDstStep = oPadSize.width * elemSize;
+
+      // Pad original
+      NPP_CHECK_NPP(
+          nppiCopyReplicateBorder_32f_C3R (
+            pDeviceI, nSrcStep, oSize,
+            pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
 
-      std::cout << "Is[" << i << "]: "  << Is[i].size() << " channels: " << Is[i].channels()
-        << " type: " << Is[i].type() << std::endl;
+      // Pad dx, dy
+      NPP_CHECK_NPP(
+          nppiCopyConstBorder_32f_C3R (
+            pDeviceIx, nSrcStep, oSize,
+            pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+      NPP_CHECK_NPP(
+          nppiCopyConstBorder_32f_C3R (
+            pDeviceIy, nSrcStep, oSize,
+            pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
+      // Allocate host destinations
+      auto start_host_alloc = now();
+      Is[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
+      Ixs[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
+      Iys[i].create(oPadSize.height, oPadSize.width, CV_32FC3);
       compute_time += calc_print_elapsed("host alloc", start_host_alloc);
 
       // Copy over data
       auto start_cp = now();
       checkCudaErrors(
-          cudaMemcpy(Is[i].data, pDeviceI,
-            dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
+          cudaMemcpy(Is[i].data, pDevicePaddedI,
+            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Ixs[i].data, pDeviceIx,
-            dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
+          cudaMemcpy(Ixs[i].data, pDevicePaddedIx,
+            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       checkCudaErrors(
-          cudaMemcpy(Iys[i].data, pDeviceIy,
-            dstRect.width * dstRect.height * elemSize, cudaMemcpyDeviceToHost) );
+          cudaMemcpy(Iys[i].data, pDevicePaddedIy,
+            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
       compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
 
     }
@@ -233,6 +301,9 @@ namespace cu {
     cudaFree(pDeviceI);
     cudaFree(pDeviceIx);
     cudaFree(pDeviceIy);
+    cudaFree(pDevicePaddedI);
+    cudaFree(pDevicePaddedIx);
+    cudaFree(pDevicePaddedIy);
     cudaFree(pDeviceTmp);
     cudaFree(pDeviceKernel);
 
diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h
index 0b809d1..3d529eb 100644
--- a/src/kernels/pyramid.h
+++ b/src/kernels/pyramid.h
@@ -31,7 +31,7 @@ namespace cu {
   void constructImgPyramids(
       const cv::Mat& I,
       cv::Mat* Is, cv::Mat* Ixs, cv::Mat* Iys,
-      int nLevels);
+      int padding, int nLevels);
 
 }
 
diff --git a/src/oflow.cpp b/src/oflow.cpp
index e6155db..9106a8a 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -78,8 +78,8 @@ namespace OFC {
     gettimeofday(&start_time, NULL);
 
     // Construct image and gradient pyramides
-    cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.coarsest_scale + 1);
-    cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.coarsest_scale + 1);
+    cu::constructImgPyramids(I0, I0_mats, I0x_mats, I0y_mats, op.patch_size, op.coarsest_scale + 1);
+    cu::constructImgPyramids(I1, I1_mats, I1x_mats, I1y_mats, op.patch_size, op.coarsest_scale + 1);
 
     auto start_pad = now();
 
@@ -87,22 +87,22 @@ namespace OFC {
     for (int i = 0; i <= op.coarsest_scale; ++i) {
 
       // Replicate padding for images
-      cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, true);
-      cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, true);
+      // cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, true);
+      // cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, true);
       I0s[i] = (float*) I0_mats[i].data;
       I1s[i] = (float*) I1_mats[i].data;
 
       // Zero pad for gradients
-      cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, false);
-      cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, false);
-      cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, false);
-      cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
-          op.patch_size, op.patch_size, false);
+      // cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, false);
+      // cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, false);
+      // cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, false);
+      // cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
+      //     op.patch_size, op.patch_size, false);
 
       I0xs[i] = (float*) I0x_mats[i].data;
       I0ys[i] = (float*) I0y_mats[i].data;