From e9c83cdc7af4823d42c5cb0273e05f225e26ac02 Mon Sep 17 00:00:00 2001
From: Richard Zhao <richardz@andrew.cmu.edu>
Date: Wed, 10 May 2017 21:38:33 +0000
Subject: [PATCH] Add unified memory setup

---
 src/CMakeLists.txt              |  24 ++---
 src/common/cuda_helper.h        |   7 ++
 src/kernels/sobelUnifiedMem.cpp | 164 ++++++++++++++++++++++++++++++++
 src/run_dense.cpp               |   3 +
 4 files changed, 187 insertions(+), 11 deletions(-)
 create mode 100644 src/kernels/sobelUnifiedMem.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d43564f..a7c7cd5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -80,7 +80,8 @@ set(KERNELS
   kernels/interpolate.cu
   kernels/extract.cu
   kernels/warmup.cpp
-  kernels/sobel.cpp
+  # kernels/sobel.cpp
+  kernels/sobelUnifiedMem.cpp
   kernels/pyramid.cpp
   kernels/pad.cpp
   kernels/resizeGrad.cpp
@@ -109,13 +110,14 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") #
 # set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD
 target_link_libraries(flow ${OpenCV_LIBS})
 
-# # CUDA sandbox
-# set(SANDBOX_FILES
-#   # sandbox/process_sobel.cpp
-#   sandbox/process_resize.cpp
-#   # sandbox/process_resizeGrad.cpp
-#   # sandbox/process_pad.cpp
-#   # sandbox/RgbMatTest.cpp
-#   sandbox/sandbox.cpp)
-# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
-# target_link_libraries(sandbox ${OpenCV_LIBS})
+# CUDA sandbox
+set(SANDBOX_FILES
+  # sandbox/process_sobel.cpp
+  sandbox/process_sobel.cpp
+  # sandbox/process_resize.cpp
+  # sandbox/process_resizeGrad.cpp
+  # sandbox/process_pad.cpp
+  # sandbox/RgbMatTest.cpp
+  sandbox/sandbox.cpp)
+cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
+target_link_libraries(sandbox ${OpenCV_LIBS})
diff --git a/src/common/cuda_helper.h b/src/common/cuda_helper.h
index ebd51a1..0cc8e14 100644
--- a/src/common/cuda_helper.h
+++ b/src/common/cuda_helper.h
@@ -517,6 +517,13 @@ inline void initializeCuda(int argc, char** argv) {
   //Get GPU information
   checkCudaErrors(cudaGetDevice(&devID));
   checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+
+  if (props.canMapHostMemory) {
+    if (props.integrated)
+      printf("Device is an integrated GPU\n");
+    printf("Device supports zero copy, enabling page locked memory mapping\n\n");
+    checkCudaErrors( cudaSetDeviceFlags(cudaDeviceMapHost) );
+  }
 }
 
 #endif // end __CUDA_HELPER_H__
diff --git a/src/kernels/sobelUnifiedMem.cpp b/src/kernels/sobelUnifiedMem.cpp
new file mode 100644
index 0000000..dece9de
--- /dev/null
+++ b/src/kernels/sobelUnifiedMem.cpp
@@ -0,0 +1,164 @@
+/**
+ * Implements a sobel kernel
+ */
+
+// System
+#include <iostream>
+#include <chrono>
+#include <string>
+#include <stdexcept>
+#include <cstring>
+
+// OpenCV
+#include <opencv2/opencv.hpp>
+
+// CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// NVIDIA Perf Primitives
+#include <nppi.h>
+#include <nppi_filtering_functions.h>
+
+// Local
+#include "../common/timer.h"
+
+#include "sobel.h"
+
+using namespace timer;
+
+namespace cu {
+
+  /**
+   * Apply a first-order [1, 0, -1] derivative filter to src with NPP, staging
+   * the data in mapped (zero-copy) page-locked host buffers.
+   * dest is (re)allocated here as CV_32FC3 with the size of src.
+   * Throws std::invalid_argument on a wrong input type or derivative order.
+   * Params:
+   *   src     input image; must be CV_32FC3.
+   *   dest    output image of the same size and the same number of channels as src.
+   *   ddepth  (unused) output image depth.
+   *   dx      order of the derivative x. Only 0, 1 supported.
+   *   dy      order of the derivative y. Exactly one of dx, dy must be 1.
+   *   ksize   (unused) size of the extended Sobel kernel; it must be 1, 3, 5, or 7.
+   *   scale   (unused) optional scale factor for the computed derivative values
+   *   delta   (optional|unused) delta value that is added to the results prior to storing them in dst.
+   *   borderType  (unused) pixel extrapolation method, see cv::BorderTypes
+   */
+  void sobel(
+      const cv::Mat& src, cv::Mat& dest, int ddepth, int dx, int dy,
+      int ksize, double scale, double delta, int borderType) {
+
+    if (src.type() != CV_32FC3) {
+      throw std::invalid_argument("sobel: invalid input matrix type");
+    }
+
+    if ( !((dx == 1 && dy == 0) ||
+          (dx == 0 && dy == 1)) ) {
+      throw std::invalid_argument("sobel: only accepts first order derivatives");
+    }
+
+    // Compute time of relevant kernel
+    double compute_time = 0.0;
+    double total_time = 0.0;
+
+    // CV_32FC3 is made up of RGB floats
+    const size_t elemSize = 3 * sizeof(float);
+
+    cv::Size sz = src.size();
+    int width   = sz.width;
+    int height  = sz.height;
+    // Widen before multiplying so a large image cannot overflow int
+    const size_t imageBytes = static_cast<size_t>(width) * height * elemSize;
+
+    std::cout << "[start] sobel: processing " << width << "x" << height << " image" << std::endl;
+
+    // Clone non-continuous (ROI) inputs so the flat memcpy below reads packed rows
+    const cv::Mat srcPacked = src.isContinuous() ? src : src.clone();
+    const Npp32f* pHostSrc = srcPacked.ptr<Npp32f>();
+
+    // The width, in bytes, of the image, sometimes referred to as pitch
+    unsigned int nSrcStep = width * elemSize;
+    unsigned int nDstStep = nSrcStep;
+
+    NppiSize oSrcSize = { width, height };
+    NppiPoint oSrcOffset = { 0, 0 };
+    NppiSize oSizeROI = { width, height };
+
+    // For 1D convolution
+    const Npp32f pKernel[3] = { 1, 0, -1 };
+    Npp32s nMaskSize =  3;
+    Npp32s nAnchor   = 1;  // Kernel is centered over pixel
+    NppiBorderType eBorderType = NPP_BORDER_REPLICATE;
+
+    auto start_cuda_malloc = now();
+
+    // Mapped, page-locked host buffers. The host pointers are handed to NPP
+    // as-is, which presumes unified virtual addressing -- TODO confirm on target
+    Npp32f* pDeviceSrc, *pDeviceDst;
+    checkCudaErrors( cudaHostAlloc((void**) &pDeviceSrc, imageBytes, cudaHostAllocMapped) );
+    checkCudaErrors( cudaHostAlloc((void**) &pDeviceDst, imageBytes, cudaHostAllocMapped) );
+
+    // For custom row/col kernel
+    Npp32f* pDeviceKernel;
+    checkCudaErrors( cudaHostAlloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f), cudaHostAllocMapped) );
+
+    calc_print_elapsed("cudaHostAlloc", start_cuda_malloc);
+
+    auto start_memcpy_hd = now();
+
+    std::memcpy(pDeviceSrc, pHostSrc, imageBytes);
+    std::memcpy(pDeviceKernel, pKernel, nMaskSize * sizeof(Npp32f));
+
+    calc_print_elapsed("std::memcpy H->D", start_memcpy_hd);
+
+    bool useHoriz = (dx == 1);
+
+    auto start_sobel = now();
+
+    NPP_CHECK_NPP(
+        (useHoriz)
+        // For built in sobel
+        // ? nppiFilterPrewittHorizBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType)
+        // : nppiFilterPrewittVertBorder_32f_C3R  (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType)
+
+        // Custom row filter
+        ? nppiFilterRowBorder_32f_C3R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+
+        : nppiFilterColumnBorder_32f_C3R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType)
+
+        // Sobel with mask
+        // ? nppiFilterSobelHorizMaskBorder_32f_C1R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType)
+        // : nppiFilterSobelVertMaskBorder_32f_C1R  (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType)
+        );
+    // Wait for the NPP kernel before timing it and reading its output on the host
+    checkCudaErrors( cudaDeviceSynchronize() );
+    compute_time += calc_print_elapsed("sobel", start_sobel);
+
+    auto start_memcpy_dh = now();
+
+    // Copy result to host
+    dest.create(height, width, CV_32FC3);
+    float* pHostDst = (float*) dest.data;
+
+    std::memcpy(pHostDst, pDeviceDst, imageBytes);
+
+    total_time += calc_print_elapsed("std::memcpy H<-D", start_memcpy_dh);
+
+    // cudaHostAlloc memory must be released with cudaFreeHost, not cudaFree
+    checkCudaErrors( cudaFreeHost(pDeviceSrc) );
+    checkCudaErrors( cudaFreeHost(pDeviceDst) );
+    // Only for custom row/col filter
+    checkCudaErrors( cudaFreeHost(pDeviceKernel) );
+
+    std::cout << "[done] sobel" << std::endl;
+    std::cout << "  primary compute time: " << compute_time << " (ms)" << std::endl;
+    std::cout << "  total compute time:   " << compute_time + total_time << " (ms)" << std::endl;
+  }
+
+}
+
diff --git a/src/run_dense.cpp b/src/run_dense.cpp
index b307080..ac3bdd5 100644
--- a/src/run_dense.cpp
+++ b/src/run_dense.cpp
@@ -15,6 +15,7 @@
 #include "kernels/warmup.h"
 #include "kernels/pad.h"
 #include "common/timer.h"
+#include "common/cuda_helper.h"
 
 
 using namespace std;
@@ -113,6 +114,8 @@ int AutoFirstScaleSelect(int imgwidth, int fratio, int patchsize) {
 
 int main( int argc, char** argv ) {
 
+  initializeCuda(argc, argv);
+
   // Warmup GPU
   cu::warmup();