Commit 4fe277f: Add setup function, pipeline until img pyramids

Ashwin Sekar committed May 8, 2017
1 parent 5523fd3
Showing 9 changed files with 212 additions and 200 deletions.
src/CMakeLists.txt (3 changes: 1 addition & 2 deletions)
@@ -71,7 +71,7 @@ set(CODEFILES
 FDF1.0.1/opticalflow_aux.c
 FDF1.0.1/solver.c)
 
-# GrayScale, Optical Flow
+# RGB, Optical Flow
 cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
 set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
 set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
@@ -87,4 +87,3 @@ set(SANDBOX_FILES
 sandbox/sandbox.cpp)
 cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
 target_link_libraries(sandbox ${OpenCV_LIBS})
-
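
The SELECTMODE and SELECTCHANNEL compile definitions set above are consumed as preprocessor guards in the sources; the comment change to "RGB" matches the SELECTCHANNEL=3 setting. A minimal sketch of how such a guard is typically consumed; the pixel_t typedef is a hypothetical stand-in, not the project's own type:

    // Illustrative consumption of the CMake compile definitions above.
    // pixel_t is hypothetical; the FDF1.0.1 sources define their own types.
    #if (SELECTCHANNEL == 1)
    typedef float pixel_t;        // grayscale: one float per pixel
    #elif (SELECTCHANNEL == 3)
    typedef float pixel_t[3];     // RGB: three floats per pixel
    #endif
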
src/kernels/pyramid.cpp (72 changes: 7 additions & 65 deletions)
@@ -75,16 +75,11 @@ namespace cu {
 // Allocate device memory (to account for padding too)
 auto start_cuda_malloc = now();
 Npp32f *pDeviceIx, *pDeviceIy;
-Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
 Npp32f *pDeviceTmp, *pDeviceKernel;
 
 checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
 checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
 
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI, padWidth * padHeight * elemSize) );
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) );
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) );
-
 checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) );
 checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );
 
@@ -142,44 +137,17 @@ namespace cu {
 NPP_CHECK_NPP(
 nppiCopyReplicateBorder_32f_C3R (
 pDeviceI, nSrcStep, oSize,
-pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+Is[0], nDstStep, oPadSize, padding, padding) );
 
 // Pad dx, dy
-checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIx, nSrcStep, oSize,
-pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+Ixs[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIy, nSrcStep, oSize,
-pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-// Copy Is[0] I, dx, dy
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-auto start_cp_Is0 = now();
-Is[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Is[0], pDevicePaddedI,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
-
-auto start_cp_dx = now();
-Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Ixs[0], pDevicePaddedIx,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
-
-auto start_cp_dy = now();
-Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Iys[0], pDevicePaddedIy,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
+Iys[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -241,7 +209,7 @@ namespace cu {
 pDeviceI, nSrcStep, oSize, oOffset,
 pDeviceIy, nSrcStep, oROI,
 pDeviceKernel, nMaskSize, nAnchor, eBorderType)
-);
+);
 compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);
 
@@ -257,48 +225,23 @@ namespace cu {
 NPP_CHECK_NPP(
 nppiCopyReplicateBorder_32f_C3R (
 pDeviceI, nSrcStep, oSize,
-pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+Is[i], nDstStep, oPadSize, padding, padding) );
 
 // Pad dx, dy
-checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIx, nSrcStep, oSize,
-pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+Ixs[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIy, nSrcStep, oSize,
-pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-// Allocate host destinations
-auto start_host_alloc = now();
-Is[i] = new float[oPadSize.width * oPadSize.height * channels];
-Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
-Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
-compute_time += calc_print_elapsed("host alloc", start_host_alloc);
-
-// Copy over data
-auto start_cp = now();
-checkCudaErrors(
-cudaMemcpy(Is[i], pDevicePaddedI,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-checkCudaErrors(
-cudaMemcpy(Ixs[i], pDevicePaddedIx,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-checkCudaErrors(
-cudaMemcpy(Iys[i], pDevicePaddedIy,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
+Iys[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
 }
 
 // Clean up
 cudaFree(pDeviceIx);
 cudaFree(pDeviceIy);
-cudaFree(pDevicePaddedI);
-cudaFree(pDevicePaddedIx);
-cudaFree(pDevicePaddedIy);
 cudaFree(pDeviceTmp);
 cudaFree(pDeviceKernel);
 
@@ -307,4 +250,3 @@ namespace cu {
 }
 
 }
-
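
The net effect of the pyramid.cpp changes: Is, Ixs, and Iys now receive device pointers, so each level is padded directly into its destination buffer on the GPU, and the pDevicePadded* scratch buffers and per-level device-to-host copies disappear. A minimal sketch of the resulting pattern, assuming a preallocated device destination (the helper itself is illustrative, and error checking via NPP_CHECK_NPP is omitted):

    #include <nppi.h>

    // Pad one pyramid level straight into its device-resident destination
    // (e.g. Is[i]); no staging buffer and no host round trip.
    void padLevelOnDevice(const Npp32f* pDeviceI, Npp32f* pDeviceDst,
                          int nSrcStep, NppiSize oSize,
                          int nDstStep, NppiSize oPadSize, int padding) {
      // Replicate the image border into the padding region on the GPU.
      nppiCopyReplicateBorder_32f_C3R(pDeviceI, nSrcStep, oSize,
                                      pDeviceDst, nDstStep, oPadSize,
                                      padding, padding);
    }
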
src/oflow.cpp (160 changes: 112 additions & 48 deletions)
@@ -38,7 +38,10 @@ using namespace timer;
 
 namespace OFC {
 
-OFClass::OFClass(opt_params _op) {
+OFClass::OFClass(opt_params _op, img_params _iparams) {
+
+struct timeval tv_start_all, tv_end_all, tv_start_all_global, tv_end_all_global;
+if (_op.verbosity > 1) gettimeofday(&tv_start_all_global, nullptr);
 
 // Parse optimization parameters
 op = _op;
@@ -54,6 +57,13 @@ namespace OFC {
 op.dr_thresh = 0.95;
 op.res_thresh = 0.0;
 
+// Initialize cuBLAS
+cublasStatus_t stat = cublasCreate(&op.cublasHandle);
+if (stat != CUBLAS_STATUS_SUCCESS) {
+printf ("CUBLAS initialization failed\n");
+exit(-1);
+}
+
 // Allocate scale pyramids
 I0s = new float*[op.coarsest_scale+1];
 I1s = new float*[op.coarsest_scale+1];
@@ -62,8 +72,95 @@ namespace OFC {
 I1xs = new float*[op.coarsest_scale+1];
 I1ys = new float*[op.coarsest_scale+1];
 
+// Create grids on each scale
+if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
+
+
+grid.resize(op.n_scales);
+flow.resize(op.n_scales);
+iparams.resize(op.n_scales);
+for (int sl = op.coarsest_scale; sl >= 0; --sl) {
+
+int i = sl - op.finest_scale;
+
+float scale_fact = pow(2, -sl); // scaling factor at current scale
+if (i >= 0) {
+iparams[i].scale_fact = scale_fact;
+iparams[i].height = _iparams.height * scale_fact;
+iparams[i].width = _iparams.width * scale_fact;
+iparams[i].padding = _iparams.padding;
+iparams[i].l_bound = -(float) op.patch_size / 2;
+iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
+iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
+iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
+iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
+iparams[i].curr_lvl = sl;
+
+flow[i] = new float[2 * iparams[i].width * iparams[i].height];
+grid[i] = new OFC::PatGridClass(&(iparams[i]), &op);
+}
+
+int elemSize = 3 * sizeof(float);
+int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding;
+int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding;
+
+checkCudaErrors( cudaMalloc((void**) &I0s[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I0xs[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I0ys[sl], padWidth * padHeight * elemSize) );
+
+checkCudaErrors( cudaMalloc((void**) &I1s[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I1xs[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I1ys[sl], padWidth * padHeight * elemSize) );
+}
+
+// Timing, Grid memory allocation
+if (op.verbosity>1) {
+
+gettimeofday(&tv_end_all, nullptr);
+double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
+printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
+
+}
+
+// Timing, Setup
+if (op.verbosity>1) {
+
+gettimeofday(&tv_end_all_global, nullptr);
+double tt = (tv_end_all_global.tv_sec-tv_start_all_global.tv_sec)*1000.0f + (tv_end_all_global.tv_usec-tv_start_all_global.tv_usec)/1000.0f;
+printf("TIME (Setup) (ms): %3g\n", tt);
+}
+
 }
 
+OFClass::~OFClass() {
+
+cublasDestroy(op.cublasHandle);
+
+for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
+
+delete[] flow[sl - op.finest_scale];
+delete grid[sl - op.finest_scale];
+
+}
+
+for (int i = 0; i <= op.coarsest_scale; i++) {
+cudaFree(I0s[i]);
+cudaFree(I0xs[i]);
+cudaFree(I0ys[i]);
+
+cudaFree(I1s[i]);
+cudaFree(I1xs[i]);
+cudaFree(I1ys[i]);
+}
+
+delete[] I0s;
+delete[] I1s;
+delete[] I0xs;
+delete[] I0ys;
+delete[] I1xs;
+delete[] I1ys;
+
+}
 
 
 void OFClass::ConstructImgPyramids(img_params iparams) {
@@ -121,43 +218,6 @@ namespace OFC {
 
 }
 
-if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
-
-
-// Create grids on each scale
-vector<OFC::PatGridClass*> grid(op.n_scales);
-vector<float*> flow(op.n_scales);
-iparams.resize(op.n_scales);
-for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-int i = sl - op.finest_scale;
-
-float scale_fact = pow(2, -sl); // scaling factor at current scale
-iparams[i].scale_fact = scale_fact;
-iparams[i].height = _iparams.height * scale_fact;
-iparams[i].width = _iparams.width * scale_fact;
-iparams[i].padding = _iparams.padding;
-iparams[i].l_bound = -(float) op.patch_size / 2;
-iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
-iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
-iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
-iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
-iparams[i].curr_lvl = sl;
-
-flow[i] = new float[2 * iparams[i].width * iparams[i].height];
-grid[i] = new OFC::PatGridClass(&(iparams[i]), &op);
-
-}
-
-// Timing, Grid memory allocation
-if (op.verbosity>1) {
-
-gettimeofday(&tv_end_all, nullptr);
-double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
-printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
-
-}
-
 
 // Main loop; Operate over scales, coarse-to-fine
 for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
@@ -236,8 +296,21 @@ namespace OFC {
 
 // Variational refinement, (Step 5 in Algorithm 1 of paper)
 if (op.use_var_ref) {
+float* I0H, * I1H;
+int elemSize = 3 * sizeof(float);
+int size = iparams[ii].width_pad * iparams[ii].height_pad * elemSize;
+I0H = (float*) malloc(size);
+I1H = (float*) malloc(size);
 
-OFC::VarRefClass var_ref(I0s[sl], I1s[sl], &(iparams[ii]), &op, out_ptr);
+checkCudaErrors(
+cudaMemcpy(I0H, I0s[sl], size, cudaMemcpyDeviceToHost) );
+checkCudaErrors(
+cudaMemcpy(I1H, I1s[sl], size, cudaMemcpyDeviceToHost) );
+
+OFC::VarRefClass var_ref(I0H, I1H, &(iparams[ii]), &op, out_ptr);
+
+free(I0H);
+free(I1H);
 
 }
 
@@ -254,15 +327,6 @@ namespace OFC {
 
 }
 
-// Clean up
-for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-delete[] flow[sl - op.finest_scale];
-delete grid[sl - op.finest_scale];
-
-}
-
-
 // Timing, total algorithm run-time
 if (op.verbosity > 0) {
 
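For concreteness, the per-level bookkeeping in the new constructor halves the resolution at each scale (scale_fact = 2^-sl) and pads both sides by a fixed margin. A standalone sketch of that arithmetic under assumed inputs (a 1024x436 frame with padding 16; both values are illustrative, the real ones come from img_params):

    #include <cmath>
    #include <cstdio>

    int main() {
      int width = 1024, height = 436, padding = 16;  // assumed inputs
      for (int sl = 3; sl >= 0; --sl) {
        float scale_fact = std::pow(2.0f, -sl);      // 1/8, 1/4, 1/2, 1
        int w = width * scale_fact;                  // truncation matches the diff
        int h = height * scale_fact;
        std::printf("sl=%d: %dx%d, padded %dx%d\n",
                    sl, w, h, w + 2 * padding, h + 2 * padding);
      }
      return 0;  // sl=3 prints: 128x54, padded 160x86
    }
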
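The constructor and destructor now bracket a cuBLAS handle: cublasCreate during setup, cublasDestroy during teardown. A minimal standalone sketch of that lifecycle, using only the documented cuBLAS v2 API:

    #include <cstdio>
    #include <cstdlib>
    #include <cublas_v2.h>

    int main() {
      cublasHandle_t handle;

      // Acquire the handle once up front, as the new constructor does.
      if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::printf("CUBLAS initialization failed\n");
        std::exit(-1);
      }

      // ... issue cuBLAS calls against handle here ...

      // Release it exactly once on teardown, as the new destructor does.
      cublasDestroy(handle);
      return 0;
    }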

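Because the pyramids now live in GPU memory, the variational-refinement step first stages both images back to the host. The shape of that pattern in a self-contained sketch (the function name and callback are illustrative; the diff sizes the buffers as width_pad * height_pad * 3 floats):

    #include <cstdlib>
    #include <cuda_runtime.h>

    // Stage two padded RGB float images from device to temporary host
    // buffers, hand them to a host-side consumer, then free the staging memory.
    void refineOnHost(const float* dI0, const float* dI1,
                      int width_pad, int height_pad,
                      void (*consume)(const float*, const float*)) {
      size_t size = (size_t) width_pad * height_pad * 3 * sizeof(float);
      float* hI0 = (float*) std::malloc(size);
      float* hI1 = (float*) std::malloc(size);

      cudaMemcpy(hI0, dI0, size, cudaMemcpyDeviceToHost);
      cudaMemcpy(hI1, dI1, size, cudaMemcpyDeviceToHost);

      consume(hI0, hI1);  // e.g. construct the host-side refinement here

      std::free(hI0);     // malloc pairs with free
      std::free(hI1);
    }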