From 7038d8f0ab2543ca9bbc49cb590e7ac3b19f982d Mon Sep 17 00:00:00 2001 From: Ashwin Sekar Date: Mon, 8 May 2017 20:36:59 -0400 Subject: [PATCH] Initial attempt at reducing mallocs in construct pyramid (linker error) --- src/CMakeLists.txt | 2 +- src/kernels/pyramid.cpp | 39 ++++++--------------------------------- src/kernels/pyramid.h | 3 ++- src/oflow.cpp | 29 +++++++++++++++++++++++------ src/oflow.h | 4 ++++ src/params.h | 6 +++--- src/run_dense.cpp | 8 +++++--- 7 files changed, 44 insertions(+), 47 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d817b29..0e9a9f3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,7 +72,7 @@ set(CODEFILES FDF1.0.1/solver.c) # RGB, Optical Flow -cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS}) +cuda_add_executable(flow ${COMMON} ${KERNELS} ${CODEFILES}) set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1") set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image target_link_libraries(flow ${OpenCV_LIBS}) diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp index 03bf9b1..b0b1b2c 100644 --- a/src/kernels/pyramid.cpp +++ b/src/kernels/pyramid.cpp @@ -31,7 +31,8 @@ namespace cu { void constructImgPyramids( Npp32f* src, float** Is, float** Ixs, float** Iys, - int width, int height, + Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp, + Npp32f* pDeviceWew, int width, int height, int padding, int nLevels) { // Timing @@ -51,7 +52,6 @@ namespace cu { NppiSize oROI = { width, height }; // Mask params - const Npp32f pSrcKernel[3] = { 1, 0, -1 }; Npp32s nMaskSize = 3; Npp32s nAnchor = 1; // Kernel is centered over pixel @@ -72,27 +72,6 @@ namespace cu { Npp32f* pDeviceI = src; - // Allocate device memory (to account for padding too - auto start_cuda_malloc = now(); - Npp32f *pDeviceIx, *pDeviceIy; - Npp32f *pDeviceTmp, *pDeviceKernel; - - checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) ); - - checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) ); - - calc_print_elapsed("cudaMalloc", start_cuda_malloc); - - // Copy over initial image and kernel - auto start_memcpy_hd = now(); - - checkCudaErrors( - cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) ); - - calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd); - //////////////////////////////////////////////////////////////////////////////////////////////// // Apply first gradients to Is[0] //////////////////////////////////////////////////////////////////////////////////////////////// @@ -107,7 +86,7 @@ namespace cu { nppiFilterRowBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, oOffset, pDeviceIx, nSrcStep, oROI, - pDeviceKernel, nMaskSize, nAnchor, eBorderType) + pDeviceWew, nMaskSize, nAnchor, eBorderType) ); compute_time += calc_print_elapsed("sobel: Ixs[0]", start_dx); @@ -121,7 +100,7 @@ namespace cu { nppiFilterColumnBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, oOffset, pDeviceIy, nSrcStep, oROI, - pDeviceKernel, nMaskSize, nAnchor, eBorderType) + pDeviceWew, nMaskSize, nAnchor, eBorderType) ); compute_time += calc_print_elapsed("sobel: Iys[0]", start_dy); @@ -194,7 +173,7 @@ namespace cu { nppiFilterRowBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, oOffset, pDeviceIx, nSrcStep, oROI, - pDeviceKernel, nMaskSize, nAnchor, eBorderType) + pDeviceWew, nMaskSize, nAnchor, eBorderType) ); compute_time += calc_print_elapsed("sobel: Ixs[i]", start_dx); @@ -208,7 +187,7 @@ namespace cu { nppiFilterColumnBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, oOffset, pDeviceIy, nSrcStep, oROI, - pDeviceKernel, nMaskSize, nAnchor, eBorderType) + pDeviceWew, nMaskSize, nAnchor, eBorderType) ); compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy); @@ -239,12 +218,6 @@ namespace cu { } - // Clean up - cudaFree(pDeviceIx); - cudaFree(pDeviceIy); - cudaFree(pDeviceTmp); - cudaFree(pDeviceKernel); - calc_print_elapsed("total time", start_total); std::cout << "[done] constructImgPyramids: primmary compute time: " << compute_time << std::endl; } diff --git a/src/kernels/pyramid.h b/src/kernels/pyramid.h index c46d6ad..407ef52 100644 --- a/src/kernels/pyramid.h +++ b/src/kernels/pyramid.h @@ -30,7 +30,8 @@ namespace cu { void constructImgPyramids( Npp32f* I, float** Is, float** Ixs, float** Iys, - int width, int height, + Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp, + Npp32f* pDeviceWew, int width, int height, int padding, int nLevels); } diff --git a/src/oflow.cpp b/src/oflow.cpp index 7050848..b8ea919 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -18,15 +18,12 @@ #include #include "oflow.h" -#include "patchgrid.h" -#include "refine_variational.h" #include "kernels/resize.h" #include "kernels/pad.h" #include "kernels/resizeGrad.h" #include "kernels/sobel.h" #include "kernels/pyramid.h" -#include "common/RgbMat.h" #include "common/timer.h" @@ -76,6 +73,7 @@ namespace OFC { if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr); + int elemSize = 3 * sizeof(float); grid.resize(op.n_scales); flow.resize(op.n_scales); iparams.resize(op.n_scales); @@ -100,7 +98,6 @@ namespace OFC { grid[i] = new OFC::PatGridClass(&(iparams[i]), &op); } - int elemSize = 3 * sizeof(float); int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding; int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding; @@ -122,6 +119,18 @@ namespace OFC { } + const Npp32f pSrcKernel[3] = { 1, 0, -1 }; + Npp32s nMaskSize = 3; + + checkCudaErrors( cudaMalloc((void**) &pDeviceIx, _iparams.width * _iparams.height * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &pDeviceIy, _iparams.width * _iparams.height * elemSize) ); + + checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, _iparams.width * _iparams.height * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &pDeviceWew, nMaskSize * sizeof(Npp32f)) ); + + checkCudaErrors( + cudaMemcpy(pDeviceWew, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) ); + // Timing, Setup if (op.verbosity>1) { @@ -160,6 +169,10 @@ namespace OFC { delete I1xs; delete I1ys; + cudaFree(pDeviceIx); + cudaFree(pDeviceIy); + cudaFree(pDeviceTmp); + cudaFree(pDeviceWew); } @@ -170,9 +183,13 @@ namespace OFC { gettimeofday(&start_time, NULL); // Construct image and gradient pyramides - cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height, + cu::constructImgPyramids(I0, I0s, I0xs, I0ys, + pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew, + iparams.width, iparams.height, op.patch_size, op.coarsest_scale + 1); - cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height, + cu::constructImgPyramids(I1, I1s, I1xs, I1ys, + pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew, + iparams.width, iparams.height, op.patch_size, op.coarsest_scale + 1); // Timing, image gradients and pyramid diff --git a/src/oflow.h b/src/oflow.h index d924f1a..c553c76 100644 --- a/src/oflow.h +++ b/src/oflow.h @@ -12,6 +12,7 @@ #include #include "params.h" #include "patchgrid.h" +#include "refine_variational.h" using std::cout; using std::endl; @@ -39,6 +40,9 @@ namespace OFC { std::vector grid; std::vector flow; + + // Temp images to speedup pyramid generation + Npp32f* pDeviceIx, *pDeviceIy, *pDeviceTmp, *pDeviceWew; }; } diff --git a/src/params.h b/src/params.h index cd51f9c..bee846c 100644 --- a/src/params.h +++ b/src/params.h @@ -1,7 +1,7 @@ // Holds all of the paramaters structures -#ifndef PARAMS_HEADER -#define PARAMS_HEADER +#ifndef OFC_PARAMS_HEADER +#define OFC_PARAMS_HEADER namespace OFC { @@ -68,4 +68,4 @@ namespace OFC { } -#endif /* PARAMS_HEADER */ +#endif /* OFC_PARAMS_HEADER */ diff --git a/src/run_dense.cpp b/src/run_dense.cpp index a6655d9..b307080 100644 --- a/src/run_dense.cpp +++ b/src/run_dense.cpp @@ -6,14 +6,16 @@ #include #include +// CUDA +#include +#include + +#include "params.h" #include "oflow.h" #include "kernels/warmup.h" #include "kernels/pad.h" #include "common/timer.h" -// CUDA -#include -#include using namespace std; using namespace OFC;