Skip to content

Commit

Permalink
Initial attempt at reducing mallocs in construct pyramid (linker error)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ashwin Sekar committed May 9, 2017
1 parent 2c09960 commit 7038d8f
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 47 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ set(CODEFILES
FDF1.0.1/solver.c)

# RGB, Optical Flow
cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
cuda_add_executable(flow ${COMMON} ${KERNELS} ${CODEFILES})
set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
target_link_libraries(flow ${OpenCV_LIBS})
Expand Down
39 changes: 6 additions & 33 deletions src/kernels/pyramid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ namespace cu {

void constructImgPyramids(
Npp32f* src, float** Is, float** Ixs, float** Iys,
int width, int height,
Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp,
Npp32f* pDeviceWew, int width, int height,
int padding, int nLevels) {

// Timing
Expand All @@ -51,7 +52,6 @@ namespace cu {
NppiSize oROI = { width, height };

// Mask params
const Npp32f pSrcKernel[3] = { 1, 0, -1 };
Npp32s nMaskSize = 3;
Npp32s nAnchor = 1; // Kernel is centered over pixel

Expand All @@ -72,27 +72,6 @@ namespace cu {

Npp32f* pDeviceI = src;

// Allocate device memory (to account for padding too)
auto start_cuda_malloc = now();
Npp32f *pDeviceIx, *pDeviceIy;
Npp32f *pDeviceTmp, *pDeviceKernel;

checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );

checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );

calc_print_elapsed("cudaMalloc", start_cuda_malloc);

// Copy over initial image and kernel
auto start_memcpy_hd = now();

checkCudaErrors(
cudaMemcpy(pDeviceKernel, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );

calc_print_elapsed("cudaMemcpy Kernel H->D", start_memcpy_hd);

////////////////////////////////////////////////////////////////////////////////////////////////
// Apply first gradients to Is[0]
////////////////////////////////////////////////////////////////////////////////////////////////
Expand All @@ -107,7 +86,7 @@ namespace cu {
nppiFilterRowBorder_32f_C3R (
pDeviceI, nSrcStep, oSize, oOffset,
pDeviceIx, nSrcStep, oROI,
pDeviceKernel, nMaskSize, nAnchor, eBorderType)
pDeviceWew, nMaskSize, nAnchor, eBorderType)
);
compute_time += calc_print_elapsed("sobel: Ixs[0]", start_dx);

Expand All @@ -121,7 +100,7 @@ namespace cu {
nppiFilterColumnBorder_32f_C3R (
pDeviceI, nSrcStep, oSize, oOffset,
pDeviceIy, nSrcStep, oROI,
pDeviceKernel, nMaskSize, nAnchor, eBorderType)
pDeviceWew, nMaskSize, nAnchor, eBorderType)
);
compute_time += calc_print_elapsed("sobel: Iys[0]", start_dy);

Expand Down Expand Up @@ -194,7 +173,7 @@ namespace cu {
nppiFilterRowBorder_32f_C3R (
pDeviceI, nSrcStep, oSize, oOffset,
pDeviceIx, nSrcStep, oROI,
pDeviceKernel, nMaskSize, nAnchor, eBorderType)
pDeviceWew, nMaskSize, nAnchor, eBorderType)
);
compute_time += calc_print_elapsed("sobel: Ixs[i]", start_dx);

Expand All @@ -208,7 +187,7 @@ namespace cu {
nppiFilterColumnBorder_32f_C3R (
pDeviceI, nSrcStep, oSize, oOffset,
pDeviceIy, nSrcStep, oROI,
pDeviceKernel, nMaskSize, nAnchor, eBorderType)
pDeviceWew, nMaskSize, nAnchor, eBorderType)
);
compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);

Expand Down Expand Up @@ -239,12 +218,6 @@ namespace cu {

}

// Clean up
cudaFree(pDeviceIx);
cudaFree(pDeviceIy);
cudaFree(pDeviceTmp);
cudaFree(pDeviceKernel);

calc_print_elapsed("total time", start_total);
std::cout << "[done] constructImgPyramids: primmary compute time: " << compute_time << std::endl;
}
Expand Down
3 changes: 2 additions & 1 deletion src/kernels/pyramid.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ namespace cu {

void constructImgPyramids(
Npp32f* I, float** Is, float** Ixs, float** Iys,
int width, int height,
Npp32f* pDeviceIx, Npp32f* pDeviceIy, Npp32f* pDeviceTmp,
Npp32f* pDeviceWew, int width, int height,
int padding, int nLevels);

}
Expand Down
29 changes: 23 additions & 6 deletions src/oflow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,12 @@
#include <stdio.h>

#include "oflow.h"
#include "patchgrid.h"
#include "refine_variational.h"

#include "kernels/resize.h"
#include "kernels/pad.h"
#include "kernels/resizeGrad.h"
#include "kernels/sobel.h"
#include "kernels/pyramid.h"
#include "common/RgbMat.h"
#include "common/timer.h"


Expand Down Expand Up @@ -76,6 +73,7 @@ namespace OFC {
if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);


int elemSize = 3 * sizeof(float);
grid.resize(op.n_scales);
flow.resize(op.n_scales);
iparams.resize(op.n_scales);
Expand All @@ -100,7 +98,6 @@ namespace OFC {
grid[i] = new OFC::PatGridClass(&(iparams[i]), &op);
}

int elemSize = 3 * sizeof(float);
int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding;
int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding;

Expand All @@ -122,6 +119,18 @@ namespace OFC {

}

const Npp32f pSrcKernel[3] = { 1, 0, -1 };
Npp32s nMaskSize = 3;

checkCudaErrors( cudaMalloc((void**) &pDeviceIx, _iparams.width * _iparams.height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceIy, _iparams.width * _iparams.height * elemSize) );

checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, _iparams.width * _iparams.height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceWew, nMaskSize * sizeof(Npp32f)) );

checkCudaErrors(
cudaMemcpy(pDeviceWew, pSrcKernel, nMaskSize * sizeof(Npp32f), cudaMemcpyHostToDevice) );

// Timing, Setup
if (op.verbosity>1) {

Expand Down Expand Up @@ -160,6 +169,10 @@ namespace OFC {
delete I1xs;
delete I1ys;

cudaFree(pDeviceIx);
cudaFree(pDeviceIy);
cudaFree(pDeviceTmp);
cudaFree(pDeviceWew);
}


Expand All @@ -170,9 +183,13 @@ namespace OFC {
gettimeofday(&start_time, NULL);

// Construct image and gradient pyramids
cu::constructImgPyramids(I0, I0s, I0xs, I0ys, iparams.width, iparams.height,
cu::constructImgPyramids(I0, I0s, I0xs, I0ys,
pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew,
iparams.width, iparams.height,
op.patch_size, op.coarsest_scale + 1);
cu::constructImgPyramids(I1, I1s, I1xs, I1ys, iparams.width, iparams.height,
cu::constructImgPyramids(I1, I1s, I1xs, I1ys,
pDeviceIx, pDeviceIy, pDeviceTmp, pDeviceWew,
iparams.width, iparams.height,
op.patch_size, op.coarsest_scale + 1);

// Timing, image gradients and pyramid
Expand Down
4 changes: 4 additions & 0 deletions src/oflow.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <opencv2/imgproc/imgproc.hpp>
#include "params.h"
#include "patchgrid.h"
#include "refine_variational.h"

using std::cout;
using std::endl;
Expand Down Expand Up @@ -39,6 +40,9 @@ namespace OFC {

std::vector<PatGridClass*> grid;
std::vector<float*> flow;

// Temp images to speedup pyramid generation
Npp32f* pDeviceIx, *pDeviceIy, *pDeviceTmp, *pDeviceWew;
};

}
Expand Down
6 changes: 3 additions & 3 deletions src/params.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Holds all of the parameter structures

#ifndef PARAMS_HEADER
#define PARAMS_HEADER
#ifndef OFC_PARAMS_HEADER
#define OFC_PARAMS_HEADER

namespace OFC {

Expand Down Expand Up @@ -68,4 +68,4 @@ namespace OFC {

}

#endif /* PARAMS_HEADER */
#endif /* OFC_PARAMS_HEADER */
8 changes: 5 additions & 3 deletions src/run_dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@
#include <sys/time.h>
#include <fstream>

// CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "params.h"
#include "oflow.h"
#include "kernels/warmup.h"
#include "kernels/pad.h"
#include "common/timer.h"

// CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>

using namespace std;
using namespace OFC;
Expand Down

0 comments on commit 7038d8f

Please sign in to comment.