Commit 4fe277f: Add setup function, pipeline until img pyramids

Ashwin Sekar committed May 8, 2017
1 parent 5523fd3
Showing 9 changed files with 212 additions and 200 deletions.
src/CMakeLists.txt (3 changes: 1 addition & 2 deletions)
@@ -71,7 +71,7 @@ set(CODEFILES
 FDF1.0.1/opticalflow_aux.c
 FDF1.0.1/solver.c)
 
-# GrayScale, Optical Flow
+# RGB, Optical Flow
 cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
 set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
 set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
@@ -87,4 +87,3 @@ set(SANDBOX_FILES
 sandbox/sandbox.cpp)
 cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
 target_link_libraries(sandbox ${OpenCV_LIBS})
-
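
The SELECTMODE and SELECTCHANNEL compile definitions set above are consumed as preprocessor guards in the sources; the comment change to "RGB" matches the SELECTCHANNEL=3 setting. A minimal sketch of how such a guard is typically consumed; the pixel_t typedef is a hypothetical stand-in, not the project's own type:

    // Illustrative consumption of the CMake compile definitions above.
    // pixel_t is hypothetical; the FDF1.0.1 sources define their own types.
    #if (SELECTCHANNEL == 1)
    typedef float pixel_t;        // grayscale: one float per pixel
    #elif (SELECTCHANNEL == 3)
    typedef float pixel_t[3];     // RGB: three floats per pixel
    #endif
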
src/kernels/pyramid.cpp (72 changes: 7 additions & 65 deletions)
@@ -75,16 +75,11 @@ namespace cu {
 // Allocate device memory (to account for padding too)
 auto start_cuda_malloc = now();
 Npp32f *pDeviceIx, *pDeviceIy;
-Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
 Npp32f *pDeviceTmp, *pDeviceKernel;
 
 checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
 checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
 
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI, padWidth * padHeight * elemSize) );
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) );
-checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) );
-
 checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) );
 checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );
 
@@ -142,44 +137,17 @@ namespace cu {
 NPP_CHECK_NPP(
 nppiCopyReplicateBorder_32f_C3R (
 pDeviceI, nSrcStep, oSize,
-pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+Is[0], nDstStep, oPadSize, padding, padding) );
 
 // Pad dx, dy
-checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIx, nSrcStep, oSize,
-pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+Ixs[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIy, nSrcStep, oSize,
-pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-// Copy Is[0] I, dx, dy
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-auto start_cp_Is0 = now();
-Is[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Is[0], pDevicePaddedI,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
-
-auto start_cp_dx = now();
-Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Ixs[0], pDevicePaddedIx,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
-
-auto start_cp_dy = now();
-Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
-checkCudaErrors(
-cudaMemcpy(Iys[0], pDevicePaddedIy,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
+Iys[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -241,7 +209,7 @@ namespace cu {
 pDeviceI, nSrcStep, oSize, oOffset,
 pDeviceIy, nSrcStep, oROI,
 pDeviceKernel, nMaskSize, nAnchor, eBorderType)
-);
+);
 compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);
 
@@ -257,48 +225,23 @@ namespace cu {
 NPP_CHECK_NPP(
 nppiCopyReplicateBorder_32f_C3R (
 pDeviceI, nSrcStep, oSize,
-pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+Is[i], nDstStep, oPadSize, padding, padding) );
 
 // Pad dx, dy
-checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIx, nSrcStep, oSize,
-pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+Ixs[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 NPP_CHECK_NPP(
 nppiCopyConstBorder_32f_C3R (
 pDeviceIy, nSrcStep, oSize,
-pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-// Allocate host destinations
-auto start_host_alloc = now();
-Is[i] = new float[oPadSize.width * oPadSize.height * channels];
-Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
-Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
-compute_time += calc_print_elapsed("host alloc", start_host_alloc);
-
-// Copy over data
-auto start_cp = now();
-checkCudaErrors(
-cudaMemcpy(Is[i], pDevicePaddedI,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-checkCudaErrors(
-cudaMemcpy(Ixs[i], pDevicePaddedIx,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-checkCudaErrors(
-cudaMemcpy(Iys[i], pDevicePaddedIy,
-oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
+Iys[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
 }
 
 // Clean up
 cudaFree(pDeviceIx);
 cudaFree(pDeviceIy);
-cudaFree(pDevicePaddedI);
-cudaFree(pDevicePaddedIx);
-cudaFree(pDevicePaddedIy);
 cudaFree(pDeviceTmp);
 cudaFree(pDeviceKernel);
 
@@ -307,4 +250,3 @@ namespace cu {
 }
 
 }
-
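
The net effect of the pyramid.cpp changes: Is, Ixs, and Iys now receive device pointers, so each level is padded directly into its destination buffer on the GPU, and the pDevicePadded* scratch buffers and per-level device-to-host copies disappear. A minimal sketch of the resulting pattern, assuming a preallocated device destination (the helper itself is illustrative, and error checking via NPP_CHECK_NPP is omitted):

    #include <nppi.h>

    // Pad one pyramid level straight into its device-resident destination
    // (e.g. Is[i]); no staging buffer and no host round trip.
    void padLevelOnDevice(const Npp32f* pDeviceI, Npp32f* pDeviceDst,
                          int nSrcStep, NppiSize oSize,
                          int nDstStep, NppiSize oPadSize, int padding) {
      // Replicate the image border into the padding region on the GPU.
      nppiCopyReplicateBorder_32f_C3R(pDeviceI, nSrcStep, oSize,
                                      pDeviceDst, nDstStep, oPadSize,
                                      padding, padding);
    }
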
src/oflow.cpp (160 changes: 112 additions & 48 deletions)
@@ -38,7 +38,10 @@ using namespace timer;
 
 namespace OFC {
 
-OFClass::OFClass(opt_params _op) {
+OFClass::OFClass(opt_params _op, img_params _iparams) {
+
+struct timeval tv_start_all, tv_end_all, tv_start_all_global, tv_end_all_global;
+if (_op.verbosity > 1) gettimeofday(&tv_start_all_global, nullptr);
 
 // Parse optimization parameters
 op = _op;
@@ -54,6 +57,13 @@ namespace OFC {
 op.dr_thresh = 0.95;
 op.res_thresh = 0.0;
 
+// Initialize cuBLAS
+cublasStatus_t stat = cublasCreate(&op.cublasHandle);
+if (stat != CUBLAS_STATUS_SUCCESS) {
+printf ("CUBLAS initialization failed\n");
+exit(-1);
+}
+
 // Allocate scale pyramids
 I0s = new float*[op.coarsest_scale+1];
 I1s = new float*[op.coarsest_scale+1];
@@ -62,8 +72,95 @@ namespace OFC {
 I1xs = new float*[op.coarsest_scale+1];
 I1ys = new float*[op.coarsest_scale+1];
 
+// Create grids on each scale
+if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
+
+
+grid.resize(op.n_scales);
+flow.resize(op.n_scales);
+iparams.resize(op.n_scales);
+for (int sl = op.coarsest_scale; sl >= 0; --sl) {
+
+int i = sl - op.finest_scale;
+
+float scale_fact = pow(2, -sl); // scaling factor at current scale
+if (i >= 0) {
+iparams[i].scale_fact = scale_fact;
+iparams[i].height = _iparams.height * scale_fact;
+iparams[i].width = _iparams.width * scale_fact;
+iparams[i].padding = _iparams.padding;
+iparams[i].l_bound = -(float) op.patch_size / 2;
+iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
+iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
+iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
+iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
+iparams[i].curr_lvl = sl;
+
+flow[i] = new float[2 * iparams[i].width * iparams[i].height];
+grid[i] = new OFC::PatGridClass(&(iparams[i]), &op);
+}
+
+int elemSize = 3 * sizeof(float);
+int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding;
+int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding;
+
+checkCudaErrors( cudaMalloc((void**) &I0s[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I0xs[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I0ys[sl], padWidth * padHeight * elemSize) );
+
+checkCudaErrors( cudaMalloc((void**) &I1s[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I1xs[sl], padWidth * padHeight * elemSize) );
+checkCudaErrors( cudaMalloc((void**) &I1ys[sl], padWidth * padHeight * elemSize) );
+}
+
+// Timing, Grid memory allocation
+if (op.verbosity>1) {
+
+gettimeofday(&tv_end_all, nullptr);
+double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
+printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
+
+}
+
+// Timing, Setup
+if (op.verbosity>1) {
+
+gettimeofday(&tv_end_all_global, nullptr);
+double tt = (tv_end_all_global.tv_sec-tv_start_all_global.tv_sec)*1000.0f + (tv_end_all_global.tv_usec-tv_start_all_global.tv_usec)/1000.0f;
+printf("TIME (Setup) (ms): %3g\n", tt);
+}
+
 }
 
+OFClass::~OFClass() {
+
+cublasDestroy(op.cublasHandle);
+
+for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
+
+delete[] flow[sl - op.finest_scale];
+delete grid[sl - op.finest_scale];
+
+}
+
+for (int i = 0; i <= op.coarsest_scale; i++) {
+cudaFree(I0s[i]);
+cudaFree(I0xs[i]);
+cudaFree(I0ys[i]);
+
+cudaFree(I1s[i]);
+cudaFree(I1xs[i]);
+cudaFree(I1ys[i]);
+}
+
+delete[] I0s;
+delete[] I1s;
+delete[] I0xs;
+delete[] I0ys;
+delete[] I1xs;
+delete[] I1ys;
+
+}
 
 
 void OFClass::ConstructImgPyramids(img_params iparams) {
@@ -121,43 +218,6 @@ namespace OFC {
 
 }
 
-if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
-
-
-// Create grids on each scale
-vector<OFC::PatGridClass*> grid(op.n_scales);
-vector<float*> flow(op.n_scales);
-iparams.resize(op.n_scales);
-for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-int i = sl - op.finest_scale;
-
-float scale_fact = pow(2, -sl); // scaling factor at current scale
-iparams[i].scale_fact = scale_fact;
-iparams[i].height = _iparams.height * scale_fact;
-iparams[i].width = _iparams.width * scale_fact;
-iparams[i].padding = _iparams.padding;
-iparams[i].l_bound = -(float) op.patch_size / 2;
-iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
-iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
-iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
-iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
-iparams[i].curr_lvl = sl;
-
-flow[i] = new float[2 * iparams[i].width * iparams[i].height];
-grid[i] = new OFC::PatGridClass(&(iparams[i]), &op);
-
-}
-
-// Timing, Grid memory allocation
-if (op.verbosity>1) {
-
-gettimeofday(&tv_end_all, nullptr);
-double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
-printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
-
-}
-
 
 // Main loop; Operate over scales, coarse-to-fine
 for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
@@ -236,8 +296,21 @@ namespace OFC {
 
 // Variational refinement, (Step 5 in Algorithm 1 of paper)
 if (op.use_var_ref) {
+float* I0H, * I1H;
+int elemSize = 3 * sizeof(float);
+int size = iparams[ii].width_pad * iparams[ii].height_pad * elemSize;
+I0H = (float*) malloc(size);
+I1H = (float*) malloc(size);
 
-OFC::VarRefClass var_ref(I0s[sl], I1s[sl], &(iparams[ii]), &op, out_ptr);
+checkCudaErrors(
+cudaMemcpy(I0H, I0s[sl], size, cudaMemcpyDeviceToHost) );
+checkCudaErrors(
+cudaMemcpy(I1H, I1s[sl], size, cudaMemcpyDeviceToHost) );
+
+OFC::VarRefClass var_ref(I0H, I1H, &(iparams[ii]), &op, out_ptr);
+
+free(I0H);
+free(I1H);
 
 }
 
@@ -254,15 +327,6 @@ namespace OFC {
 
 }
 
-// Clean up
-for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-delete[] flow[sl - op.finest_scale];
-delete grid[sl - op.finest_scale];
-
-}
-
-
 // Timing, total algorithm run-time
 if (op.verbosity > 0) {
 
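For concreteness, the per-level bookkeeping in the new constructor halves the resolution at each scale (scale_fact = 2^-sl) and pads both sides by a fixed margin. A standalone sketch of that arithmetic under assumed inputs (a 1024x436 frame with padding 16; both values are illustrative, the real ones come from img_params):

    #include <cmath>
    #include <cstdio>

    int main() {
      int width = 1024, height = 436, padding = 16;  // assumed inputs
      for (int sl = 3; sl >= 0; --sl) {
        float scale_fact = std::pow(2.0f, -sl);      // 1/8, 1/4, 1/2, 1
        int w = width * scale_fact;                  // truncation matches the diff
        int h = height * scale_fact;
        std::printf("sl=%d: %dx%d, padded %dx%d\n",
                    sl, w, h, w + 2 * padding, h + 2 * padding);
      }
      return 0;  // sl=3 prints: 128x54, padded 160x86
    }
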
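The constructor and destructor now bracket a cuBLAS handle: cublasCreate during setup, cublasDestroy during teardown. A minimal standalone sketch of that lifecycle, using only the documented cuBLAS v2 API:

    #include <cstdio>
    #include <cstdlib>
    #include <cublas_v2.h>

    int main() {
      cublasHandle_t handle;

      // Acquire the handle once up front, as the new constructor does.
      if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::printf("CUBLAS initialization failed\n");
        std::exit(-1);
      }

      // ... issue cuBLAS calls against handle here ...

      // Release it exactly once on teardown, as the new destructor does.
      cublasDestroy(handle);
      return 0;
    }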

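Because the pyramids now live in GPU memory, the variational-refinement step first stages both images back to the host. The shape of that pattern in a self-contained sketch (the function name and callback are illustrative; the diff sizes the buffers as width_pad * height_pad * 3 floats):

    #include <cstdlib>
    #include <cuda_runtime.h>

    // Stage two padded RGB float images from device to temporary host
    // buffers, hand them to a host-side consumer, then free the staging memory.
    void refineOnHost(const float* dI0, const float* dI1,
                      int width_pad, int height_pad,
                      void (*consume)(const float*, const float*)) {
      size_t size = (size_t) width_pad * height_pad * 3 * sizeof(float);
      float* hI0 = (float*) std::malloc(size);
      float* hI1 = (float*) std::malloc(size);

      cudaMemcpy(hI0, dI0, size, cudaMemcpyDeviceToHost);
      cudaMemcpy(hI1, dI1, size, cudaMemcpyDeviceToHost);

      consume(hI0, hI1);  // e.g. construct the host-side refinement here

      std::free(hI0);     // malloc pairs with free
      std::free(hI1);
    }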