diff --git a/src/kernels/densify.cu b/src/kernels/densify.cu index e964794..b331bbc 100644 --- a/src/kernels/densify.cu +++ b/src/kernels/densify.cu @@ -90,13 +90,14 @@ __global__ void kernelDensifyPatches( __global__ void kernelNormalizeFlow( - float* pDeviceFlowOut, float* pDeviceWeights, int N, int numBlocks) { + float* pDeviceFlowOut, float* flow, + float* pDeviceWeights, int N, int numBlocks) { int i = blockIdx.x * blockDim.x + threadIdx.x; for (; i < N; i+= blockDim.x * numBlocks) { if (pDeviceWeights[i / 2] > 0) - pDeviceFlowOut[i] /= pDeviceWeights[i / 2]; + flow[i] = pDeviceFlowOut[i] / pDeviceWeights[i / 2]; } } @@ -121,13 +122,14 @@ namespace cu { patchSize, minErrVal); } - void normalizeFlow( + void normalizeFlow(float* pHostFlowOut, float* pDeviceFlowOut, float* pDeviceWeights, int N) { int nThreadsPerBlock = 64; int nBlocks = 10; - kernelNormalizeFlow<<>>(pDeviceFlowOut, pDeviceWeights, N, nBlocks); + kernelNormalizeFlow<<>>(pDeviceFlowOut, + pHostFlowOut, pDeviceWeights, N, nBlocks); } void densifyPatches( diff --git a/src/kernels/densify.h b/src/kernels/densify.h index fbcfdc7..937dd56 100644 --- a/src/kernels/densify.h +++ b/src/kernels/densify.h @@ -40,7 +40,7 @@ namespace cu { void normalizeFlow( - float* pDeviceFlowOut, float* pDeviceWeights, int N); + float* pDeviceFlowOut, float* pHostFlowOut, float* pDeviceWeights, int N); } diff --git a/src/kernels/extract.cu b/src/kernels/extract.cu index b1900a8..b092817 100644 --- a/src/kernels/extract.cu +++ b/src/kernels/extract.cu @@ -123,8 +123,8 @@ __global__ void kernelExtractPatchesAndHessians( // TODO: merge this with above kernel? __global__ void kernelInitCoarserOF( - float* flowPrev, dev_patch_state* states, int width, - int lb, int ub_w, int ub_h) { + const float* flowPrev, dev_patch_state* states, + int width, int lb, int ub_w, int ub_h) { int patchId = blockIdx.x; int x = floor(states[patchId].midpoint_orgx / 2); @@ -207,7 +207,7 @@ namespace cu { } - void initCoarserOF(float* flowPrev, dev_patch_state* states, + void initCoarserOF(const float* flowPrev, dev_patch_state* states, int n_patches, const img_params* i_params) { int nBlocks = n_patches; diff --git a/src/kernels/extract.h b/src/kernels/extract.h index 0adb83d..f875488 100644 --- a/src/kernels/extract.h +++ b/src/kernels/extract.h @@ -47,7 +47,7 @@ namespace cu { dev_patch_state* states, int n_patches, const opt_params* op, const img_params* i_params); - void initCoarserOF(float* flowPrev, dev_patch_state* states, + void initCoarserOF(const float* flowPrev, dev_patch_state* states, int n_patches, const img_params* i_params); diff --git a/src/kernels/flowUtil.cu b/src/kernels/flowUtil.cu index fd55be5..ca162b4 100644 --- a/src/kernels/flowUtil.cu +++ b/src/kernels/flowUtil.cu @@ -620,7 +620,7 @@ namespace cu { // kernelSubLaplacianHoriz<<>>( // pDeviceSrc, pDeviceDst, pDeviceWeights, pDeviceCoeffs, height, width, stride); cudaDeviceSynchronize(); - calc_print_elapsed("laplacian horiz", start_horiz); + // calc_print_elapsed("laplacian horiz", start_horiz); cudaFree(pDeviceCoeffs); } @@ -641,7 +641,7 @@ namespace cu { auto start_vert = now(); kernelSubLaplacianVert<<>>( d_src, d_src + stride, d_dst, d_dst + stride, d_weights, height, stride); - calc_print_elapsed("laplacian vert", start_vert); + // calc_print_elapsed("laplacian vert", start_vert); } diff --git a/src/oflow.cpp b/src/oflow.cpp index 573bcd6..1830f8b 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -94,7 +94,10 @@ namespace OFC { iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding; iparams[i].curr_lvl = sl; - flow[i] = new float[2 * iparams[i].width * iparams[i].height]; + // flow[i] = new float[2 * iparams[i].width * iparams[i].height]; + checkCudaErrors( + cudaHostAlloc((void**) &(flow[i]), + 2 * iparams[i].width * iparams[i].height * sizeof(float), cudaHostAllocMapped) ); grid[i] = new OFC::PatGridClass(&(iparams[i]), &op); } @@ -147,7 +150,7 @@ namespace OFC { for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) { - delete[] flow[sl - op.finest_scale]; + cudaFree(flow[sl - op.finest_scale]); delete grid[sl - op.finest_scale]; } diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp index 2d5bedc..962e1eb 100644 --- a/src/patchgrid.cpp +++ b/src/patchgrid.cpp @@ -288,13 +288,9 @@ namespace OFC { void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) { - int flow_size = i_params->width * i_params->height / 2; - checkCudaErrors( cudaMemcpy(pDevFlowPrev, flow_prev, - flow_size * sizeof(float), cudaMemcpyHostToDevice) ); - gettimeofday(&tv_start, nullptr); - cu::initCoarserOF(pDevFlowPrev, pDevicePatchStates, + cu::initCoarserOF(flow_prev, pDevicePatchStates, n_patches, i_params); gettimeofday(&tv_end, nullptr); @@ -325,16 +321,12 @@ namespace OFC { gettimeofday(&tv_start, nullptr); // Normalize all pixels - cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, 2 * i_params->width * i_params->height); + cu::normalizeFlow(flowout, pDeviceFlowOut, pDeviceWeights, 2 * i_params->width * i_params->height); gettimeofday(&tv_end, nullptr); meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - checkCudaErrors( - cudaMemcpy(flowout, pDeviceFlowOut, - i_params->width * i_params->height * 2 * sizeof(float), cudaMemcpyDeviceToHost) ); - } diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp index 41d7b46..9299326 100644 --- a/src/refine_variational.cpp +++ b/src/refine_variational.cpp @@ -92,12 +92,12 @@ namespace OFC { copyimage(_I0, I0); copyimage(_I1, I1); - calc_print_elapsed("refine: flow_sep", start_flow_sep); + // calc_print_elapsed("refine: flow_sep", start_flow_sep); // Call solver auto start_solver = now(); RefLevelOF(flow_sep[0], flow_sep[1], I0, I1); - calc_print_elapsed("RefLevelOF [total]", start_solver); + // calc_print_elapsed("RefLevelOF [total]", start_solver); // Copy flow result back auto start_copy = now(); @@ -111,7 +111,7 @@ namespace OFC { } } - calc_print_elapsed("refine: copy back", start_copy); + // calc_print_elapsed("refine: copy back", start_copy); // free FV structs for (int i = 0; i < noparam; ++i ) @@ -170,24 +170,24 @@ namespace OFC { *Ix = color_image_new(width,height), *Iy = color_image_new(width,height), *Iz = color_image_new(width,height), // first order derivatives *Ixx = color_image_new(width,height), *Ixy = color_image_new(width,height), *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives - calc_print_elapsed("RefLevelOF setup", start_setup); + // calc_print_elapsed("RefLevelOF setup", start_setup); // warp second image auto start_image_warp = now(); // image_warp(w_im2, mask, im2, wx, wy); cu::warpImage(w_im2, mask, im2, wx, wy); - calc_print_elapsed("RefLevelOF image_warp", start_image_warp); + // calc_print_elapsed("RefLevelOF image_warp", start_image_warp); // compute derivatives auto start_get_derivs = now(); get_derivatives(im1, w_im2, pDeviceColorDerivativeKernel, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); - calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); + // calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); // erase du and dv auto start_image_erase = now(); image_erase(du); image_erase(dv); - calc_print_elapsed("RefLevelOF image_erase", start_image_erase); + // calc_print_elapsed("RefLevelOF image_erase", start_image_erase); // initialize uu and vv memcpy(uu->c1,wx->c1,wx->stride*wx->height*sizeof(float)); @@ -200,23 +200,23 @@ namespace OFC { // compute robust function and system auto start_smooth = now(); compute_smoothness(smooth_horiz, smooth_vert, uu, vv, pDeviceDerivativeKernel, vr.tmp_quarter_alpha ); - calc_print_elapsed(("RefLevelOF " + iterStr + " smoothness").c_str(), start_smooth); + // calc_print_elapsed(("RefLevelOF " + iterStr + " smoothness").c_str(), start_smooth); auto start_data = now(); // compute_data(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); cu::dataTerm(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); - calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); + // calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); auto start_lapalcian = now(); sub_laplacian(b1, wx, smooth_horiz, smooth_vert); sub_laplacian(b2, wy, smooth_horiz, smooth_vert); - calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); + // calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); // solve system // #ifdef WITH_OPENMP auto start_sor = now(); sor_coupled_slow_but_readable(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // slower but parallelized - calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); + // calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); // #else // sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // #endif @@ -226,16 +226,16 @@ namespace OFC { cu::flowUpdate( uu->c1, vv->c1, wx->c1, wy->c1, du->c1, dv->c1, height, width, stride); - calc_print_elapsed(("RefLevelOF " + iterStr + " flow update").c_str(), start_flow_update); + // calc_print_elapsed(("RefLevelOF " + iterStr + " flow update").c_str(), start_flow_update); - calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); + // calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); } // add flow increment to current flow auto start_increment_flow = now(); memcpy(wx->c1,uu->c1,uu->stride*uu->height*sizeof(float)); memcpy(wy->c1,vv->c1,vv->stride*vv->height*sizeof(float)); - calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); + // calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); // free memory auto start_cleanup = now(); @@ -249,7 +249,7 @@ namespace OFC { color_image_delete(w_im2); color_image_delete(Ix); color_image_delete(Iy); color_image_delete(Iz); color_image_delete(Ixx); color_image_delete(Ixy); color_image_delete(Iyy); color_image_delete(Ixz); color_image_delete(Iyz); - calc_print_elapsed("RefLevelOF cleanup", start_cleanup); + // calc_print_elapsed("RefLevelOF cleanup", start_cleanup); } diff --git a/src/run_dense.cpp b/src/run_dense.cpp index ac3bdd5..cb9aaae 100644 --- a/src/run_dense.cpp +++ b/src/run_dense.cpp @@ -278,10 +278,15 @@ int main( int argc, char** argv ) { // Run main optical flow / depth algorithm float scale_fact = pow(2, op.finest_scale); - cv::Mat flow_mat(iparams.height / scale_fact , iparams.width / scale_fact, CV_32FC2); // Optical Flow + float* outflow; + checkCudaErrors( + cudaHostAlloc((void**) &(outflow), 2 * iparams.height / scale_fact + * iparams.width / scale_fact * sizeof(float), cudaHostAllocMapped) ); - ofc.calc(I0, I1, iparams, nullptr, (float*) flow_mat.data); + ofc.calc(I0, I1, iparams, nullptr, outflow); + cv::Mat flow_mat(iparams.height / scale_fact , iparams.width / scale_fact, + CV_32FC2, outflow); if (op.verbosity > 1) gettimeofday(&start_time, NULL);