diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 794ec56..d817b29 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -71,7 +71,7 @@ set(CODEFILES FDF1.0.1/opticalflow_aux.c FDF1.0.1/solver.c) -# GrayScale, Optical Flow +# RGB, Optical Flow cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS}) set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1") set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image @@ -87,4 +87,3 @@ set(SANDBOX_FILES sandbox/sandbox.cpp) cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) target_link_libraries(sandbox ${OpenCV_LIBS}) - diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp index 80c8ab8..03bf9b1 100644 --- a/src/kernels/pyramid.cpp +++ b/src/kernels/pyramid.cpp @@ -75,16 +75,11 @@ namespace cu { // Allocate device memory (to account for padding too auto start_cuda_malloc = now(); Npp32f *pDeviceIx, *pDeviceIy; - Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy; Npp32f *pDeviceTmp, *pDeviceKernel; checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) ); checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI, padWidth * padHeight * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) ); - checkCudaErrors( cudaMalloc((void**) &pDeviceTmp, width * height * elemSize) ); checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) ); @@ -142,44 +137,17 @@ namespace cu { NPP_CHECK_NPP( nppiCopyReplicateBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, - pDevicePaddedI, nDstStep, oPadSize, padding, padding) ); + Is[0], nDstStep, oPadSize, padding, padding) ); // Pad dx, dy - checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) ); - checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIx, nSrcStep, oSize, - pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) ); + Ixs[0], nDstStep, oPadSize, padding, padding, PAD_VAL) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIy, nSrcStep, oSize, - pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) ); - - //////////////////////////////////////////////////////////////////////////////////////////////// - // Copy Is[0] I, dx, dy - //////////////////////////////////////////////////////////////////////////////////////////////// - - auto start_cp_Is0 = now(); - Is[0] = new float[oPadSize.height * oPadSize.width * channels]; - checkCudaErrors( - cudaMemcpy(Is[0], pDevicePaddedI, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0); - - auto start_cp_dx = now(); - Ixs[0] = new float[oPadSize.height * oPadSize.width * channels]; - checkCudaErrors( - cudaMemcpy(Ixs[0], pDevicePaddedIx, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx); - - auto start_cp_dy = now(); - Iys[0] = new float[oPadSize.height * oPadSize.width * channels]; - checkCudaErrors( - cudaMemcpy(Iys[0], pDevicePaddedIy, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy); + Iys[0], nDstStep, oPadSize, padding, padding, PAD_VAL) ); //////////////////////////////////////////////////////////////////////////////////////////////// @@ -241,7 +209,7 @@ namespace cu { pDeviceI, nSrcStep, oSize, oOffset, pDeviceIy, nSrcStep, oROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType) - ); + ); compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy); ////////////////////////////////////////////////////////////////////////////////////////////// @@ -257,48 +225,23 @@ namespace cu { NPP_CHECK_NPP( nppiCopyReplicateBorder_32f_C3R ( pDeviceI, nSrcStep, oSize, - pDevicePaddedI, nDstStep, oPadSize, padding, padding) ); + Is[i], nDstStep, oPadSize, padding, padding) ); // Pad dx, dy - checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) ); - checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIx, nSrcStep, oSize, - pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) ); + Ixs[i], nDstStep, oPadSize, padding, padding, PAD_VAL) ); NPP_CHECK_NPP( nppiCopyConstBorder_32f_C3R ( pDeviceIy, nSrcStep, oSize, - pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) ); - - // Allocate host destinations - auto start_host_alloc = now(); - Is[i] = new float[oPadSize.width * oPadSize.height * channels]; - Ixs[i] = new float[oPadSize.width * oPadSize.height * channels]; - Iys[i] = new float[oPadSize.width * oPadSize.height * channels]; - compute_time += calc_print_elapsed("host alloc", start_host_alloc); - - // Copy over data - auto start_cp = now(); - checkCudaErrors( - cudaMemcpy(Is[i], pDevicePaddedI, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - checkCudaErrors( - cudaMemcpy(Ixs[i], pDevicePaddedIx, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - checkCudaErrors( - cudaMemcpy(Iys[i], pDevicePaddedIy, - oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) ); - compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp); + Iys[i], nDstStep, oPadSize, padding, padding, PAD_VAL) ); } // Clean up cudaFree(pDeviceIx); cudaFree(pDeviceIy); - cudaFree(pDevicePaddedI); - cudaFree(pDevicePaddedIx); - cudaFree(pDevicePaddedIy); cudaFree(pDeviceTmp); cudaFree(pDeviceKernel); @@ -307,4 +250,3 @@ namespace cu { } } - diff --git a/src/oflow.cpp b/src/oflow.cpp index 18f6719..7050848 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -38,7 +38,10 @@ using namespace timer; namespace OFC { - OFClass::OFClass(opt_params _op) { + OFClass::OFClass(opt_params _op, img_params _iparams) { + + struct timeval tv_start_all, tv_end_all, tv_start_all_global, tv_end_all_global; + if (op.verbosity > 1) gettimeofday(&tv_start_all_global, nullptr); // Parse optimization parameters op = _op; @@ -54,6 +57,13 @@ namespace OFC { op.dr_thresh = 0.95; op.res_thresh = 0.0; + // Initialize cuBLAS + cublasStatus_t stat = cublasCreate(&op.cublasHandle); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + exit(-1); + } + // Allocate scale pyramides I0s = new float*[op.coarsest_scale+1]; I1s = new float*[op.coarsest_scale+1]; @@ -62,8 +72,95 @@ namespace OFC { I1xs = new float*[op.coarsest_scale+1]; I1ys = new float*[op.coarsest_scale+1]; + // Create grids on each scale + if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr); + + + grid.resize(op.n_scales); + flow.resize(op.n_scales); + iparams.resize(op.n_scales); + for (int sl = op.coarsest_scale; sl >= 0; --sl) { + + int i = sl - op.finest_scale; + + float scale_fact = pow(2, -sl); // scaling factor at current scale + if (i >= 0) { + iparams[i].scale_fact = scale_fact; + iparams[i].height = _iparams.height * scale_fact; + iparams[i].width = _iparams.width * scale_fact; + iparams[i].padding = _iparams.padding; + iparams[i].l_bound = -(float) op.patch_size / 2; + iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2); + iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2); + iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding; + iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding; + iparams[i].curr_lvl = sl; + + flow[i] = new float[2 * iparams[i].width * iparams[i].height]; + grid[i] = new OFC::PatGridClass(&(iparams[i]), &op); + } + + int elemSize = 3 * sizeof(float); + int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding; + int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding; + + checkCudaErrors( cudaMalloc((void**) &I0s[sl], padWidth * padHeight * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &I0xs[sl], padWidth * padHeight * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &I0ys[sl], padWidth * padHeight * elemSize) ); + + checkCudaErrors( cudaMalloc((void**) &I1s[sl], padWidth * padHeight * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &I1xs[sl], padWidth * padHeight * elemSize) ); + checkCudaErrors( cudaMalloc((void**) &I1ys[sl], padWidth * padHeight * elemSize) ); + } + + // Timing, Grid memory allocation + if (op.verbosity>1) { + + gettimeofday(&tv_end_all, nullptr); + double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f; + printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst); + + } + + // Timing, Setup + if (op.verbosity>1) { + + gettimeofday(&tv_end_all_global, nullptr); + double tt = (tv_end_all_global.tv_sec-tv_start_all_global.tv_sec)*1000.0f + (tv_end_all_global.tv_usec-tv_start_all_global.tv_usec)/1000.0f; + printf("TIME (Setup) (ms): %3g\n", tt); + } + } + OFClass::~OFClass() { + + cublasDestroy(op.cublasHandle); + + for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) { + + delete[] flow[sl - op.finest_scale]; + delete grid[sl - op.finest_scale]; + + } + + for (int i = 0; i <= op.coarsest_scale; i++) { + cudaFree(I0s[i]); + cudaFree(I0xs[i]); + cudaFree(I0ys[i]); + + cudaFree(I1s[i]); + cudaFree(I1xs[i]); + cudaFree(I1ys[i]); + } + + delete I0s; + delete I1s; + delete I0xs; + delete I0ys; + delete I1xs; + delete I1ys; + + } void OFClass::ConstructImgPyramids(img_params iparams) { @@ -121,43 +218,6 @@ namespace OFC { } - if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr); - - - // Create grids on each scale - vector grid(op.n_scales); - vector flow(op.n_scales); - iparams.resize(op.n_scales); - for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) { - - int i = sl - op.finest_scale; - - float scale_fact = pow(2, -sl); // scaling factor at current scale - iparams[i].scale_fact = scale_fact; - iparams[i].height = _iparams.height * scale_fact; - iparams[i].width = _iparams.width * scale_fact; - iparams[i].padding = _iparams.padding; - iparams[i].l_bound = -(float) op.patch_size / 2; - iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2); - iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2); - iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding; - iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding; - iparams[i].curr_lvl = sl; - - flow[i] = new float[2 * iparams[i].width * iparams[i].height]; - grid[i] = new OFC::PatGridClass(&(iparams[i]), &op); - - } - - // Timing, Grid memory allocation - if (op.verbosity>1) { - - gettimeofday(&tv_end_all, nullptr); - double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f; - printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst); - - } - // Main loop; Operate over scales, coarse-to-fine for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) { @@ -236,8 +296,21 @@ namespace OFC { // Variational refinement, (Step 5 in Algorithm 1 of paper) if (op.use_var_ref) { + float* I0H, * I1H; + int elemSize = 3 * sizeof(float); + int size = iparams[ii].width_pad * iparams[ii].height_pad * elemSize; + I0H = (float*) malloc(size); + I1H = (float*) malloc(size); - OFC::VarRefClass var_ref(I0s[sl], I1s[sl], &(iparams[ii]), &op, out_ptr); + checkCudaErrors( + cudaMemcpy(I0H, I0s[sl], size, cudaMemcpyDeviceToHost) ); + checkCudaErrors( + cudaMemcpy(I1H, I1s[sl], size, cudaMemcpyDeviceToHost) ); + + OFC::VarRefClass var_ref(I0H, I1H, &(iparams[ii]), &op, out_ptr); + + delete I0H; + delete I1H; } @@ -254,15 +327,6 @@ namespace OFC { } - // Clean up - for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) { - - delete[] flow[sl - op.finest_scale]; - delete grid[sl - op.finest_scale]; - - } - - // Timing, total algorithm run-time if (op.verbosity > 0) { diff --git a/src/oflow.h b/src/oflow.h index ecd2199..d924f1a 100644 --- a/src/oflow.h +++ b/src/oflow.h @@ -10,77 +10,20 @@ #include #include #include +#include "params.h" +#include "patchgrid.h" using std::cout; using std::endl; namespace OFC { - typedef __v4sf v4sf; - - - typedef struct { - int width; // image width, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height - int height; // image height, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height - int padding; // image padding in pixels at all sides, images padded with replicated border, gradients padded with zero, ADD THIS ONLY WHEN ADDRESSING THE IMAGE OR GRADIENT - float l_bound; // lower bound for valid image region, pre-compute for image padding to avoid border check - float u_bound_width; // upper width bound for valid image region, pre-compute for image padding to avoid border check - float u_bound_height; // upper height bound for valid image region, pre-compute for image padding to avoid border check - int width_pad; // width + 2 * padding - int height_pad; // height + 2 * padding - float scale_fact; // scaling factor at current scale - int curr_lvl; // current level - } img_params ; - - - typedef struct { - // Explicitly set parameters: - int coarsest_scale; - int finest_scale; - int patch_size; - float patch_stride; - bool use_mean_normalization; - // Termination - int grad_descent_iter; - float dp_thresh; - float dr_thresh; - float res_thresh; - // Verbosity, 0: plot nothing, 1: final internal timing 2: complete iteration timing, (UNCOMMENTED -> 3: Display flow scales, 4: Display flow scale iterations) - int verbosity; - bool use_var_ref; - int var_ref_iter; - float var_ref_alpha; - float var_ref_gamma; - float var_ref_delta; - float var_ref_sor_weight; // Successive-over-relaxation weight - - // Automatically set parameters / fixed parameters - float outlier_thresh; // displacement threshold (in px) before a patch is flagged as outlier - int steps; // horizontal and vertical distance (in px) between patch centers - int n_vals; // number of points in patch (=p_samp_s*p_samp_s) - int n_scales; // total number of scales - float min_errval = 2.0f; // 1/max(this, error) for pixel averaging weight - float norm_outlier = 5.0f; // norm error threshold for huber norm - - // cuBLAS - cublasHandle_t cublasHandle; - - // Helper variables - v4sf zero = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f}; - v4sf negzero = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f}; - v4sf half = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f}; - v4sf ones = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f}; - v4sf twos = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f}; - v4sf fours = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f}; - v4sf norm_outlier_tmpbsq; - v4sf norm_outlier_tmp2bsq; - v4sf norm_outlier_tmp4bsq; - } opt_params; - class OFClass { public: - OFClass(opt_params _op); + OFClass(opt_params _op, img_params _i_params); + ~OFClass(); + void calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow); private: @@ -94,6 +37,8 @@ namespace OFC { opt_params op; // Struct for optimization parameters std::vector iparams; // Struct (for each scale) for image parameter + std::vector grid; + std::vector flow; }; } diff --git a/src/params.h b/src/params.h new file mode 100644 index 0000000..cd51f9c --- /dev/null +++ b/src/params.h @@ -0,0 +1,71 @@ +// Holds all of the paramaters structures + +#ifndef PARAMS_HEADER +#define PARAMS_HEADER + +namespace OFC { + + typedef __v4sf v4sf; + + + typedef struct { + int width; // image width, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height + int height; // image height, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height + int padding; // image padding in pixels at all sides, images padded with replicated border, gradients padded with zero, ADD THIS ONLY WHEN ADDRESSING THE IMAGE OR GRADIENT + float l_bound; // lower bound for valid image region, pre-compute for image padding to avoid border check + float u_bound_width; // upper width bound for valid image region, pre-compute for image padding to avoid border check + float u_bound_height; // upper height bound for valid image region, pre-compute for image padding to avoid border check + int width_pad; // width + 2 * padding + int height_pad; // height + 2 * padding + float scale_fact; // scaling factor at current scale + int curr_lvl; // current level + } img_params ; + + + typedef struct { + // Explicitly set parameters: + int coarsest_scale; + int finest_scale; + int patch_size; + float patch_stride; + bool use_mean_normalization; + // Termination + int grad_descent_iter; + float dp_thresh; + float dr_thresh; + float res_thresh; + // Verbosity, 0: plot nothing, 1: final internal timing 2: complete iteration timing, (UNCOMMENTED -> 3: Display flow scales, 4: Display flow scale iterations) + int verbosity; + bool use_var_ref; + int var_ref_iter; + float var_ref_alpha; + float var_ref_gamma; + float var_ref_delta; + float var_ref_sor_weight; // Successive-over-relaxation weight + + // Automatically set parameters / fixed parameters + float outlier_thresh; // displacement threshold (in px) before a patch is flagged as outlier + int steps; // horizontal and vertical distance (in px) between patch centers + int n_vals; // number of points in patch (=p_samp_s*p_samp_s) + int n_scales; // total number of scales + float min_errval = 2.0f; // 1/max(this, error) for pixel averaging weight + float norm_outlier = 5.0f; // norm error threshold for huber norm + + // cuBLAS + cublasHandle_t cublasHandle; + + // Helper variables + v4sf zero = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f}; + v4sf negzero = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f}; + v4sf half = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f}; + v4sf ones = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f}; + v4sf twos = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f}; + v4sf fours = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f}; + v4sf norm_outlier_tmpbsq; + v4sf norm_outlier_tmp2bsq; + v4sf norm_outlier_tmp4bsq; + } opt_params; + +} + +#endif /* PARAMS_HEADER */ diff --git a/src/patch.cpp b/src/patch.cpp index 6cb6e0d..f3a8a41 100644 --- a/src/patch.cpp +++ b/src/patch.cpp @@ -295,7 +295,7 @@ namespace OFC { int lb = -op->patch_size / 2; int patch_offset = 3 * ((x + lb) + (y + lb) * i_params->width_pad); - float* pDeviceI0, *pDeviceI0x, *pDeviceI0y; + /*float* pDeviceI0, *pDeviceI0x, *pDeviceI0y; int size = i_params->width_pad * i_params->height_pad * 3; checkCudaErrors( cudaMalloc ((void**) &pDeviceI0, size * sizeof(float)) ); @@ -308,20 +308,20 @@ namespace OFC { CUBLAS_CHECK ( cublasSetVector(size, sizeof(float), I0x, 1, pDeviceI0x, 1) ); CUBLAS_CHECK ( - cublasSetVector(size, sizeof(float), I0y, 1, pDeviceI0y, 1) ); + cublasSetVector(size, sizeof(float), I0y, 1, pDeviceI0y, 1) );*/ // Extract patch checkCudaErrors( cudaMemcpy2D (pDevicePatch, 3 * op->patch_size * sizeof(float), - pDeviceI0 + patch_offset, 3 * i_params->width_pad * sizeof(float), + I0 + patch_offset, 3 * i_params->width_pad * sizeof(float), 3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) ); checkCudaErrors( cudaMemcpy2D (pDevicePatchX, 3 * op->patch_size * sizeof(float), - pDeviceI0x + patch_offset, 3 * i_params->width_pad * sizeof(float), + I0x + patch_offset, 3 * i_params->width_pad * sizeof(float), 3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) ); checkCudaErrors( cudaMemcpy2D (pDevicePatchY, 3 * op->patch_size * sizeof(float), - pDeviceI0y + patch_offset, 3 * i_params->width_pad * sizeof(float), + I0y + patch_offset, 3 * i_params->width_pad * sizeof(float), 3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) ); // Mean Normalization @@ -369,9 +369,7 @@ namespace OFC { // Mean Normalization if (op->use_mean_normalization > 0) { - cu::normalizeMean(pDeviceRawDiff, op->cublasHandle, op->patch_size); - } } diff --git a/src/patch.h b/src/patch.h index 72b479b..1ca8ca0 100644 --- a/src/patch.h +++ b/src/patch.h @@ -4,7 +4,11 @@ #ifndef PAT_HEADER #define PAT_HEADER -#include "oflow.h" // For camera intrinsic and opt. parameter struct +#include +#include +#include + +#include "params.h" // For camera intrinsic and opt. parameter struct namespace OFC { @@ -12,10 +16,6 @@ namespace OFC { bool has_converged; bool has_opt_started; - // reference/template patch - Eigen::Matrix raw_diff; // image error to reference image - Eigen::Matrix cost_diff; // absolute error image - Eigen::Matrix hessian; // Hessian for optimization Eigen::Vector2f p_org, p_cur, delta_p; // point position, displacement to starting position, iteration update @@ -54,7 +54,6 @@ namespace OFC { inline const bool HasOptStarted() const { return p_state->has_opt_started; } inline const Eigen::Vector2f GetTargMidpoint() const { return p_state->midpoint_cur; } inline const bool IsValid() const { return !p_state->invalid; } - inline const float * GetCostDiffPtr() const { return (float*) p_state->cost_diff.data(); } inline float * GetDeviceCostDiffPtr() const { return (float*) pDeviceCostDiff; } diff --git a/src/patchgrid.h b/src/patchgrid.h index 16e5424..9045111 100644 --- a/src/patchgrid.h +++ b/src/patchgrid.h @@ -5,7 +5,7 @@ #define PATGRID_HEADER #include "patch.h" -#include "oflow.h" // For camera intrinsic and opt. parameter struct +#include "params.h" // For camera intrinsic and opt. parameter struct namespace OFC { diff --git a/src/run_dense.cpp b/src/run_dense.cpp index 1af3c44..a6655d9 100644 --- a/src/run_dense.cpp +++ b/src/run_dense.cpp @@ -108,6 +108,7 @@ int AutoFirstScaleSelect(int imgwidth, int fratio, int patchsize) { } + int main( int argc, char** argv ) { // Warmup GPU @@ -141,7 +142,7 @@ int main( int argc, char** argv ) { int channels = 3; int elemSize = channels * sizeof(Npp32f); - /* MEMCOPY to CUDA */ + /* memcpy to cuda */ Npp32f* I0, *I1; auto start_cuda_malloc = now(); checkCudaErrors( cudaMalloc((void**) &I0, width_org * height_org * elemSize) ); @@ -159,12 +160,6 @@ int main( int argc, char** argv ) { // Parse rest of parameters opt_params op; - cublasStatus_t stat = cublasCreate(&op.cublasHandle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(-1); - } - if (argc <= 5) { op.use_mean_normalization = true; @@ -274,7 +269,7 @@ int main( int argc, char** argv ) { // Create Optical Flow object - OFClass ofc(op); + OFClass ofc(op, iparams); // Run main optical flow / depth algorithm float scale_fact = pow(2, op.finest_scale); @@ -282,7 +277,6 @@ int main( int argc, char** argv ) { ofc.calc(I0, I1, iparams, nullptr, (float*) flow_mat.data); - cublasDestroy(op.cublasHandle); if (op.verbosity > 1) gettimeofday(&start_time, NULL);