From 6909adffe9fde066df4701b56a5d2b2c5e832684 Mon Sep 17 00:00:00 2001 From: Ashwin Sekar Date: Thu, 11 May 2017 23:08:02 +0000 Subject: [PATCH] Finish pipeline opti time is super fast! (CORRECT ON 4K) --- src/CMakeLists.txt | 2 +- src/kernels/optimize.cu | 25 ++- src/kernels/optimize.h | 2 +- src/oflow.cpp | 3 - src/patch.cpp | 419 ---------------------------------------- src/patch.h | 103 ---------- src/patchgrid.cpp | 157 ++------------- src/patchgrid.h | 16 +- 8 files changed, 46 insertions(+), 681 deletions(-) delete mode 100644 src/patch.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2a3d1a..c1b4731 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,7 +95,7 @@ set(COMMON set(CODEFILES run_dense.cpp oflow.cpp - patch.cpp + # patch.cpp patchgrid.cpp refine_variational.cpp FDF1.0.1/image.c diff --git a/src/kernels/optimize.cu b/src/kernels/optimize.cu index 8eb3f39..41a1f51 100644 --- a/src/kernels/optimize.cu +++ b/src/kernels/optimize.cu @@ -100,7 +100,7 @@ __global__ void kernelInterpolateAndComputeErr( float** tempXX, float** tempYY, int n_patches, int padding, int patch_size, int width_pad, int gd_iter, float res_thresh, float dp_thresh, - float dr_thresh, float out_thresh, int lb, int ubw, int ubh) { + float dr_thresh, float out_thresh, int lb, int ubw, int ubh, bool proj) { int patchId = blockIdx.x; int tid = threadIdx.x; @@ -124,7 +124,8 @@ __global__ void kernelInterpolateAndComputeErr( // Interpolate the patch - float pos0, pos1, pos2, pos3, resid0, resid1, w0, w1, w2, w3; + int pos0, pos1, pos2, pos3; + float resid0, resid1, w0, w1, w2, w3; // Compute the bilinear weight vector, for patch without orientation/scale change // weight vector is constant for all pixels @@ -141,6 +142,9 @@ __global__ void kernelInterpolateAndComputeErr( w2 = resid0 * (1- resid1); w3 = (1 - resid0) * (1 - resid1); + // if (tid == 0) { + // printf("Weights %.2f, %.2f, %.2f, %.2f\n", w0, w1, w2, w3); + // } pos0 += padding; pos1 += padding; @@ -150,7 +154,8 @@ __global__ void kernelInterpolateAndComputeErr( for (int i = tid, j = starty; i < patch_size * patch_size * 3; - i += 3 * patch_size, j += 3 * width_pad) { + i += 3 * patch_size, j++) { + const float* img_e = I1 + x; const float* img_a = img_e + j * width_pad * 3; @@ -158,6 +163,9 @@ __global__ void kernelInterpolateAndComputeErr( const float* img_b = img_a - 3; const float* img_d = img_c - 3; raw[i] = w0 * (*img_a) + w1 * (*img_b) + w2 * (*img_c) + w3 * (*img_d); + // if (tid == 0) { + // printf("img_d %.2f\n", *img_d); + // } } @@ -172,7 +180,11 @@ __global__ void kernelInterpolateAndComputeErr( for (int i = 0; i < patch_size * patch_size * 3; i++) { mean += raw[i]; } - mean /= patch_size * patch_size * 3; + // printf("sum %.2f\n", mean); + mean /= (patch_size * patch_size * 3); + + // printf("mean %.2f\n", mean); + // printf("pos0, pos1, (%d, %d)\n", pos0, pos1); } @@ -194,6 +206,7 @@ __global__ void kernelInterpolateAndComputeErr( c += cost[i]; } states[patchId].cost = c; + // printf("cost %.2f\n", c); // Check convergence @@ -237,7 +250,7 @@ namespace cu { float** raw_diff, float** costs, float** patches, float** patchXs, float** patchYs, float** tempXX, float** tempYY, const float* I1, int n_patches, const opt_params* op, - const img_params* i_params) { + const img_params* i_params, bool notFirst) { int nBlocks = n_patches; int nThreadsPerBlock = 3 * op->patch_size; @@ -248,7 +261,7 @@ namespace cu { i_params->padding, op->patch_size, i_params->width_pad, op->grad_descent_iter, op->res_thresh, op->dp_thresh, op->dr_thresh, op->outlier_thresh, i_params->l_bound, i_params->u_bound_width, - i_params->u_bound_height); + i_params->u_bound_height, notFirst); } diff --git a/src/kernels/optimize.h b/src/kernels/optimize.h index 040d34f..3e85dfe 100644 --- a/src/kernels/optimize.h +++ b/src/kernels/optimize.h @@ -33,7 +33,7 @@ namespace cu { float** raw_diff, float** costs, float** patches, float** patchXs, float** patchYs, float** tempXX, float** tempYY, const float* I1, int n_patches, const opt_params* op, - const img_params* i_params); + const img_params* i_params, bool project); } diff --git a/src/oflow.cpp b/src/oflow.cpp index f4b68f9..573bcd6 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -280,9 +280,6 @@ namespace OFC { // Dense Inverse Search. (Step 3 in Algorithm 1 of paper) // Parallel over all patches - grid[ii]->OptimizeSetup(); - /*while (!grid[ii]->AllConverged()) - grid[ii]->OptimizeStep();*/ grid[ii]->Optimize(); // Timing, DIS diff --git a/src/patch.cpp b/src/patch.cpp deleted file mode 100644 index 925c358..0000000 --- a/src/patch.cpp +++ /dev/null @@ -1,419 +0,0 @@ -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include "common/cuda_helper.h" -#include "common/Exceptions.h" -#include "common/timer.h" - -#include "patch.h" -#include "kernels/interpolate.h" -#include "kernels/extract.h" - - -using std::cout; -using std::endl; -using std::vector; - -namespace OFC { - - - PatClass::PatClass( - const img_params* _i_params, - const opt_params* _op, - const int _patch_id) - : - i_params(_i_params), - op(_op), - patch_id(_patch_id) { - - p_state = new patch_state; - - patch.resize(op->n_vals,1); - patch_x.resize(op->n_vals,1); - patch_y.resize(op->n_vals,1); - - /*checkCudaErrors( - cudaMalloc ((void**) &pDevicePatch, patch.size() * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &pDevicePatchX, patch_x.size() * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &pDevicePatchY, patch_y.size() * sizeof(float)) );*/ - /*checkCudaErrors( - cudaMalloc ((void**) &pDeviceRawDiff, patch.size() * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &pDeviceCostDiff, patch.size() * sizeof(float)) );*/ - checkCudaErrors( - cudaMalloc ((void**) &pDeviceWeights, 4 * sizeof(float)) ); - - - // Timing - hessianTime = 0; - projectionTime = 0; - costTime = 0; - interpolateTime = 0; - meanTime = 0; - - hessianCalls = 0; - projectionCalls = 0; - costCalls = 0; - interpolateCalls = 0; - meanCalls = 0; - - } - - - PatClass::~PatClass() { - - /*cudaFree(pDevicePatch); - cudaFree(pDevicePatchX); - cudaFree(pDevicePatchY);*/ - - // cudaFree(pDeviceRawDiff); - // cudaFree(pDeviceCostDiff); - cudaFree(pDeviceWeights); - - delete p_state; - - } - - // void PatClass::InitializePatch(const float * _I0, - // const float * _I0x, const float * _I0y, const Eigen::Vector2f _midpoint) { - void PatClass::InitializePatch(float * _patch, - float * _patchx, float * _patchy, float H00, float H01, float H11, - const Eigen::Vector2f _midpoint) { - - // I0 = _I0; - // I0x = _I0x; - // I0y = _I0y; - - pDevicePatch = _patch; - pDevicePatchX = _patchx; - pDevicePatchY = _patchy; - - midpoint = _midpoint; - - ResetPatchState(); - - p_state->hessian(0,0) = H00; - p_state->hessian(0,1) = H01; - p_state->hessian(1,0) = p_state->hessian(0,1); - p_state->hessian(1,1) = H11; - - //ExtractPatch(); - // ComputeHessian(H00, H01, H11); - - } - - void PatClass::ComputeHessian(float H00, float H01, float H11) { - - /*gettimeofday(&tv_start, nullptr); - - CUBLAS_CHECK ( - cublasSdot(op->cublasHandle, patch.size(), - pDevicePatchX, 1, pDevicePatchX, 1, &(p_state->hessian(0,0))) ); - CUBLAS_CHECK ( - cublasSdot(op->cublasHandle, patch.size(), - pDevicePatchX, 1, pDevicePatchY, 1, &(p_state->hessian(0,1))) ); - CUBLAS_CHECK ( - cublasSdot(op->cublasHandle, patch.size(), - pDevicePatchY, 1, pDevicePatchY, 1, &(p_state->hessian(1,1))) ); - - p_state->hessian(1,0) = p_state->hessian(0,1); - - gettimeofday(&tv_end, nullptr); - - hessianTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - hessianCalls++;*/ - - p_state->hessian(0,0) = H00; - p_state->hessian(0,1) = H01; - p_state->hessian(1,0) = p_state->hessian(0,1); - p_state->hessian(1,1) = H11; - - // If not invertible adjust values - if (p_state->hessian.determinant() == 0) { - p_state->hessian(0,0) += 1e-10; - p_state->hessian(1,1) += 1e-10; - } - - } - - void PatClass::SetTargetImage(const float * _I1) { - - pDeviceI = _I1; - - ResetPatchState(); - - } - - void PatClass::ResetPatchState() { - - p_state->has_converged = 0; - p_state->has_opt_started = 0; - - p_state->midpoint_org = midpoint; - p_state->midpoint_cur = midpoint; - - p_state->p_org.setZero(); - p_state->p_cur.setZero(); - p_state->delta_p.setZero(); - - p_state->delta_p_sq_norm = 1e-10; - p_state->delta_p_sq_norm_init = 1e-10; - p_state->mares = 1e20; - p_state->mares_old = 1e20; - p_state->count = 0; - p_state->invalid = false; - - p_state->cost = 0.0; - } - - void PatClass::OptimizeStart(const Eigen::Vector2f p_prev, bool conv) { - - p_state->p_org = p_prev; - p_state->p_cur = p_prev; - - UpdateMidpoint(); - - // save starting location, only needed for outlier check - p_state->midpoint_org = p_state->midpoint_cur; - - //Check if initial position is already invalid - if (p_state->midpoint_cur[0] < i_params->l_bound - || p_state->midpoint_cur[1] < i_params->l_bound - || p_state->midpoint_cur[0] > i_params->u_bound_width - || p_state->midpoint_cur[1] > i_params->u_bound_height) { - - p_state->has_converged=1; - p_state->has_opt_started=1; - - } else { - - p_state->count = 0; // reset iteration counter - p_state->delta_p_sq_norm = 1e-10; - p_state->delta_p_sq_norm_init = 1e-10; // set to arbitrary low value, s.t. that loop condition is definitely true on first iteration - p_state->mares = 1e5; // mean absolute residual - p_state->mares_old = 1e20; // for rate of change, keep mares from last iteration in here. Set high so that loop condition is definitely true on first iteration - p_state->has_converged=0; - - // OptimizeComputeErrImg(false, c); - p_state->has_converged = conv; - - p_state->has_opt_started = 1; - p_state->invalid = false; - - } - - } - - void PatClass::OptimizeIter() { - - // Do one optimize iteration - - if (!p_state->has_converged) { - - p_state->count++; - - // Projection onto sd_images - gettimeofday(&tv_start, nullptr); - CUBLAS_CHECK ( - cublasSdot(op->cublasHandle, patch.size(), - pDevicePatchX, 1, pDeviceRawDiff, 1, &(p_state->delta_p[0])) ); - CUBLAS_CHECK ( - cublasSdot(op->cublasHandle, patch.size(), - pDevicePatchY, 1, pDeviceRawDiff, 1, &(p_state->delta_p[1])) ); - gettimeofday(&tv_end, nullptr); - projectionTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - projectionCalls++; - - - p_state->delta_p = p_state->hessian.llt().solve(p_state->delta_p); // solve linear system - - p_state->p_cur -= p_state->delta_p; // update flow vector - - // compute patch locations based on new parameter vector - UpdateMidpoint(); - - // check if patch(es) moved too far from starting location, if yes, stop iteration and reset to starting location - // check if query patch moved more than >padval from starting location -> most likely outlier - if ((p_state->midpoint_org - p_state->midpoint_cur).norm() > op->outlier_thresh - || p_state->midpoint_cur[0] < i_params->l_bound - || p_state->midpoint_cur[1] < i_params->l_bound - || p_state->midpoint_cur[0] > i_params->u_bound_width - || p_state->midpoint_cur[1] > i_params->u_bound_height) { - - // Reset because this is an outlier - p_state->p_cur = p_state->p_org; - UpdateMidpoint(); - p_state->has_converged=1; - p_state->has_opt_started=1; - - } - - OptimizeComputeErrImg(true, -0.0); - - } - - } - - inline void PatClass::UpdateMidpoint() { - - p_state->midpoint_cur = midpoint + p_state->p_cur; - - } - - void PatClass::ComputeCostErr() { - - // L2-Norm - - const float alpha = -1.0; - - gettimeofday(&tv_start, nullptr); - // raw = raw - patch - CUBLAS_CHECK ( - cublasSaxpy(op->cublasHandle, patch.size(), &alpha, - pDevicePatch, 1, pDeviceRawDiff, 1) ); - - // Element-wise multiplication - CUBLAS_CHECK ( - cublasSdgmm(op->cublasHandle, CUBLAS_SIDE_RIGHT, - 1, patch.size(), pDeviceRawDiff, 1, pDeviceRawDiff, 1, pDeviceCostDiff, 1) ); - - // Sum - CUBLAS_CHECK ( - cublasSasum(op->cublasHandle, patch.size(), - pDeviceCostDiff, 1, &(p_state->cost)) ); - gettimeofday(&tv_end, nullptr); - - costTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - costCalls++; - - } - - void PatClass::OptimizeComputeErrImg(bool interp, float c) { - - if (interp) { - InterpolatePatch(); - ComputeCostErr(); - } else { - p_state->cost = c; - } - - // Compute step norm - p_state->delta_p_sq_norm = p_state->delta_p.squaredNorm(); - if (p_state->count == 1) - p_state->delta_p_sq_norm_init = p_state->delta_p_sq_norm; - - // Check early termination criterions - p_state->mares_old = p_state->mares; - p_state->mares = p_state->cost / op->n_vals; - - if (!((p_state->count < op->grad_descent_iter) & (p_state->mares > op->res_thresh) - & ((p_state->count < op->grad_descent_iter) | (p_state->delta_p_sq_norm / p_state->delta_p_sq_norm_init >= op->dp_thresh)) - & ((p_state->count < op->grad_descent_iter) | (p_state->mares / p_state->mares_old <= op->dr_thresh)))) { - p_state->has_converged = 1; - } - - } - - // Extract patch on integer position, and gradients, No Bilinear interpolation - void PatClass::ExtractPatch() { - - int x = round(midpoint[0]) + i_params->padding; - int y = round(midpoint[1]) + i_params->padding; - - int lb = -op->patch_size / 2; - int patch_offset = (x + lb) + (y + lb) * i_params->width_pad; - - // Extract patch - /*cu::extractPatch(pDevicePatch, pDevicePatchX, pDevicePatchY, - I0, I0x, I0y, patch_offset, op->patch_size, i_params->width_pad);*/ - - gettimeofday(&tv_start, nullptr); - // Mean Normalization - if (op->use_mean_normalization > 0) { - cu::normalizeMean(pDevicePatch, op->patch_size); - } - - gettimeofday(&tv_end, nullptr); - meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - meanCalls++; - - - - } - - // Extract patch on float position with bilinear interpolation, no gradients. - void PatClass::InterpolatePatch() { - - Eigen::Vector2f resid; - Eigen::Vector4f weight; // bilinear weight vector - Eigen::Vector4i pos; - Eigen::Vector2i pos_iter; - - // Compute the bilinear weight vector, for patch without orientation/scale change - // weight vector is constant for all pixels - pos[0] = ceil(p_state->midpoint_cur[0] + .00001f); // ensure rounding up to natural numbers - pos[1] = ceil(p_state->midpoint_cur[1] + .00001f); - pos[2] = floor(p_state->midpoint_cur[0]); - pos[3] = floor(p_state->midpoint_cur[1]); - - resid[0] = p_state->midpoint_cur[0] - (float)pos[2]; - resid[1] = p_state->midpoint_cur[1] - (float)pos[3]; - weight[0] = resid[0] * resid[1]; - weight[1] = (1 - resid[0]) * resid[1]; - weight[2] = resid[0] * (1- resid[1]); - weight[3] = (1 - resid[0]) * (1 - resid[1]); - - pos[0] += i_params->padding; - pos[1] += i_params->padding; - - int lb = -op->patch_size / 2; - int startx = pos[0] + lb; - int starty = pos[1] + lb; - - gettimeofday(&tv_start, nullptr); - checkCudaErrors( - cudaMemcpy(pDeviceWeights, weight.data(), - 4 * sizeof(float), cudaMemcpyHostToDevice) ); - - - cu::interpolatePatch(pDeviceRawDiff, pDeviceI, pDeviceWeights, - i_params->width_pad, starty, startx, op->patch_size); - - gettimeofday(&tv_end, nullptr); - interpolateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - interpolateCalls++; - - gettimeofday(&tv_start, nullptr); - // Mean Normalization - if (op->use_mean_normalization > 0) { - cu::normalizeMean(pDeviceRawDiff, op->patch_size); - } - - gettimeofday(&tv_end, nullptr); - meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - meanCalls++; - - - } - -} diff --git a/src/patch.h b/src/patch.h index 489da3e..a5de5c0 100644 --- a/src/patch.h +++ b/src/patch.h @@ -12,28 +12,6 @@ namespace OFC { - typedef struct { - bool has_converged; - bool has_opt_started; - - Eigen::Matrix hessian; // Hessian for optimization - Eigen::Vector2f p_org, p_cur, delta_p; // point position, displacement to starting position, iteration update - - // start positions, current point position, patch norm - // Eigen::Matrix norm; - Eigen::Vector2f midpoint_cur; - Eigen::Vector2f midpoint_org; - - float delta_p_sq_norm = 1e-10; - float delta_p_sq_norm_init = 1e-10; - float mares = 1e20; // mares: Mean Absolute RESidual - float mares_old = 1e20; - int count = 0; - bool invalid = false; - - float cost = 0.0; - } patch_state; - typedef struct { bool has_converged; bool has_opt_started; @@ -57,87 +35,6 @@ namespace OFC { float cost = 0.0; } dev_patch_state; - - - class PatClass { - - public: - PatClass(const img_params* _i_params, - const opt_params* _op, - const int _patch_id); - - ~PatClass(); - - // void InitializePatch(const float * _I0, const float * _I0x, - // const float * _I0y, const Eigen::Vector2f _midpoint); - void InitializePatch(float * _patch, - float * _patchx, float* _patchy, - float H00, float H01, float H11, - const Eigen::Vector2f _midpoint); - void SetTargetImage(const float * _I1); - - void OptimizeStart(const Eigen::Vector2f p_prev, bool conv); - void OptimizeIter(); - - inline const bool IsConverged() const { return p_state->has_converged; } - inline const bool HasOptStarted() const { return p_state->has_opt_started; } - inline const Eigen::Vector2f GetTargMidpoint() const { return p_state->midpoint_cur; } - inline const bool IsValid() const { return !p_state->invalid; } - inline float * GetDeviceCostDiffPtr() const { return (float*) pDeviceCostDiff; } - inline float * getCostP() { return (float*) pDeviceCostDiff; } - inline float * getRawP() { return (float*) pDeviceRawDiff; } - inline void setRawP(float* raw) { pDeviceRawDiff = raw; } - inline void setCostP(float* cost) { pDeviceCostDiff = cost; } - - - inline const Eigen::Vector2f* GetCurP() const { return &(p_state->p_cur); } - inline const Eigen::Vector2f* GetOrgP() const { return &(p_state->p_org); } - inline const int GetPatchId() const { return patch_id; } - - struct timeval tv_start, tv_end; - double hessianTime, projectionTime, costTime, interpolateTime, meanTime; - int hessianCalls, projectionCalls, costCalls, interpolateCalls, meanCalls; - - private: - - void OptimizeComputeErrImg(bool interp, float cost); - void UpdateMidpoint(); - void ResetPatchState(); - void ComputeHessian(float H00, float H01, float H11); - void ComputeCostErr(); - - // Extract patch on integer position, and gradients, No Bilinear interpolation - void ExtractPatch(); - // Extract patch on float position with bilinear interpolation, no gradients. - void InterpolatePatch(); - - - const float* pDeviceI; - - float* pDevicePatch; - float* pDevicePatchX; - float* pDevicePatchY; - - float* pDeviceRawDiff; - float* pDeviceCostDiff; - float* pDeviceWeights; - - - Eigen::Vector2f midpoint; // reference point location - Eigen::Matrix patch; - Eigen::Matrix patch_x; - Eigen::Matrix patch_y; - - const float * I0, * I0x, * I0y; - - const img_params* i_params; - const opt_params* op; - const int patch_id; - - patch_state * p_state = nullptr; // current patch state - - }; - } #endif /* PAT_HEADER */ diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp index a44226a..87d2e30 100644 --- a/src/patchgrid.cpp +++ b/src/patchgrid.cpp @@ -46,17 +46,11 @@ namespace OFC { n_patches = n_patches_width * n_patches_height; midpoints_ref.resize(n_patches); - p_init.resize(n_patches); - patches.reserve(n_patches); checkCudaErrors( cudaMalloc ((void**) &pDevicePatchStates, n_patches * sizeof(dev_patch_state)) ); pHostDevicePatchStates = new dev_patch_state[n_patches]; - midpointX_host = new float[n_patches]; - midpointY_host = new float[n_patches]; - - int patch_id = 0; for (int x = 0; x < n_patches_width; ++x) { for (int y = 0; y < n_patches_height; ++y) { @@ -64,25 +58,10 @@ namespace OFC { midpoints_ref[i][0] = x * steps + offsetw; midpoints_ref[i][1] = y * steps + offseth; - midpointX_host[i] = x * steps + offsetw; - midpointY_host[i] = y * steps + offseth; - p_init[i].setZero(); - - patches.push_back(new OFC::PatClass(i_params, op, patch_id)); - patch_id++; } } - // Midpoint - checkCudaErrors( - cudaMalloc ((void**) &pDeviceMidpointX, n_patches * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &pDeviceMidpointY, n_patches * sizeof(float)) ); - checkCudaErrors( cudaMemcpy(pDeviceMidpointX, midpointX_host, - n_patches * sizeof(float), cudaMemcpyHostToDevice) ); - checkCudaErrors( cudaMemcpy(pDeviceMidpointY, midpointY_host, - n_patches * sizeof(float), cudaMemcpyHostToDevice) ); // Aggregate flow checkCudaErrors( @@ -205,8 +184,10 @@ namespace OFC { aggregateTime = 0.0; meanTime = 0.0; extractTime = 0.0; + optiTime = 0.0; } + PatGridClass::~PatGridClass() { for (int i = 0; i < n_patches; ++i) { @@ -222,7 +203,6 @@ namespace OFC { cudaFree(pHostDeviceRaws[i]); cudaFree(pHostDeviceCosts[i]); - delete patches[i]; } cudaFree(pDevicePatches); @@ -243,12 +223,6 @@ namespace OFC { delete pHostDeviceTempXY; delete pHostDeviceTempYY; - delete midpointX_host; - delete midpointY_host; - - cudaFree(pDeviceMidpointX); - cudaFree(pDeviceMidpointY); - cudaFree(pDeviceH00); cudaFree(pDeviceH01); cudaFree(pDeviceH11); @@ -263,6 +237,7 @@ namespace OFC { } + void PatGridClass::InitializeGrid(const float * _I0, const float * _I0x, const float * _I0y) { I0 = _I0; @@ -279,74 +254,29 @@ namespace OFC { extractTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - /* float H00, H01, H11; - - for (int i = 0; i < n_patches; ++i) { - checkCudaErrors( - cudaMemcpy(&H00, &(pDevicePatchStates[i].H00), sizeof(float), cudaMemcpyDeviceToHost) ); - checkCudaErrors( - cudaMemcpy(&H01, &(pDevicePatchStates[i].H01), sizeof(float), cudaMemcpyDeviceToHost) ); - checkCudaErrors( - cudaMemcpy(&H11, &(pDevicePatchStates[i].H11), sizeof(float), cudaMemcpyDeviceToHost) ); - - patches[i]->InitializePatch(pHostDevicePatches[i], - pHostDevicePatchXs[i], pHostDevicePatchYs[i], - H00, H01, H11, midpoints_ref[i]); - p_init[i].setZero(); - }*/ - } + void PatGridClass::SetTargetImage(const float * _I1) { I1 = _I1; - /*for (int i = 0; i < n_patches; ++i) { - patches[i]->SetTargetImage(I1); - }*/ - } - void PatGridClass::OptimizeSetup() { - - /*cu::interpolateAndComputeErr(pDevicePatchStates, - pDeviceRaws, pDeviceCosts, pDevicePatches, I1, - n_patches, op, i_params, false); - - bool conv; - for (int i = 0; i < n_patches; ++i) { - patches[i]->setRawP(pHostDeviceRaws[i]); - patches[i]->setCostP(pHostDeviceCosts[i]); - checkCudaErrors( - cudaMemcpy(&conv, &(pDevicePatchStates[i].has_converged), sizeof(bool), - cudaMemcpyDeviceToHost) ); - - patches[i]->OptimizeStart(p_init[i], conv); - }*/ - - } void PatGridClass::Optimize() { + gettimeofday(&tv_start, nullptr); + cu::interpolateAndComputeErr(pDevicePatchStates, pDeviceRaws, pDeviceCosts, pDevicePatches, pDevicePatchXs, pDevicePatchYs, pDeviceTempXX, pDeviceTempYY, - I1, n_patches, op, i_params); - // #pragma omp parallel for schedule(static) - /*for (int i = 0; i < n_patches; ++i) { - patches[i]->OptimizeIter(); - }*/ + I1, n_patches, op, i_params, true); + gettimeofday(&tv_end, nullptr); + optiTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; } - bool PatGridClass::AllConverged() { - - for (int i = 0; i < n_patches; ++i) { - if (!patches[i]->IsConverged()) - return false; - } - - return true; - } void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) { @@ -360,24 +290,10 @@ namespace OFC { cu::initCoarserOF(devFlowPrev, pDevicePatchStates, n_patches, i_params); - for (int ip = 0; ip < n_patches; ++ip) { - - // int x = floor(midpoints_ref[ip][0] / 2); - // int y = floor(midpoints_ref[ip][1] / 2); - // int i = y * (i_params->width / 2) + x; - - // p_init[ip](0) = flow_prev[2 * i] * 2; - // p_init[ip](1) = flow_prev[2 * i + 1] * 2; - - /*checkCudaErrors( cudaMemcpy(&(p_init[ip](0)), &(pDevicePatchStates[ip].p_orgx), - sizeof(float), cudaMemcpyDeviceToHost) ); - checkCudaErrors( cudaMemcpy(&(p_init[ip](1)), &(pDevicePatchStates[ip].p_orgy), - sizeof(float), cudaMemcpyDeviceToHost) );*/ - - } } + void PatGridClass::AggregateFlowDense(float *flowout) { /*bool isValid[n_patches]; @@ -430,16 +346,14 @@ namespace OFC { op, i_params);*/ for (int ip = 0; ip < n_patches; ++ip) { - //const Eigen::Vector2f* fl = patches[ip]->GetCurP(); // flow displacement of this patch + float* pweight = pHostDeviceCosts[ip]; // use image error as weight - float* pweight = pHostDeviceCosts[ip]; // use image error as weight - - cu::densifyPatch( - pweight, pDeviceFlowOut, pDeviceWeights, - pDevicePatchStates, ip, - midpoints_ref[ip][0], midpoints_ref[ip][1], - i_params->width, i_params->height, - op->patch_size, op->min_errval); + cu::densifyPatch( + pweight, pDeviceFlowOut, pDeviceWeights, + pDevicePatchStates, ip, + midpoints_ref[ip][0], midpoints_ref[ip][1], + i_params->width, i_params->height, + op->patch_size, op->min_errval); } @@ -460,48 +374,17 @@ namespace OFC { (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; } - void PatGridClass::printTimings() { - double tot_hessianTime = 0, - tot_projectionTime = 0, tot_costTime = 0, - tot_interpolateTime = 0, tot_meanTime = 0; - int tot_hessianCalls = 0, - tot_projectionCalls = 0, tot_costCalls = 0, - tot_interpolateCalls = 0, tot_meanCalls = 0; - - for (auto & element : patches) { - tot_hessianTime += element->hessianTime; - tot_projectionTime += element->projectionTime; - tot_costTime += element->costTime; - tot_interpolateTime += element->interpolateTime; - tot_meanTime += element->meanTime; - - tot_hessianCalls += element->hessianCalls; - tot_projectionCalls += element->projectionCalls; - tot_costCalls += element->costCalls; - tot_interpolateCalls += element->interpolateCalls; - tot_meanCalls += element->meanCalls; - } + void PatGridClass::printTimings() { cout << endl; cout << "===============Timings (ms)===============" << endl; - cout << "Avg grad descent iterations: " << float(tot_costCalls) / float(n_patches) << endl; - cout << "[hessian] " << tot_hessianTime; - cout << " tot => " << tot_hessianTime / tot_hessianCalls << " avg" << endl; - cout << "[project] " << tot_projectionTime; - cout << " tot => " << tot_projectionTime / tot_projectionCalls << " avg" << endl; - cout << "[cost] " << tot_costTime; - cout << " tot => " << tot_costTime / tot_costCalls << " avg" << endl; - cout << "[interpolate] " << tot_interpolateTime; - cout << " tot => " << tot_interpolateTime / tot_interpolateCalls << " avg" << endl; - cout << "[mean norm] " << tot_meanTime; - cout << " tot => " << tot_meanTime / tot_meanCalls << " avg" << endl; cout << "[extract] " << extractTime << endl; + cout << "[optiTime] " << optiTime << endl; cout << "[aggregate] " << aggregateTime << endl; cout << "[flow norm] " << meanTime << endl; cout << "==========================================" << endl; - } } diff --git a/src/patchgrid.h b/src/patchgrid.h index ef2e3d1..389fff7 100644 --- a/src/patchgrid.h +++ b/src/patchgrid.h @@ -24,19 +24,13 @@ namespace OFC { void AggregateFlowDense(float *flowout); // Optimizes grid to convergence of each patch - void OptimizeSetup(); void Optimize(); - bool AllConverged(); - //Optimize each patch in grid for one iteration, visualize displacement vector, repeat - //void OptimizeAndVisualize(const float sc_fct_tmp); // needed for verbosity >= 3, DISVISUAL inline const int GetNumPatches() const { return n_patches; } inline const int GetNumPatchesW() const { return n_patches_width; } inline const int GetNumPatchesH() const { return n_patches_height; } inline const Eigen::Vector2f GetRefPatchPos(int i) const { return midpoints_ref[i]; } // Get reference patch position - inline const Eigen::Vector2f GetQuePatchPos(int i) const { return patches[i]->GetTargMidpoint(); } // Get target/query patch position - inline const Eigen::Vector2f GetQuePatchDis(int i) const { return midpoints_ref[i]-patches[i]->GetTargMidpoint(); } // Get query patch displacement from reference patch void printTimings(); @@ -53,7 +47,6 @@ namespace OFC { float** pDevicePatches, ** pDevicePatchXs, ** pDevicePatchYs; float** pHostDevicePatches, **pHostDevicePatchXs, **pHostDevicePatchYs; - float* pDeviceMidpointX, * pDeviceMidpointY; // Raw Diff and Costs float** pDeviceRaws, **pDeviceCosts; @@ -79,12 +72,13 @@ namespace OFC { double aggregateTime; double meanTime; double extractTime; + double optiTime; - float* midpointX_host; - float* midpointY_host; - std::vector patches; // Patch Objects + // float* midpointX_host; + // float* midpointY_host; + // std::vector patches; // Patch Objects std::vector midpoints_ref; // Midpoints for reference patches - std::vector p_init; // starting parameters for query patches + // std::vector p_init; // starting parameters for query patches }; }