From 6909adffe9fde066df4701b56a5d2b2c5e832684 Mon Sep 17 00:00:00 2001
From: Ashwin Sekar <asekar@andrew.cmu.edu>
Date: Thu, 11 May 2017 23:08:02 +0000
Subject: [PATCH] Finish pipeline opti time is super fast! (CORRECT ON 4K)

---
 src/CMakeLists.txt      |   2 +-
 src/kernels/optimize.cu |  25 ++-
 src/kernels/optimize.h  |   2 +-
 src/oflow.cpp           |   3 -
 src/patch.cpp           | 419 ----------------------------------------
 src/patch.h             | 103 ----------
 src/patchgrid.cpp       | 157 ++-------------
 src/patchgrid.h         |  16 +-
 8 files changed, 46 insertions(+), 681 deletions(-)
 delete mode 100644 src/patch.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f2a3d1a..c1b4731 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -95,7 +95,7 @@ set(COMMON
 set(CODEFILES
   run_dense.cpp
   oflow.cpp
-  patch.cpp
+  # patch.cpp
   patchgrid.cpp
   refine_variational.cpp
   FDF1.0.1/image.c
diff --git a/src/kernels/optimize.cu b/src/kernels/optimize.cu
index 8eb3f39..41a1f51 100644
--- a/src/kernels/optimize.cu
+++ b/src/kernels/optimize.cu
@@ -100,7 +100,7 @@ __global__ void kernelInterpolateAndComputeErr(
     float** tempXX, float** tempYY,
     int n_patches, int padding, int patch_size,
     int width_pad, int gd_iter, float res_thresh, float dp_thresh,
-    float dr_thresh, float out_thresh, int lb, int ubw, int ubh) {
+    float dr_thresh, float out_thresh, int lb, int ubw, int ubh, bool proj) {
 
   int patchId = blockIdx.x;
   int tid = threadIdx.x;
@@ -124,7 +124,8 @@ __global__ void kernelInterpolateAndComputeErr(
 
     // Interpolate the patch
 
-    float pos0, pos1, pos2, pos3, resid0, resid1, w0, w1, w2, w3;
+    int pos0, pos1, pos2, pos3;
+    float resid0, resid1, w0, w1, w2, w3;
 
     // Compute the bilinear weight vector, for patch without orientation/scale change
     // weight vector is constant for all pixels
@@ -141,6 +142,9 @@ __global__ void kernelInterpolateAndComputeErr(
     w2 = resid0 * (1- resid1);
     w3 = (1 - resid0) * (1 - resid1);
 
+    // if (tid == 0) {
+    //   printf("Weights %.2f, %.2f, %.2f, %.2f\n", w0, w1, w2, w3);
+    // }
     pos0 += padding;
     pos1 += padding;
 
@@ -150,7 +154,8 @@ __global__ void kernelInterpolateAndComputeErr(
 
 
     for (int i = tid, j = starty; i < patch_size * patch_size * 3;
-        i += 3 * patch_size, j += 3 * width_pad) {
+         i += 3 * patch_size, j++) {
+
 
       const float* img_e = I1 + x;
       const float* img_a = img_e + j * width_pad * 3;
@@ -158,6 +163,9 @@ __global__ void kernelInterpolateAndComputeErr(
       const float* img_b = img_a - 3;
       const float* img_d = img_c - 3;
       raw[i] = w0 * (*img_a) + w1 * (*img_b) + w2 * (*img_c) + w3 * (*img_d);
+      // if (tid == 0) {
+      //   printf("img_d %.2f\n", *img_d);
+      // }
 
     }
 
@@ -172,7 +180,11 @@ __global__ void kernelInterpolateAndComputeErr(
       for (int i = 0; i < patch_size * patch_size * 3; i++) {
         mean += raw[i];
       }
-      mean /= patch_size * patch_size * 3;
+      // printf("sum %.2f\n", mean);
+      mean /= (patch_size * patch_size * 3);
+
+      // printf("mean %.2f\n", mean);
+      // printf("pos0, pos1, (%d, %d)\n", pos0, pos1);
 
     }
 
@@ -194,6 +206,7 @@ __global__ void kernelInterpolateAndComputeErr(
         c += cost[i];
       }
       states[patchId].cost = c;
+      // printf("cost %.2f\n", c);
 
       // Check convergence
 
@@ -237,7 +250,7 @@ namespace cu {
       float** raw_diff, float** costs, float** patches, float** patchXs,
       float** patchYs, float** tempXX, float** tempYY, const float* I1,
       int n_patches, const opt_params* op,
-      const img_params* i_params) {
+      const img_params* i_params, bool notFirst) {
 
     int nBlocks = n_patches;
     int nThreadsPerBlock = 3 * op->patch_size;
@@ -248,7 +261,7 @@ namespace cu {
         i_params->padding, op->patch_size, i_params->width_pad, 
         op->grad_descent_iter, op->res_thresh, op->dp_thresh,
         op->dr_thresh, op->outlier_thresh, i_params->l_bound, i_params->u_bound_width,
-        i_params->u_bound_height);
+        i_params->u_bound_height, notFirst);
 
 
   }
diff --git a/src/kernels/optimize.h b/src/kernels/optimize.h
index 040d34f..3e85dfe 100644
--- a/src/kernels/optimize.h
+++ b/src/kernels/optimize.h
@@ -33,7 +33,7 @@ namespace cu {
       float** raw_diff, float** costs, float** patches, float** patchXs,
       float** patchYs, float** tempXX, float** tempYY, const float* I1,
       int n_patches, const opt_params* op,
-      const img_params* i_params); 
+      const img_params* i_params, bool project); 
 
 }
 
diff --git a/src/oflow.cpp b/src/oflow.cpp
index f4b68f9..573bcd6 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -280,9 +280,6 @@ namespace OFC {
 
       // Dense Inverse Search. (Step 3 in Algorithm 1 of paper)
       // Parallel over all patches
-      grid[ii]->OptimizeSetup();
-      /*while (!grid[ii]->AllConverged())
-        grid[ii]->OptimizeStep();*/
       grid[ii]->Optimize();
 
       // Timing, DIS
diff --git a/src/patch.cpp b/src/patch.cpp
deleted file mode 100644
index 925c358..0000000
--- a/src/patch.cpp
+++ /dev/null
@@ -1,419 +0,0 @@
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include <thread>
-
-#include <Eigen/Core>
-#include <Eigen/LU>
-#include <Eigen/Dense>
-
-#include <stdio.h>
-#include <sys/time.h>
-
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include "common/cuda_helper.h"
-#include "common/Exceptions.h"
-#include "common/timer.h"
-
-#include "patch.h"
-#include "kernels/interpolate.h"
-#include "kernels/extract.h"
-
-
-using std::cout;
-using std::endl;
-using std::vector;
-
-namespace OFC {
-
-
-  PatClass::PatClass(
-      const img_params* _i_params,
-      const opt_params* _op,
-      const int _patch_id)
-    :
-      i_params(_i_params),
-      op(_op),
-      patch_id(_patch_id) {
-
-        p_state = new patch_state;
-
-        patch.resize(op->n_vals,1);
-        patch_x.resize(op->n_vals,1);
-        patch_y.resize(op->n_vals,1);
-
-        /*checkCudaErrors(
-            cudaMalloc ((void**) &pDevicePatch, patch.size() * sizeof(float)) );
-        checkCudaErrors(
-            cudaMalloc ((void**) &pDevicePatchX, patch_x.size() * sizeof(float)) );
-        checkCudaErrors(
-            cudaMalloc ((void**) &pDevicePatchY, patch_y.size() * sizeof(float)) );*/
-        /*checkCudaErrors(
-            cudaMalloc ((void**) &pDeviceRawDiff, patch.size() * sizeof(float)) );
-        checkCudaErrors(
-            cudaMalloc ((void**) &pDeviceCostDiff, patch.size() * sizeof(float)) );*/
-        checkCudaErrors(
-            cudaMalloc ((void**) &pDeviceWeights, 4 * sizeof(float)) );
-
-
-        // Timing
-        hessianTime = 0;
-        projectionTime = 0;
-        costTime = 0;
-        interpolateTime = 0;
-        meanTime = 0;
-
-        hessianCalls = 0;
-        projectionCalls = 0;
-        costCalls = 0;
-        interpolateCalls = 0;
-        meanCalls = 0;
-
-      }
-
-
-  PatClass::~PatClass() {
-
-    /*cudaFree(pDevicePatch);
-    cudaFree(pDevicePatchX);
-    cudaFree(pDevicePatchY);*/
-
-    // cudaFree(pDeviceRawDiff);
-    // cudaFree(pDeviceCostDiff);
-    cudaFree(pDeviceWeights);
-
-    delete p_state;
-
-  }
-
-  // void PatClass::InitializePatch(const float * _I0,
-  //     const float * _I0x, const float * _I0y, const Eigen::Vector2f _midpoint) {
-  void PatClass::InitializePatch(float * _patch,
-      float * _patchx, float * _patchy, float H00, float H01, float H11,
-      const Eigen::Vector2f _midpoint) {
-
-    // I0 = _I0;
-    // I0x = _I0x;
-    // I0y = _I0y;
-
-    pDevicePatch = _patch;
-    pDevicePatchX = _patchx;
-    pDevicePatchY = _patchy;
-
-    midpoint = _midpoint;
-
-    ResetPatchState();
-
-    p_state->hessian(0,0) = H00;
-    p_state->hessian(0,1) = H01;
-    p_state->hessian(1,0) = p_state->hessian(0,1);
-    p_state->hessian(1,1) = H11;
-
-    //ExtractPatch();
-    // ComputeHessian(H00, H01, H11);
-
-  }
-
-  void PatClass::ComputeHessian(float H00, float H01, float H11) {
-
-    /*gettimeofday(&tv_start, nullptr);
-
-    CUBLAS_CHECK (
-        cublasSdot(op->cublasHandle, patch.size(),
-          pDevicePatchX, 1, pDevicePatchX, 1, &(p_state->hessian(0,0))) );
-    CUBLAS_CHECK (
-        cublasSdot(op->cublasHandle, patch.size(),
-          pDevicePatchX, 1, pDevicePatchY, 1, &(p_state->hessian(0,1))) );
-    CUBLAS_CHECK (
-        cublasSdot(op->cublasHandle, patch.size(),
-          pDevicePatchY, 1, pDevicePatchY, 1, &(p_state->hessian(1,1))) );
-
-    p_state->hessian(1,0) = p_state->hessian(0,1);
-
-    gettimeofday(&tv_end, nullptr);
-
-    hessianTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    hessianCalls++;*/
-
-    p_state->hessian(0,0) = H00;
-    p_state->hessian(0,1) = H01;
-    p_state->hessian(1,0) = p_state->hessian(0,1);
-    p_state->hessian(1,1) = H11;
-
-    // If not invertible adjust values
-    if (p_state->hessian.determinant() == 0) {
-      p_state->hessian(0,0) += 1e-10;
-      p_state->hessian(1,1) += 1e-10;
-    }
-
-  }
-
-  void PatClass::SetTargetImage(const float * _I1) {
-
-    pDeviceI = _I1;
-
-    ResetPatchState();
-
-  }
-
-  void PatClass::ResetPatchState() {
-
-    p_state->has_converged = 0;
-    p_state->has_opt_started = 0;
-
-    p_state->midpoint_org = midpoint;
-    p_state->midpoint_cur = midpoint;
-
-    p_state->p_org.setZero();
-    p_state->p_cur.setZero();
-    p_state->delta_p.setZero();
-
-    p_state->delta_p_sq_norm = 1e-10;
-    p_state->delta_p_sq_norm_init = 1e-10;
-    p_state->mares = 1e20;
-    p_state->mares_old = 1e20;
-    p_state->count = 0;
-    p_state->invalid = false;
-
-    p_state->cost = 0.0;
-  }
-
-  void PatClass::OptimizeStart(const Eigen::Vector2f p_prev, bool conv) {
-
-    p_state->p_org = p_prev;
-    p_state->p_cur = p_prev;
-
-    UpdateMidpoint();
-
-    // save starting location, only needed for outlier check
-    p_state->midpoint_org = p_state->midpoint_cur;
-
-    //Check if initial position is already invalid
-    if (p_state->midpoint_cur[0] < i_params->l_bound
-        || p_state->midpoint_cur[1] < i_params->l_bound
-        || p_state->midpoint_cur[0] > i_params->u_bound_width
-        || p_state->midpoint_cur[1] > i_params->u_bound_height) {
-
-      p_state->has_converged=1;
-      p_state->has_opt_started=1;
-
-    } else {
-
-      p_state->count = 0; // reset iteration counter
-      p_state->delta_p_sq_norm = 1e-10;
-      p_state->delta_p_sq_norm_init = 1e-10;  // set to arbitrary low value, s.t. that loop condition is definitely true on first iteration
-      p_state->mares = 1e5;          // mean absolute residual
-      p_state->mares_old = 1e20; // for rate of change, keep mares from last iteration in here. Set high so that loop condition is definitely true on first iteration
-      p_state->has_converged=0;
-
-     //  OptimizeComputeErrImg(false, c);
-      p_state->has_converged = conv;
-
-      p_state->has_opt_started = 1;
-      p_state->invalid = false;
-
-    }
-
-  }
-
-  void PatClass::OptimizeIter() {
-
-    // Do one optimize iteration
-
-    if (!p_state->has_converged) {
-
-      p_state->count++;
-
-      // Projection onto sd_images
-      gettimeofday(&tv_start, nullptr);
-      CUBLAS_CHECK (
-          cublasSdot(op->cublasHandle, patch.size(),
-            pDevicePatchX, 1, pDeviceRawDiff, 1, &(p_state->delta_p[0])) );
-      CUBLAS_CHECK (
-          cublasSdot(op->cublasHandle, patch.size(),
-            pDevicePatchY, 1, pDeviceRawDiff, 1, &(p_state->delta_p[1])) );
-      gettimeofday(&tv_end, nullptr);
-      projectionTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-        (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-      projectionCalls++;
-
-
-      p_state->delta_p = p_state->hessian.llt().solve(p_state->delta_p); // solve linear system
-
-      p_state->p_cur -= p_state->delta_p; // update flow vector
-
-      // compute patch locations based on new parameter vector
-      UpdateMidpoint();
-
-      // check if patch(es) moved too far from starting location, if yes, stop iteration and reset to starting location
-      // check if query patch moved more than >padval from starting location -> most likely outlier
-      if ((p_state->midpoint_org - p_state->midpoint_cur).norm() > op->outlier_thresh
-          || p_state->midpoint_cur[0] < i_params->l_bound
-          || p_state->midpoint_cur[1] < i_params->l_bound
-          || p_state->midpoint_cur[0] > i_params->u_bound_width
-          || p_state->midpoint_cur[1] > i_params->u_bound_height) {
-
-        // Reset because this is an outlier
-        p_state->p_cur = p_state->p_org;
-        UpdateMidpoint();
-        p_state->has_converged=1;
-        p_state->has_opt_started=1;
-
-      }
-
-      OptimizeComputeErrImg(true, -0.0);
-
-    }
-
-  }
-
-  inline void PatClass::UpdateMidpoint() {
-
-    p_state->midpoint_cur = midpoint + p_state->p_cur;
-
-  }
-
-  void PatClass::ComputeCostErr() {
-
-    // L2-Norm
-
-    const float alpha = -1.0;
-
-    gettimeofday(&tv_start, nullptr);
-    // raw = raw - patch
-    CUBLAS_CHECK (
-        cublasSaxpy(op->cublasHandle, patch.size(), &alpha,
-          pDevicePatch, 1, pDeviceRawDiff, 1) );
-
-    // Element-wise multiplication
-    CUBLAS_CHECK (
-        cublasSdgmm(op->cublasHandle, CUBLAS_SIDE_RIGHT,
-          1, patch.size(), pDeviceRawDiff, 1, pDeviceRawDiff, 1, pDeviceCostDiff, 1) );
-
-    // Sum
-    CUBLAS_CHECK (
-        cublasSasum(op->cublasHandle, patch.size(),
-          pDeviceCostDiff, 1, &(p_state->cost)) );
-    gettimeofday(&tv_end, nullptr);
-
-    costTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    costCalls++;
-
-  }
-
-  void PatClass::OptimizeComputeErrImg(bool interp, float c) {
-
-    if (interp) {
-      InterpolatePatch();
-      ComputeCostErr();
-    } else {
-      p_state->cost = c;
-    }
-
-    // Compute step norm
-    p_state->delta_p_sq_norm = p_state->delta_p.squaredNorm();
-    if (p_state->count == 1)
-      p_state->delta_p_sq_norm_init = p_state->delta_p_sq_norm;
-
-    // Check early termination criterions
-    p_state->mares_old = p_state->mares;
-    p_state->mares = p_state->cost / op->n_vals;
-
-    if (!((p_state->count < op->grad_descent_iter) & (p_state->mares > op->res_thresh)
-          & ((p_state->count < op->grad_descent_iter) | (p_state->delta_p_sq_norm / p_state->delta_p_sq_norm_init >= op->dp_thresh))
-          & ((p_state->count < op->grad_descent_iter) | (p_state->mares / p_state->mares_old <= op->dr_thresh)))) {
-      p_state->has_converged = 1;
-    }
-
-  }
-
-  // Extract patch on integer position, and gradients, No Bilinear interpolation
-  void PatClass::ExtractPatch() {
-
-    int x = round(midpoint[0]) + i_params->padding;
-    int y = round(midpoint[1]) + i_params->padding;
-
-    int lb = -op->patch_size / 2;
-    int patch_offset = (x + lb) + (y + lb) * i_params->width_pad;
-
-    // Extract patch
-    /*cu::extractPatch(pDevicePatch, pDevicePatchX, pDevicePatchY,
-        I0, I0x, I0y, patch_offset, op->patch_size, i_params->width_pad);*/
-
-    gettimeofday(&tv_start, nullptr);
-    // Mean Normalization
-    if (op->use_mean_normalization > 0) {
-      cu::normalizeMean(pDevicePatch, op->patch_size);
-    }
-
-    gettimeofday(&tv_end, nullptr);
-    meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    meanCalls++;
-
-
-
-  }
-
-  // Extract patch on float position with bilinear interpolation, no gradients.
-  void PatClass::InterpolatePatch() {
-
-    Eigen::Vector2f resid;
-    Eigen::Vector4f weight; // bilinear weight vector
-    Eigen::Vector4i pos;
-    Eigen::Vector2i pos_iter;
-
-    // Compute the bilinear weight vector, for patch without orientation/scale change
-    // weight vector is constant for all pixels
-    pos[0] = ceil(p_state->midpoint_cur[0] + .00001f); // ensure rounding up to natural numbers
-    pos[1] = ceil(p_state->midpoint_cur[1] + .00001f);
-    pos[2] = floor(p_state->midpoint_cur[0]);
-    pos[3] = floor(p_state->midpoint_cur[1]);
-
-    resid[0] = p_state->midpoint_cur[0] - (float)pos[2];
-    resid[1] = p_state->midpoint_cur[1] - (float)pos[3];
-    weight[0] = resid[0] * resid[1];
-    weight[1] = (1 - resid[0]) * resid[1];
-    weight[2] = resid[0] * (1- resid[1]);
-    weight[3] = (1 - resid[0]) * (1 - resid[1]);
-
-    pos[0] += i_params->padding;
-    pos[1] += i_params->padding;
-
-    int lb = -op->patch_size / 2;
-    int startx = pos[0] + lb;
-    int starty = pos[1] + lb;
-
-    gettimeofday(&tv_start, nullptr);
-    checkCudaErrors(
-        cudaMemcpy(pDeviceWeights, weight.data(),
-          4 * sizeof(float), cudaMemcpyHostToDevice) );
-
-
-    cu::interpolatePatch(pDeviceRawDiff, pDeviceI, pDeviceWeights,
-        i_params->width_pad, starty, startx, op->patch_size);
-
-    gettimeofday(&tv_end, nullptr);
-    interpolateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    interpolateCalls++;
-
-    gettimeofday(&tv_start, nullptr);
-    // Mean Normalization
-    if (op->use_mean_normalization > 0) {
-      cu::normalizeMean(pDeviceRawDiff, op->patch_size);
-    }
-
-    gettimeofday(&tv_end, nullptr);
-    meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    meanCalls++;
-
-
-  }
-
-}
diff --git a/src/patch.h b/src/patch.h
index 489da3e..a5de5c0 100644
--- a/src/patch.h
+++ b/src/patch.h
@@ -12,28 +12,6 @@
 
 namespace OFC {
 
-  typedef struct {
-    bool has_converged;
-    bool has_opt_started;
-
-    Eigen::Matrix<float, 2, 2> hessian; // Hessian for optimization
-    Eigen::Vector2f p_org, p_cur, delta_p; // point position, displacement to starting position, iteration update
-
-    // start positions, current point position, patch norm
-    // Eigen::Matrix<float,1,1> norm;
-    Eigen::Vector2f midpoint_cur;
-    Eigen::Vector2f midpoint_org;
-
-    float delta_p_sq_norm = 1e-10;
-    float delta_p_sq_norm_init = 1e-10;
-    float mares = 1e20; // mares: Mean Absolute RESidual
-    float mares_old = 1e20;
-    int count = 0;
-    bool invalid = false;
-
-    float cost = 0.0;
-  } patch_state;
-
   typedef struct {
     bool has_converged;
     bool has_opt_started;
@@ -57,87 +35,6 @@ namespace OFC {
     float cost = 0.0;
   } dev_patch_state;
 
-
-
-  class PatClass {
-
-    public:
-      PatClass(const img_params* _i_params,
-          const opt_params* _op,
-          const int _patch_id);
-
-      ~PatClass();
-
-      // void InitializePatch(const float * _I0, const float * _I0x,
-      //     const float * _I0y, const Eigen::Vector2f _midpoint);
-      void InitializePatch(float * _patch,
-          float * _patchx, float* _patchy,
-          float H00, float H01, float H11,
-          const Eigen::Vector2f _midpoint);
-      void SetTargetImage(const float * _I1);
-
-      void OptimizeStart(const Eigen::Vector2f p_prev, bool conv);
-      void OptimizeIter();
-
-      inline const bool IsConverged() const { return p_state->has_converged; }
-      inline const bool HasOptStarted() const { return p_state->has_opt_started; }
-      inline const Eigen::Vector2f GetTargMidpoint() const { return p_state->midpoint_cur; }
-      inline const bool IsValid() const { return !p_state->invalid; }
-      inline float * GetDeviceCostDiffPtr() const { return (float*) pDeviceCostDiff; }
-      inline float * getCostP() { return (float*) pDeviceCostDiff; }
-      inline float * getRawP() { return (float*) pDeviceRawDiff; }
-      inline void setRawP(float* raw) { pDeviceRawDiff = raw; }
-      inline void setCostP(float* cost) { pDeviceCostDiff = cost; }
-
-
-      inline const Eigen::Vector2f* GetCurP() const { return &(p_state->p_cur); }
-      inline const Eigen::Vector2f* GetOrgP() const { return &(p_state->p_org); }
-      inline const int GetPatchId() const { return patch_id; }
-
-      struct timeval tv_start, tv_end;
-      double hessianTime, projectionTime, costTime, interpolateTime, meanTime;
-      int hessianCalls, projectionCalls, costCalls, interpolateCalls, meanCalls;
-
-    private:
-
-      void OptimizeComputeErrImg(bool interp, float cost);
-      void UpdateMidpoint();
-      void ResetPatchState();
-      void ComputeHessian(float H00, float H01, float H11);
-      void ComputeCostErr();
-
-      // Extract patch on integer position, and gradients, No Bilinear interpolation
-      void ExtractPatch();
-      // Extract patch on float position with bilinear interpolation, no gradients.
-      void InterpolatePatch();
-
-
-      const float* pDeviceI;
-
-      float* pDevicePatch;
-      float* pDevicePatchX;
-      float* pDevicePatchY;
-
-      float* pDeviceRawDiff;
-      float* pDeviceCostDiff;
-      float* pDeviceWeights;
-
-
-      Eigen::Vector2f midpoint; // reference point location
-      Eigen::Matrix<float, Eigen::Dynamic, 1> patch;
-      Eigen::Matrix<float, Eigen::Dynamic, 1> patch_x;
-      Eigen::Matrix<float, Eigen::Dynamic, 1> patch_y;
-
-      const float * I0, * I0x, * I0y;
-
-      const img_params* i_params;
-      const opt_params* op;
-      const int patch_id;
-
-      patch_state * p_state = nullptr; // current patch state
-
-  };
-
 }
 
 #endif /* PAT_HEADER */
diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp
index a44226a..87d2e30 100644
--- a/src/patchgrid.cpp
+++ b/src/patchgrid.cpp
@@ -46,17 +46,11 @@ namespace OFC {
 
       n_patches = n_patches_width * n_patches_height;
       midpoints_ref.resize(n_patches);
-      p_init.resize(n_patches);
-      patches.reserve(n_patches);
 
       checkCudaErrors(
           cudaMalloc ((void**) &pDevicePatchStates, n_patches * sizeof(dev_patch_state)) );
       pHostDevicePatchStates = new dev_patch_state[n_patches];
 
-      midpointX_host = new float[n_patches];
-      midpointY_host = new float[n_patches];
-
-      int patch_id = 0;
       for (int x = 0; x < n_patches_width; ++x) {
         for (int y = 0; y < n_patches_height; ++y) {
 
@@ -64,25 +58,10 @@ namespace OFC {
 
           midpoints_ref[i][0] = x * steps + offsetw;
           midpoints_ref[i][1] = y * steps + offseth;
-          midpointX_host[i] = x * steps + offsetw;
-          midpointY_host[i] = y * steps + offseth;
-          p_init[i].setZero();
-
-          patches.push_back(new OFC::PatClass(i_params, op, patch_id));
-          patch_id++;
 
         }
       }
 
-      // Midpoint
-      checkCudaErrors(
-          cudaMalloc ((void**) &pDeviceMidpointX, n_patches * sizeof(float)) );
-      checkCudaErrors(
-          cudaMalloc ((void**) &pDeviceMidpointY, n_patches * sizeof(float)) );
-      checkCudaErrors( cudaMemcpy(pDeviceMidpointX, midpointX_host,
-          n_patches * sizeof(float), cudaMemcpyHostToDevice) );
-      checkCudaErrors( cudaMemcpy(pDeviceMidpointY, midpointY_host,
-          n_patches * sizeof(float), cudaMemcpyHostToDevice) );
 
       // Aggregate flow
       checkCudaErrors(
@@ -205,8 +184,10 @@ namespace OFC {
       aggregateTime = 0.0;
       meanTime = 0.0;
       extractTime = 0.0;
+      optiTime = 0.0;
     }
 
+
   PatGridClass::~PatGridClass() {
 
     for (int i = 0; i < n_patches; ++i) {
@@ -222,7 +203,6 @@ namespace OFC {
       cudaFree(pHostDeviceRaws[i]);
       cudaFree(pHostDeviceCosts[i]);
 
-      delete patches[i];
     }
 
     cudaFree(pDevicePatches);
@@ -243,12 +223,6 @@ namespace OFC {
     delete pHostDeviceTempXY;
     delete pHostDeviceTempYY;
 
-    delete midpointX_host;
-    delete midpointY_host;
-
-    cudaFree(pDeviceMidpointX);
-    cudaFree(pDeviceMidpointY);
-
     cudaFree(pDeviceH00);
     cudaFree(pDeviceH01);
     cudaFree(pDeviceH11);
@@ -263,6 +237,7 @@ namespace OFC {
 
   }
 
+
   void PatGridClass::InitializeGrid(const float * _I0, const float * _I0x, const float * _I0y) {
 
     I0 = _I0;
@@ -279,74 +254,29 @@ namespace OFC {
     extractTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
       (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
 
-   /* float H00, H01, H11;
-
-    for (int i = 0; i < n_patches; ++i) {
-      checkCudaErrors(
-          cudaMemcpy(&H00, &(pDevicePatchStates[i].H00), sizeof(float), cudaMemcpyDeviceToHost) );
-      checkCudaErrors(
-          cudaMemcpy(&H01, &(pDevicePatchStates[i].H01), sizeof(float), cudaMemcpyDeviceToHost) );
-      checkCudaErrors(
-          cudaMemcpy(&H11, &(pDevicePatchStates[i].H11), sizeof(float), cudaMemcpyDeviceToHost) );
-
-      patches[i]->InitializePatch(pHostDevicePatches[i],
-          pHostDevicePatchXs[i], pHostDevicePatchYs[i],
-          H00, H01, H11, midpoints_ref[i]);
-      p_init[i].setZero();
-    }*/
-
   }
 
+
   void PatGridClass::SetTargetImage(const float * _I1) {
 
     I1 = _I1;
 
-    /*for (int i = 0; i < n_patches; ++i) {
-      patches[i]->SetTargetImage(I1);
-    }*/
-
   }
 
-  void PatGridClass::OptimizeSetup() {
-
-    /*cu::interpolateAndComputeErr(pDevicePatchStates,
-        pDeviceRaws, pDeviceCosts, pDevicePatches, I1,
-        n_patches, op, i_params, false);
-
-    bool conv;
-    for (int i = 0; i < n_patches; ++i) {
-      patches[i]->setRawP(pHostDeviceRaws[i]);
-      patches[i]->setCostP(pHostDeviceCosts[i]);
-      checkCudaErrors(
-          cudaMemcpy(&conv, &(pDevicePatchStates[i].has_converged), sizeof(bool),
-            cudaMemcpyDeviceToHost) );
-
-      patches[i]->OptimizeStart(p_init[i], conv);
-    }*/
-
-  }
 
   void PatGridClass::Optimize() {
 
+    gettimeofday(&tv_start, nullptr);
+
     cu::interpolateAndComputeErr(pDevicePatchStates, pDeviceRaws, pDeviceCosts,
         pDevicePatches, pDevicePatchXs, pDevicePatchYs, pDeviceTempXX, pDeviceTempYY,
-        I1, n_patches, op, i_params);
-    // #pragma omp parallel for schedule(static)
-    /*for (int i = 0; i < n_patches; ++i) {
-      patches[i]->OptimizeIter();
-    }*/
+        I1, n_patches, op, i_params, true);
 
+    gettimeofday(&tv_end, nullptr);
+    optiTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
+      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
   }
 
-  bool PatGridClass::AllConverged() {
-
-    for (int i = 0; i < n_patches; ++i) {
-      if (!patches[i]->IsConverged())
-        return false;
-    }
-
-    return true;
-  }
 
   void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) {
 
@@ -360,24 +290,10 @@ namespace OFC {
     cu::initCoarserOF(devFlowPrev, pDevicePatchStates,
         n_patches, i_params);
 
-    for (int ip = 0; ip < n_patches; ++ip) {
-
-      // int x = floor(midpoints_ref[ip][0] / 2);
-      // int y = floor(midpoints_ref[ip][1] / 2);
-      // int i = y * (i_params->width / 2) + x;
-
-      // p_init[ip](0) = flow_prev[2 * i] * 2;
-      // p_init[ip](1) = flow_prev[2 * i + 1] * 2;
-
-      /*checkCudaErrors( cudaMemcpy(&(p_init[ip](0)), &(pDevicePatchStates[ip].p_orgx),
-            sizeof(float), cudaMemcpyDeviceToHost) );
-      checkCudaErrors( cudaMemcpy(&(p_init[ip](1)), &(pDevicePatchStates[ip].p_orgy),
-            sizeof(float), cudaMemcpyDeviceToHost) );*/
-
-    }
 
   }
 
+
   void PatGridClass::AggregateFlowDense(float *flowout) {
 
     /*bool isValid[n_patches];
@@ -430,16 +346,14 @@ namespace OFC {
       op, i_params);*/
     for (int ip = 0; ip < n_patches; ++ip) {
 
-        //const Eigen::Vector2f* fl = patches[ip]->GetCurP(); // flow displacement of this patch
+      float* pweight = pHostDeviceCosts[ip]; // use image error as weight
 
-        float* pweight = pHostDeviceCosts[ip]; // use image error as weight
-
-        cu::densifyPatch(
-            pweight, pDeviceFlowOut, pDeviceWeights,
-            pDevicePatchStates, ip,
-            midpoints_ref[ip][0], midpoints_ref[ip][1],
-            i_params->width, i_params->height,
-            op->patch_size, op->min_errval);
+      cu::densifyPatch(
+          pweight, pDeviceFlowOut, pDeviceWeights,
+          pDevicePatchStates, ip,
+          midpoints_ref[ip][0], midpoints_ref[ip][1],
+          i_params->width, i_params->height,
+          op->patch_size, op->min_errval);
 
     }
 
@@ -460,48 +374,17 @@ namespace OFC {
       (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
   }
 
-  void PatGridClass::printTimings() {
 
-    double tot_hessianTime = 0,
-           tot_projectionTime = 0, tot_costTime = 0,
-           tot_interpolateTime = 0, tot_meanTime = 0;
-    int tot_hessianCalls = 0,
-        tot_projectionCalls = 0, tot_costCalls = 0,
-        tot_interpolateCalls = 0, tot_meanCalls = 0;
-
-    for (auto & element : patches) {
-      tot_hessianTime += element->hessianTime;
-      tot_projectionTime += element->projectionTime;
-      tot_costTime += element->costTime;
-      tot_interpolateTime += element->interpolateTime;
-      tot_meanTime += element->meanTime;
-
-      tot_hessianCalls += element->hessianCalls;
-      tot_projectionCalls += element->projectionCalls;
-      tot_costCalls += element->costCalls;
-      tot_interpolateCalls += element->interpolateCalls;
-      tot_meanCalls += element->meanCalls;
-    }
+  void PatGridClass::printTimings() {
 
     cout << endl;
     cout << "===============Timings (ms)===============" << endl;
-    cout << "Avg grad descent iterations:        " << float(tot_costCalls) / float(n_patches) << endl;
-    cout << "[hessian]      " << tot_hessianTime;
-    cout << "  tot => " << tot_hessianTime / tot_hessianCalls << " avg" << endl;
-    cout << "[project]      " << tot_projectionTime;
-    cout << "  tot => " << tot_projectionTime / tot_projectionCalls << " avg" << endl;
-    cout << "[cost]         " << tot_costTime;
-    cout << "  tot => " << tot_costTime / tot_costCalls << " avg" << endl;
-    cout << "[interpolate]  " << tot_interpolateTime;
-    cout << "  tot => " << tot_interpolateTime / tot_interpolateCalls << " avg" << endl;
-    cout << "[mean norm]    " << tot_meanTime;
-    cout << "  tot => " << tot_meanTime / tot_meanCalls << " avg" << endl;
     cout << "[extract]      " << extractTime << endl;
+    cout << "[optiTime]      " << optiTime << endl;
     cout << "[aggregate]    " << aggregateTime << endl;
     cout << "[flow norm]    " << meanTime << endl;
     cout << "==========================================" << endl;
 
-
   }
 
 }
diff --git a/src/patchgrid.h b/src/patchgrid.h
index ef2e3d1..389fff7 100644
--- a/src/patchgrid.h
+++ b/src/patchgrid.h
@@ -24,19 +24,13 @@ namespace OFC {
       void AggregateFlowDense(float *flowout);
 
       // Optimizes grid to convergence of each patch
-      void OptimizeSetup();
       void Optimize();
-      bool AllConverged();
-      //Optimize each patch in grid for one iteration, visualize displacement vector, repeat
-      //void OptimizeAndVisualize(const float sc_fct_tmp);  // needed for verbosity >= 3, DISVISUAL
 
       inline const int GetNumPatches() const { return n_patches; }
       inline const int GetNumPatchesW() const { return n_patches_width; }
       inline const int GetNumPatchesH() const { return n_patches_height; }
 
       inline const Eigen::Vector2f GetRefPatchPos(int i) const { return midpoints_ref[i]; } // Get reference  patch position
-      inline const Eigen::Vector2f GetQuePatchPos(int i) const { return patches[i]->GetTargMidpoint(); } // Get target/query patch position
-      inline const Eigen::Vector2f GetQuePatchDis(int i) const { return midpoints_ref[i]-patches[i]->GetTargMidpoint(); } // Get query patch displacement from reference patch
 
       void printTimings();
 
@@ -53,7 +47,6 @@ namespace OFC {
 
       float** pDevicePatches, ** pDevicePatchXs, ** pDevicePatchYs;
       float** pHostDevicePatches, **pHostDevicePatchXs, **pHostDevicePatchYs;
-      float* pDeviceMidpointX, * pDeviceMidpointY;
 
       // Raw Diff and Costs
       float** pDeviceRaws, **pDeviceCosts;
@@ -79,12 +72,13 @@ namespace OFC {
       double aggregateTime;
       double meanTime;
       double extractTime;
+      double optiTime;
 
-      float* midpointX_host;
-      float* midpointY_host;
-      std::vector<OFC::PatClass*> patches; // Patch Objects
+      // float* midpointX_host;
+      // float* midpointY_host;
+      // std::vector<OFC::PatClass*> patches; // Patch Objects
       std::vector<Eigen::Vector2f> midpoints_ref; // Midpoints for reference patches
-      std::vector<Eigen::Vector2f> p_init; // starting parameters for query patches
+      // std::vector<Eigen::Vector2f> p_init; // starting parameters for query patches
   };
 
 }