diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 794ec56..d817b29 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,7 +71,7 @@ set(CODEFILES
   FDF1.0.1/opticalflow_aux.c
   FDF1.0.1/solver.c)
 
-# GrayScale, Optical Flow
+# RGB, Optical Flow
 cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
 set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
 set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
@@ -87,4 +87,3 @@ set(SANDBOX_FILES
   sandbox/sandbox.cpp)
 cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
 target_link_libraries(sandbox ${OpenCV_LIBS})
-
diff --git a/src/kernels/pyramid.cpp b/src/kernels/pyramid.cpp
index 80c8ab8..03bf9b1 100644
--- a/src/kernels/pyramid.cpp
+++ b/src/kernels/pyramid.cpp
@@ -75,16 +75,11 @@ namespace cu {
     // Allocate device memory (to account for padding too
     auto start_cuda_malloc = now();
     Npp32f *pDeviceIx, *pDeviceIy;
-    Npp32f *pDevicePaddedI, *pDevicePaddedIx, *pDevicePaddedIy;
     Npp32f *pDeviceTmp, *pDeviceKernel;
 
     checkCudaErrors( cudaMalloc((void**) &pDeviceIx, width * height * elemSize) );
     checkCudaErrors( cudaMalloc((void**) &pDeviceIy, width * height * elemSize) );
 
-    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedI,  padWidth * padHeight * elemSize) );
-    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIx, padWidth * padHeight * elemSize) );
-    checkCudaErrors( cudaMalloc((void**) &pDevicePaddedIy, padWidth * padHeight * elemSize) );
-
     checkCudaErrors( cudaMalloc((void**) &pDeviceTmp,    width * height * elemSize)  );
     checkCudaErrors( cudaMalloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f)) );
 
@@ -142,44 +137,17 @@ namespace cu {
     NPP_CHECK_NPP(
         nppiCopyReplicateBorder_32f_C3R (
           pDeviceI, nSrcStep, oSize,
-          pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+          Is[0], nDstStep, oPadSize, padding, padding) );
 
     // Pad dx, dy
-    checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-    checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
     NPP_CHECK_NPP(
         nppiCopyConstBorder_32f_C3R (
           pDeviceIx, nSrcStep, oSize,
-          pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+          Ixs[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
     NPP_CHECK_NPP(
         nppiCopyConstBorder_32f_C3R (
           pDeviceIy, nSrcStep, oSize,
-          pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////
-    // Copy Is[0] I, dx, dy
-    ////////////////////////////////////////////////////////////////////////////////////////////////
-
-    auto start_cp_Is0 = now();
-    Is[0] = new float[oPadSize.height * oPadSize.width * channels];
-    checkCudaErrors(
-        cudaMemcpy(Is[0], pDevicePaddedI,
-          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-    compute_time += calc_print_elapsed("Is[0] cudaMemcpy D->H", start_cp_Is0);
-
-    auto start_cp_dx = now();
-    Ixs[0] = new float[oPadSize.height * oPadSize.width * channels];
-    checkCudaErrors(
-        cudaMemcpy(Ixs[0], pDevicePaddedIx,
-          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-    compute_time += calc_print_elapsed("Ixs[0] cudaMemcpy D->H", start_cp_dx);
-
-    auto start_cp_dy = now();
-    Iys[0] = new float[oPadSize.height * oPadSize.width * channels];
-    checkCudaErrors(
-        cudaMemcpy(Iys[0], pDevicePaddedIy,
-          oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-    compute_time += calc_print_elapsed("Iys[0] cudaMemcpy D->H", start_cp_dy);
+          Iys[0], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
 
     ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -241,7 +209,7 @@ namespace cu {
             pDeviceI, nSrcStep, oSize, oOffset,
             pDeviceIy, nSrcStep, oROI,
             pDeviceKernel, nMaskSize, nAnchor, eBorderType)
-        );
+          );
       compute_time += calc_print_elapsed("sobel: Iys[i]", start_dy);
 
       //////////////////////////////////////////////////////////////////////////////////////////////
@@ -257,48 +225,23 @@ namespace cu {
       NPP_CHECK_NPP(
           nppiCopyReplicateBorder_32f_C3R (
             pDeviceI, nSrcStep, oSize,
-            pDevicePaddedI, nDstStep, oPadSize, padding, padding) );
+            Is[i], nDstStep, oPadSize, padding, padding) );
 
       // Pad dx, dy
-      checkCudaErrors( cudaMemset(pDevicePaddedIx, 0, oPadSize.width * oPadSize.height * elemSize) );
-      checkCudaErrors( cudaMemset(pDevicePaddedIy, 0, oPadSize.width * oPadSize.height * elemSize) );
       NPP_CHECK_NPP(
           nppiCopyConstBorder_32f_C3R (
             pDeviceIx, nSrcStep, oSize,
-            pDevicePaddedIx, nDstStep, oPadSize, padding, padding, PAD_VAL) );
+            Ixs[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
       NPP_CHECK_NPP(
           nppiCopyConstBorder_32f_C3R (
             pDeviceIy, nSrcStep, oSize,
-            pDevicePaddedIy, nDstStep, oPadSize, padding, padding, PAD_VAL) );
-
-      // Allocate host destinations
-      auto start_host_alloc = now();
-      Is[i] = new float[oPadSize.width * oPadSize.height * channels];
-      Ixs[i] = new float[oPadSize.width * oPadSize.height * channels];
-      Iys[i] = new float[oPadSize.width * oPadSize.height * channels];
-      compute_time += calc_print_elapsed("host alloc", start_host_alloc);
-
-      // Copy over data
-      auto start_cp = now();
-      checkCudaErrors(
-          cudaMemcpy(Is[i], pDevicePaddedI,
-            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-      checkCudaErrors(
-          cudaMemcpy(Ixs[i], pDevicePaddedIx,
-            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-      checkCudaErrors(
-          cudaMemcpy(Iys[i], pDevicePaddedIy,
-            oPadSize.width * oPadSize.height * elemSize, cudaMemcpyDeviceToHost) );
-      compute_time += calc_print_elapsed("pyramid cudaMemcpy D->H", start_cp);
+            Iys[i], nDstStep, oPadSize, padding, padding, PAD_VAL) );
 
     }
 
     // Clean up
     cudaFree(pDeviceIx);
     cudaFree(pDeviceIy);
-    cudaFree(pDevicePaddedI);
-    cudaFree(pDevicePaddedIx);
-    cudaFree(pDevicePaddedIy);
     cudaFree(pDeviceTmp);
     cudaFree(pDeviceKernel);
 
@@ -307,4 +250,3 @@ namespace cu {
   }
 
 }
-
diff --git a/src/oflow.cpp b/src/oflow.cpp
index 18f6719..7050848 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -38,7 +38,10 @@ using namespace timer;
 
 namespace OFC {
 
-  OFClass::OFClass(opt_params _op) {
+  OFClass::OFClass(opt_params _op, img_params _iparams) {
+    // Setup timing. `op` is only assigned from _op further down, so test _op here.
+    struct timeval tv_start_all, tv_end_all, tv_start_all_global, tv_end_all_global;
+    if (_op.verbosity > 1) gettimeofday(&tv_start_all_global, nullptr);
 
     // Parse optimization parameters
     op = _op;
@@ -54,6 +57,13 @@ namespace OFC {
     op.dr_thresh = 0.95;
     op.res_thresh = 0.0;
 
+    // Create the cuBLAS handle stored in op; the destructor destroys it again.
+    // Failing to create it is treated as fatal.
+    const cublasStatus_t cublasInitStatus = cublasCreate(&op.cublasHandle);
+    if (CUBLAS_STATUS_SUCCESS != cublasInitStatus) {
+      printf ("CUBLAS initialization failed\n");
+      exit(-1);
+    }
     // Allocate scale pyramides
     I0s = new float*[op.coarsest_scale+1];
     I1s = new float*[op.coarsest_scale+1];
@@ -62,8 +72,95 @@ namespace OFC {
     I1xs = new float*[op.coarsest_scale+1];
     I1ys = new float*[op.coarsest_scale+1];
 
+    // Create grids on each scale (this section is timed when verbosity > 1)
+    if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
+    // grid, flow and iparams are sized to op.n_scales and indexed by
+    // (scale - op.finest_scale), see `i` below.
+    grid.resize(op.n_scales);
+    flow.resize(op.n_scales);
+    iparams.resize(op.n_scales);
+    for (int sl = op.coarsest_scale; sl >= 0; --sl) {
+      // i < 0: scale finer than op.finest_scale; it only gets pyramid storage below.
+      int i = sl - op.finest_scale;
+
+      float scale_fact = pow(2, -sl); // scaling factor at current scale
+      if (i >= 0) {
+        iparams[i].scale_fact = scale_fact;
+        iparams[i].height = _iparams.height * scale_fact;
+        iparams[i].width = _iparams.width * scale_fact;
+        iparams[i].padding = _iparams.padding;
+        iparams[i].l_bound = -(float) op.patch_size / 2;
+        iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
+        iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
+        iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
+        iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
+        iparams[i].curr_lvl = sl;
+        // Host flow buffer (2 floats per unpadded pixel) and patch grid for this scale.
+        flow[i]   = new float[2 * iparams[i].width * iparams[i].height];
+        grid[i]   = new OFC::PatGridClass(&(iparams[i]), &op);
+      }
+      // Device buffers for the padded image and its two gradient images, 3 floats per pixel.
+      int elemSize = 3 * sizeof(float);
+      int padWidth = _iparams.width * scale_fact + 2 * _iparams.padding;
+      int padHeight = _iparams.height * scale_fact + 2 * _iparams.padding;
+
+      checkCudaErrors( cudaMalloc((void**) &I0s[sl],  padWidth * padHeight * elemSize) );
+      checkCudaErrors( cudaMalloc((void**) &I0xs[sl], padWidth * padHeight * elemSize) );
+      checkCudaErrors( cudaMalloc((void**) &I0ys[sl], padWidth * padHeight * elemSize) );
+
+      checkCudaErrors( cudaMalloc((void**) &I1s[sl],  padWidth * padHeight * elemSize) );
+      checkCudaErrors( cudaMalloc((void**) &I1xs[sl], padWidth * padHeight * elemSize) );
+      checkCudaErrors( cudaMalloc((void**) &I1ys[sl], padWidth * padHeight * elemSize) );
+    }
+
+    // Timing, Grid memory allocation
+    if (op.verbosity>1) {
+
+      gettimeofday(&tv_end_all, nullptr);
+      double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
+      printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
+
+    }
+
+    // Timing, Setup
+    if (op.verbosity>1) {
+      // Measured from the gettimeofday call at the top of the constructor.
+      gettimeofday(&tv_end_all_global, nullptr);
+      double tt = (tv_end_all_global.tv_sec-tv_start_all_global.tv_sec)*1000.0f + (tv_end_all_global.tv_usec-tv_start_all_global.tv_usec)/1000.0f;
+      printf("TIME (Setup) (ms): %3g\n", tt);
+    }
+
   }
 
+  // Destructor: releases what the constructor acquired -- the cuBLAS handle,
+  // the per-scale flow buffers and patch grids, and the device image pyramids.
+  OFClass::~OFClass() {
+    cublasDestroy(op.cublasHandle);
+
+    // flow[] and grid[] are indexed by (scale - finest_scale).
+    for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
+      delete[] flow[sl - op.finest_scale];
+      delete grid[sl - op.finest_scale];
+    }
+
+    // Every pyramid level was obtained with cudaMalloc in the constructor.
+    for (int i = 0; i <= op.coarsest_scale; i++) {
+      cudaFree(I0s[i]);
+      cudaFree(I0xs[i]);
+      cudaFree(I0ys[i]);
+      cudaFree(I1s[i]);
+      cudaFree(I1xs[i]);
+      cudaFree(I1ys[i]);
+    }
+
+    // The pointer tables are allocated with new[], so they need delete[], not delete.
+    delete[] I0s;
+    delete[] I1s;
+    delete[] I0xs;
+    delete[] I0ys;
+    delete[] I1xs;
+    delete[] I1ys;
+  }
 
 
   void OFClass::ConstructImgPyramids(img_params iparams) {
@@ -121,43 +218,6 @@ namespace OFC {
 
     }
 
-    if (op.verbosity>1) gettimeofday(&tv_start_all, nullptr);
-
-
-    // Create grids on each scale
-    vector<OFC::PatGridClass*> grid(op.n_scales);
-    vector<float*> flow(op.n_scales);
-    iparams.resize(op.n_scales);
-    for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-      int i = sl - op.finest_scale;
-
-      float scale_fact = pow(2, -sl); // scaling factor at current scale
-      iparams[i].scale_fact = scale_fact;
-      iparams[i].height = _iparams.height * scale_fact;
-      iparams[i].width = _iparams.width * scale_fact;
-      iparams[i].padding = _iparams.padding;
-      iparams[i].l_bound = -(float) op.patch_size / 2;
-      iparams[i].u_bound_width = (float) (iparams[i].width + op.patch_size / 2 - 2);
-      iparams[i].u_bound_height = (float) (iparams[i].height + op.patch_size / 2 - 2);
-      iparams[i].width_pad = iparams[i].width + 2 * _iparams.padding;
-      iparams[i].height_pad = iparams[i].height + 2 * _iparams.padding;
-      iparams[i].curr_lvl = sl;
-
-      flow[i]   = new float[2 * iparams[i].width * iparams[i].height];
-      grid[i]   = new OFC::PatGridClass(&(iparams[i]), &op);
-
-    }
-
-    // Timing, Grid memory allocation
-    if (op.verbosity>1) {
-
-      gettimeofday(&tv_end_all, nullptr);
-      double tt_gridconst = (tv_end_all.tv_sec-tv_start_all.tv_sec)*1000.0f + (tv_end_all.tv_usec-tv_start_all.tv_usec)/1000.0f;
-      printf("TIME (Grid Memo. Alloc. ) (ms): %3g\n", tt_gridconst);
-
-    }
-
 
     // Main loop; Operate over scales, coarse-to-fine
     for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
@@ -236,8 +296,21 @@ namespace OFC {
 
       // Variational refinement, (Step 5 in Algorithm 1 of paper)
       if (op.use_var_ref) {
+        // Variational refinement is handed host pointers here, so stage this scale's
+        // padded device images (3 floats per pixel) in temporary host buffers.
+        size_t size = (size_t) iparams[ii].width_pad * iparams[ii].height_pad * 3 * sizeof(float);
+        float* I0H = (float*) malloc(size);
+        float* I1H = (float*) malloc(size);
 
-        OFC::VarRefClass var_ref(I0s[sl], I1s[sl], &(iparams[ii]), &op, out_ptr);
+        if (I0H == nullptr || I1H == nullptr) {
+          printf("Host allocation for variational refinement failed\n");
+          exit(-1);
+        }
+        checkCudaErrors( cudaMemcpy(I0H, I0s[sl], size, cudaMemcpyDeviceToHost) );
+        checkCudaErrors( cudaMemcpy(I1H, I1s[sl], size, cudaMemcpyDeviceToHost) );
+        OFC::VarRefClass var_ref(I0H, I1H, &(iparams[ii]), &op, out_ptr);
+        free(I0H);  // malloc'd memory is released with free(), never delete
+        free(I1H);
 
       }
 
@@ -254,15 +327,6 @@ namespace OFC {
 
     }
 
-    // Clean up
-    for (int sl = op.coarsest_scale; sl >= op.finest_scale; --sl) {
-
-      delete[] flow[sl - op.finest_scale];
-      delete grid[sl - op.finest_scale];
-
-    }
-
-
     // Timing, total algorithm run-time
     if (op.verbosity > 0) {
 
diff --git a/src/oflow.h b/src/oflow.h
index ecd2199..d924f1a 100644
--- a/src/oflow.h
+++ b/src/oflow.h
@@ -10,77 +10,20 @@
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
+#include "params.h"
+#include "patchgrid.h"
 
 using std::cout;
 using std::endl;
 
 namespace OFC {
 
-  typedef __v4sf v4sf;
-
-
-  typedef struct {
-    int width;      // image width, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height
-    int height;     // image height, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height
-    int padding;    // image padding in pixels at all sides, images padded with replicated border, gradients padded with zero, ADD THIS ONLY WHEN ADDRESSING THE IMAGE OR GRADIENT
-    float l_bound;   // lower bound for valid image region, pre-compute for image padding to avoid border check
-    float u_bound_width;  // upper width bound for valid image region, pre-compute for image padding to avoid border check
-    float u_bound_height;  // upper height bound for valid image region, pre-compute for image padding to avoid border check
-    int width_pad;      // width + 2 * padding
-    int height_pad;      // height + 2 * padding
-    float scale_fact;   // scaling factor at current scale
-    int curr_lvl;    // current level
-  } img_params ;
-
-
-  typedef struct {
-    // Explicitly set parameters:
-    int coarsest_scale;
-    int finest_scale;
-    int patch_size;
-    float patch_stride;
-    bool use_mean_normalization;
-    // Termination
-    int grad_descent_iter;
-    float dp_thresh;
-    float dr_thresh;
-    float res_thresh;
-    // Verbosity, 0: plot nothing, 1: final internal timing 2: complete iteration timing, (UNCOMMENTED -> 3: Display flow scales, 4: Display flow scale iterations)
-    int verbosity;
-    bool use_var_ref;
-    int var_ref_iter;
-    float var_ref_alpha;
-    float var_ref_gamma;
-    float var_ref_delta;
-    float var_ref_sor_weight;         // Successive-over-relaxation weight
-
-    // Automatically set parameters / fixed parameters
-    float outlier_thresh;          // displacement threshold (in px) before a patch is flagged as outlier
-    int steps;                    // horizontal and vertical distance (in px) between patch centers
-    int n_vals;                   // number of points in patch (=p_samp_s*p_samp_s)
-    int n_scales;                 // total number of scales
-    float min_errval = 2.0f;       // 1/max(this, error) for pixel averaging weight
-    float norm_outlier = 5.0f;     // norm error threshold for huber norm
-
-    // cuBLAS
-    cublasHandle_t cublasHandle;
-
-    // Helper variables
-    v4sf zero     = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f};
-    v4sf negzero  = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f};
-    v4sf half     = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f};
-    v4sf ones     = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f};
-    v4sf twos     = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f};
-    v4sf fours    = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f};
-    v4sf norm_outlier_tmpbsq;
-    v4sf norm_outlier_tmp2bsq;
-    v4sf norm_outlier_tmp4bsq;
-  } opt_params;
-
   class OFClass {
 
     public:
-      OFClass(opt_params _op);
+      OFClass(opt_params _op, img_params _i_params);
+      ~OFClass();
+
       void calc(Npp32f* _I0, Npp32f* _I1, img_params _iparams, const float * initflow, float * outflow);
 
     private:
@@ -94,6 +37,8 @@ namespace OFC {
       opt_params op;                     // Struct for optimization parameters
       std::vector<img_params> iparams;    // Struct (for each scale) for image parameter
 
+      std::vector<PatGridClass*> grid;
+      std::vector<float*> flow;
   };
 
 }
diff --git a/src/params.h b/src/params.h
new file mode 100644
index 0000000..cd51f9c
--- /dev/null
+++ b/src/params.h
@@ -0,0 +1,71 @@
+// Holds all of the parameter structures
+
+#ifndef PARAMS_HEADER
+#define PARAMS_HEADER
+
+namespace OFC {
+
+  typedef __v4sf v4sf;
+  // Geometry of a single pyramid level. width/height exclude the border of
+  // `padding` pixels; width_pad/height_pad include it on both sides.
+  struct img_params {
+    int width;              // unpadded width; still contains the original padding that makes the size integer divisible
+    int height;             // unpadded height; same convention as width
+    int padding;            // border in pixels on every side (images: replicated border, gradients: zero); add it only when addressing image or gradient data
+    float l_bound;          // precomputed lower bound of the valid image region, spares a border check
+    float u_bound_width;    // precomputed upper bound of the valid image region along the width
+    float u_bound_height;   // precomputed upper bound of the valid image region along the height
+    int width_pad;          // width + 2 * padding
+    int height_pad;         // height + 2 * padding
+    float scale_fact;       // scaling factor at this scale
+    int curr_lvl;           // pyramid level this entry describes
+  };
+  // Optimization parameters. Declared as a named struct because C++20 forbids default
+  // member initializers in an unnamed class that is only named through a typedef.
+  struct opt_params {
+    // Explicitly set parameters:
+    int coarsest_scale;
+    int finest_scale;
+    int patch_size;
+    float patch_stride;
+    bool use_mean_normalization;
+    // Termination
+    int grad_descent_iter;
+    float dp_thresh;
+    float dr_thresh;
+    float res_thresh;
+    // Verbosity, 0: plot nothing, 1: final internal timing 2: complete iteration timing, (UNCOMMENTED -> 3: Display flow scales, 4: Display flow scale iterations)
+    int verbosity;
+    bool use_var_ref;
+    int var_ref_iter;
+    float var_ref_alpha;
+    float var_ref_gamma;
+    float var_ref_delta;
+    float var_ref_sor_weight;         // Successive-over-relaxation weight
+
+    // Automatically set parameters / fixed parameters
+    float outlier_thresh;          // displacement threshold (in px) before a patch is flagged as outlier
+    int steps;                    // horizontal and vertical distance (in px) between patch centers
+    int n_vals;                   // number of points in patch (=p_samp_s*p_samp_s)
+    int n_scales;                 // total number of scales
+    float min_errval = 2.0f;       // 1/max(this, error) for pixel averaging weight
+    float norm_outlier = 5.0f;     // norm error threshold for huber norm
+    // NOTE: this header has no #include of its own; cublasHandle_t and __v4sf must be declared by the includer.
+    // cuBLAS
+    cublasHandle_t cublasHandle;
+
+    // Helper variables (constant 4-float vectors)
+    v4sf zero     = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f};
+    v4sf negzero  = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f};
+    v4sf half     = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f};
+    v4sf ones     = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f};
+    v4sf twos     = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f};
+    v4sf fours    = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f};
+    v4sf norm_outlier_tmpbsq;
+    v4sf norm_outlier_tmp2bsq;
+    v4sf norm_outlier_tmp4bsq;
+  };
+
+}
+
+#endif /* PARAMS_HEADER */
diff --git a/src/patch.cpp b/src/patch.cpp
index 6cb6e0d..f3a8a41 100644
--- a/src/patch.cpp
+++ b/src/patch.cpp
@@ -295,7 +295,7 @@ namespace OFC {
     int lb = -op->patch_size / 2;
     int patch_offset = 3 * ((x + lb) + (y + lb) * i_params->width_pad);
 
-    float* pDeviceI0, *pDeviceI0x, *pDeviceI0y;
+    /*float* pDeviceI0, *pDeviceI0x, *pDeviceI0y;
     int size = i_params->width_pad * i_params->height_pad * 3;
     checkCudaErrors(
         cudaMalloc ((void**) &pDeviceI0, size * sizeof(float)) );
@@ -308,20 +308,20 @@ namespace OFC {
     CUBLAS_CHECK (
         cublasSetVector(size, sizeof(float), I0x, 1, pDeviceI0x, 1) );
     CUBLAS_CHECK (
-        cublasSetVector(size, sizeof(float), I0y, 1, pDeviceI0y, 1) );
+        cublasSetVector(size, sizeof(float), I0y, 1, pDeviceI0y, 1) );*/
 
     // Extract patch
     checkCudaErrors(
         cudaMemcpy2D (pDevicePatch, 3 * op->patch_size * sizeof(float),
-          pDeviceI0 + patch_offset, 3 * i_params->width_pad * sizeof(float),
+          I0 + patch_offset, 3 * i_params->width_pad * sizeof(float),
           3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) );
     checkCudaErrors(
         cudaMemcpy2D (pDevicePatchX, 3 * op->patch_size * sizeof(float),
-          pDeviceI0x + patch_offset, 3 * i_params->width_pad * sizeof(float),
+          I0x + patch_offset, 3 * i_params->width_pad * sizeof(float),
           3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) );
     checkCudaErrors(
         cudaMemcpy2D (pDevicePatchY, 3 * op->patch_size * sizeof(float),
-          pDeviceI0y + patch_offset, 3 * i_params->width_pad * sizeof(float),
+          I0y + patch_offset, 3 * i_params->width_pad * sizeof(float),
           3 * op->patch_size * sizeof(float), op->patch_size, cudaMemcpyDeviceToDevice) );
 
     // Mean Normalization
@@ -369,9 +369,7 @@ namespace OFC {
 
     // Mean Normalization
     if (op->use_mean_normalization > 0) {
-
       cu::normalizeMean(pDeviceRawDiff, op->cublasHandle, op->patch_size);
-
     }
 
   }
diff --git a/src/patch.h b/src/patch.h
index 72b479b..1ca8ca0 100644
--- a/src/patch.h
+++ b/src/patch.h
@@ -4,7 +4,11 @@
 #ifndef PAT_HEADER
 #define PAT_HEADER
 
-#include "oflow.h" // For camera intrinsic and opt. parameter struct
+#include <Eigen/Core>
+#include <Eigen/LU>
+#include <Eigen/Dense>
+
+#include "params.h" // For camera intrinsic and opt. parameter struct
 
 namespace OFC {
 
@@ -12,10 +16,6 @@ namespace OFC {
     bool has_converged;
     bool has_opt_started;
 
-    // reference/template patch
-    Eigen::Matrix<float, Eigen::Dynamic, 1> raw_diff; // image error to reference image
-    Eigen::Matrix<float, Eigen::Dynamic, 1> cost_diff; // absolute error image
-
     Eigen::Matrix<float, 2, 2> hessian; // Hessian for optimization
     Eigen::Vector2f p_org, p_cur, delta_p; // point position, displacement to starting position, iteration update
 
@@ -54,7 +54,6 @@ namespace OFC {
       inline const bool HasOptStarted() const { return p_state->has_opt_started; }
       inline const Eigen::Vector2f GetTargMidpoint() const { return p_state->midpoint_cur; }
       inline const bool IsValid() const { return !p_state->invalid; }
-      inline const float * GetCostDiffPtr() const { return (float*) p_state->cost_diff.data(); }
       inline float * GetDeviceCostDiffPtr() const { return (float*) pDeviceCostDiff; }
 
 
diff --git a/src/patchgrid.h b/src/patchgrid.h
index 16e5424..9045111 100644
--- a/src/patchgrid.h
+++ b/src/patchgrid.h
@@ -5,7 +5,7 @@
 #define PATGRID_HEADER
 
 #include "patch.h"
-#include "oflow.h" // For camera intrinsic and opt. parameter struct
+#include "params.h" // For camera intrinsic and opt. parameter struct
 
 
 namespace OFC {
diff --git a/src/run_dense.cpp b/src/run_dense.cpp
index 1af3c44..a6655d9 100644
--- a/src/run_dense.cpp
+++ b/src/run_dense.cpp
@@ -108,6 +108,7 @@ int AutoFirstScaleSelect(int imgwidth, int fratio, int patchsize) {
 
 }
 
+
 int main( int argc, char** argv ) {
 
   // Warmup GPU
@@ -141,7 +142,7 @@ int main( int argc, char** argv ) {
   int channels = 3;
   int elemSize = channels * sizeof(Npp32f);
 
-  /* MEMCOPY to CUDA */
+  /* memcpy to cuda */
   Npp32f* I0, *I1;
   auto start_cuda_malloc = now();
   checkCudaErrors( cudaMalloc((void**) &I0, width_org * height_org * elemSize) );
@@ -159,12 +160,6 @@ int main( int argc, char** argv ) {
   // Parse rest of parameters
   opt_params op;
 
-  cublasStatus_t stat = cublasCreate(&op.cublasHandle);
-  if (stat != CUBLAS_STATUS_SUCCESS) {
-    printf ("CUBLAS initialization failed\n");
-    exit(-1);
-  }
-
   if (argc <= 5) {
 
     op.use_mean_normalization = true;
@@ -274,7 +269,7 @@ int main( int argc, char** argv ) {
 
 
   // Create Optical Flow object
-  OFClass ofc(op);
+  OFClass ofc(op, iparams);
 
   // Run main optical flow / depth algorithm
   float scale_fact = pow(2, op.finest_scale);
@@ -282,7 +277,6 @@ int main( int argc, char** argv ) {
 
   ofc.calc(I0, I1, iparams, nullptr, (float*) flow_mat.data);
 
-  cublasDestroy(op.cublasHandle);
 
   if (op.verbosity > 1) gettimeofday(&start_time, NULL);