diff --git a/src/kernels/densify.cu b/src/kernels/densify.cu
index fa3ed9a..e964794 100644
--- a/src/kernels/densify.cu
+++ b/src/kernels/densify.cu
@@ -53,18 +53,16 @@ __global__ void kernelDensifyPatch(
 
 __global__ void kernelDensifyPatches(
     float** costs, float* flow, float* weights,
-    float* flowXs, float* flowYs, bool* valid,
-    float* midpointX, float* midpointY,
+    dev_patch_state* states,
     int width, int height,
     int patch_size, float minErrVal) {
 
   int patchId = blockIdx.x;
   int tid = threadIdx.x;
-  if (!valid[patchId]) return;
 
   int lower_bound = -patch_size / 2;
-  int xt = midpointX[patchId] + lower_bound;
-  int yt = midpointY[patchId] + lower_bound;
+  int xt = states[patchId].midpoint_orgx + lower_bound;
+  int yt = states[patchId].midpoint_orgy + lower_bound;
   int offset = (xt + yt * width) + tid;
 
   float* cost = costs[patchId];
@@ -82,8 +80,8 @@ __global__ void kernelDensifyPatches(
       // Weight contribution RGB
       atomicAdd(&weights[j], absw);
 
-      atomicAdd(&flow[2 * j], flowXs[patchId] * absw);
-      atomicAdd(&flow[2 * j + 1], flowYs[patchId] * absw);
+      atomicAdd(&flow[2 * j], states[patchId].p_curx * absw);
+      atomicAdd(&flow[2 * j + 1], states[patchId].p_cury * absw);
     }
 
   }
@@ -92,13 +90,13 @@ __global__ void kernelDensifyPatches(
 
 
 __global__ void kernelNormalizeFlow(
-    float* pDeviceFlowOut, float* pDeviceWeights, int N) {
+    float* pDeviceFlowOut, float* pDeviceWeights, int N, int numBlocks) {
 
   int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if (i < N && pDeviceWeights[i] > 0) {
-    pDeviceFlowOut[2 * i]     /= pDeviceWeights[i];
-    pDeviceFlowOut[2 * i + 1] /= pDeviceWeights[i];
+  for (; i < N; i+= blockDim.x * numBlocks) {
+    if (pDeviceWeights[i / 2] > 0) 
+      pDeviceFlowOut[i]  /= pDeviceWeights[i / 2];
   }
 
 }
@@ -127,15 +125,14 @@ namespace cu {
       float* pDeviceFlowOut, float* pDeviceWeights, int N) {
 
     int nThreadsPerBlock = 64;
-    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+    int nBlocks = 10;
 
-    kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N);
+    kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N, nBlocks);
   }
 
   void densifyPatches(
       float** costs, float* flow, float* weights,
-      float* flowXs, float* flowYs, bool* valid,
-      float* midpointX, float* midpointY, int n_patches,
+      dev_patch_state* states, int n_patches,
       const opt_params* op, const img_params* i_params) {
 
     int nBlocks = n_patches;
@@ -143,8 +140,7 @@ namespace cu {
 
     kernelDensifyPatches<<<nBlocks, nThreadsPerBlock>>>(
         costs, flow, weights,
-        flowXs, flowYs, valid,
-        midpointX,  midpointY,
+        states,
         i_params->width, i_params->height,
         op->patch_size, op->min_errval);
 
diff --git a/src/kernels/densify.h b/src/kernels/densify.h
index 096b4ef..fbcfdc7 100644
--- a/src/kernels/densify.h
+++ b/src/kernels/densify.h
@@ -35,8 +35,7 @@ namespace cu {
 
   void densifyPatches(
       float** costs, float* flow, float* weights,
-      float* flowXs, float* flowYs, bool* valid,
-      float* midpointX, float* midpointY, int n_patches,
+      dev_patch_state* states, int n_patches,
       const opt_params* op, const img_params* i_params);
 
 
diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp
index 87d2e30..2d5bedc 100644
--- a/src/patchgrid.cpp
+++ b/src/patchgrid.cpp
@@ -171,6 +171,11 @@ namespace OFC {
       checkCudaErrors( cudaMemcpy(pDevicePatchStates, pHostDevicePatchStates,
             n_patches * sizeof(dev_patch_state), cudaMemcpyHostToDevice) );
 
+      // Prev flow
+      int flow_size = i_params->width * i_params->height / 2;
+      checkCudaErrors(
+          cudaMalloc ((void**) &pDevFlowPrev, flow_size * sizeof(float)) );
+
 
       // Hessian
       H00 = new float[n_patches];
@@ -185,6 +190,7 @@ namespace OFC {
       meanTime = 0.0;
       extractTime = 0.0;
       optiTime = 0.0;
+      coarseTime = 0.0;
     }
 
 
@@ -223,6 +229,8 @@ namespace OFC {
     delete pHostDeviceTempXY;
     delete pHostDeviceTempYY;
 
+    cudaFree(pDevFlowPrev);
+
     cudaFree(pDeviceH00);
     cudaFree(pDeviceH01);
     cudaFree(pDeviceH11);
@@ -280,57 +288,24 @@ namespace OFC {
 
   void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) {
 
-    float * devFlowPrev;
     int flow_size = i_params->width * i_params->height / 2;
-    checkCudaErrors(
-        cudaMalloc ((void**) &devFlowPrev, flow_size * sizeof(float)) );
-    checkCudaErrors( cudaMemcpy(devFlowPrev, flow_prev,
+    checkCudaErrors( cudaMemcpy(pDevFlowPrev, flow_prev,
           flow_size * sizeof(float), cudaMemcpyHostToDevice) );
 
-    cu::initCoarserOF(devFlowPrev, pDevicePatchStates,
+    gettimeofday(&tv_start, nullptr);
+
+    cu::initCoarserOF(pDevFlowPrev, pDevicePatchStates,
         n_patches, i_params);
 
+    gettimeofday(&tv_end, nullptr);
+    coarseTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
+      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
 
   }
 
 
   void PatGridClass::AggregateFlowDense(float *flowout) {
 
-    /*bool isValid[n_patches];
-      float flowXs[n_patches];
-      float flowYs[n_patches];
-      float* costs[n_patches];
-
-      for (int i = 0; i < n_patches; i++) {
-      isValid[i] = patches[i]->IsValid();
-      flowXs[i] = (*(patches[i]->GetCurP()))[0];
-      flowYs[i] = (*(patches[i]->GetCurP()))[1];
-      costs[i] = patches[i]->GetDeviceCostDiffPtr();
-      }
-
-      bool *deviceIsValid;
-      float* deviceFlowXs, * deviceFlowYs;
-      float** deviceCosts;
-
-      checkCudaErrors(
-      cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) );
-      checkCudaErrors(
-      cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) );
-      checkCudaErrors(
-      cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) );
-      checkCudaErrors(
-      cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) );
-
-      checkCudaErrors( cudaMemcpy(deviceIsValid, isValid,
-      n_patches * sizeof(bool), cudaMemcpyHostToDevice) );
-      checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs,
-      n_patches * sizeof(float), cudaMemcpyHostToDevice) );
-      checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs,
-      n_patches * sizeof(float), cudaMemcpyHostToDevice) );
-      checkCudaErrors( cudaMemcpy(deviceCosts, costs,
-      n_patches * sizeof(float*), cudaMemcpyHostToDevice) );*/
-
-
     gettimeofday(&tv_start, nullptr);
 
     // Device mem
@@ -339,39 +314,28 @@ namespace OFC {
     checkCudaErrors(
         cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) );
 
-    /*cu::densifyPatches(
-      deviceCosts, pDeviceFlowOut, pDeviceWeights,
-      deviceFlowXs, deviceFlowYs, deviceIsValid,
-      pDeviceMidpointX, pDeviceMidpointY, n_patches,
-      op, i_params);*/
-    for (int ip = 0; ip < n_patches; ++ip) {
-
-      float* pweight = pHostDeviceCosts[ip]; // use image error as weight
-
-      cu::densifyPatch(
-          pweight, pDeviceFlowOut, pDeviceWeights,
-          pDevicePatchStates, ip,
-          midpoints_ref[ip][0], midpoints_ref[ip][1],
-          i_params->width, i_params->height,
-          op->patch_size, op->min_errval);
-
-    }
+    cu::densifyPatches(
+        pDeviceCosts, pDeviceFlowOut, pDeviceWeights,
+        pDevicePatchStates, n_patches, op, i_params);
 
     gettimeofday(&tv_end, nullptr);
     aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
       (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
 
     gettimeofday(&tv_start, nullptr);
+
     // Normalize all pixels
-    cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, i_params->width * i_params->height);
+    cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, 2 * i_params->width * i_params->height);
+
+    gettimeofday(&tv_end, nullptr);
+    meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
+      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
 
     checkCudaErrors(
         cudaMemcpy(flowout, pDeviceFlowOut,
           i_params->width * i_params->height * 2 * sizeof(float), cudaMemcpyDeviceToHost) );
 
-    gettimeofday(&tv_end, nullptr);
-    meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
-      (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
+
   }
 
 
@@ -380,6 +344,7 @@ namespace OFC {
     cout << endl;
     cout << "===============Timings (ms)===============" << endl;
     cout << "[extract]      " << extractTime << endl;
+    cout << "[coarse]       " << coarseTime << endl;
     cout << "[optiTime]      " << optiTime << endl;
     cout << "[aggregate]    " << aggregateTime << endl;
     cout << "[flow norm]    " << meanTime << endl;
diff --git a/src/patchgrid.h b/src/patchgrid.h
index 389fff7..ddfb2ae 100644
--- a/src/patchgrid.h
+++ b/src/patchgrid.h
@@ -52,6 +52,9 @@ namespace OFC {
       float** pDeviceRaws, **pDeviceCosts;
       float** pHostDeviceRaws, **pHostDeviceCosts;
 
+      // Previous flow
+      float* pDevFlowPrev;
+
       // Hessian
       // TODO: Can we shared memory?
       float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY;
@@ -72,6 +75,7 @@ namespace OFC {
       double aggregateTime;
       double meanTime;
       double extractTime;
+      double coarseTime;
       double optiTime;
 
       // float* midpointX_host;