diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6c95cdf..4ffc473 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -79,12 +79,12 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3")
 target_link_libraries(flow ${OpenCV_LIBS})
 
 # CUDA sandbox
-set(SANDBOX_FILES
-  # sandbox/process_sobel.cpp
-  # sandbox/process_resize.cpp
-  # sandbox/process_resizeGrad.cpp
-  sandbox/process_pad.cpp
-  # sandbox/RgbMatTest.cpp
-  sandbox/sandbox.cpp)
-cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
-target_link_libraries(sandbox ${OpenCV_LIBS})
+# set(SANDBOX_FILES
+#   # sandbox/process_sobel.cpp
+#   # sandbox/process_resize.cpp
+#   # sandbox/process_resizeGrad.cpp
+#   sandbox/process_pad.cpp
+#   # sandbox/RgbMatTest.cpp
+#   sandbox/sandbox.cpp)
+# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
+# target_link_libraries(sandbox ${OpenCV_LIBS})
diff --git a/src/kernels/densify.cu b/src/kernels/densify.cu
index 2d413cc..e33767f 100644
--- a/src/kernels/densify.cu
+++ b/src/kernels/densify.cu
@@ -48,6 +48,47 @@ __global__ void kernelDensifyPatch(
 
 }
 
+
+__global__ void kernelDensifyPatches(
+    float** costs, float* flow, float* weights,
+    float* flowXs, float* flowYs, bool* valid,
+    float* midpointX, float* midpointY,
+    int width, int height,
+    int patch_size, float minErrVal) {
+
+  int patchId = blockIdx.x;
+  int tid = threadIdx.x;
+  if (!valid[patchId]) return;
+
+  int lower_bound = -patch_size / 2;
+  int xt = midpointX[patchId] + lower_bound;
+  int yt = midpointY[patchId] + lower_bound;
+  int offset = (xt + yt * width) + tid;
+
+  float* cost = costs[patchId];
+
+  for (int i = 3 * tid, j = offset; i < patch_size * patch_size * 3;
+      i += 3 * patch_size, j += width) {
+
+    if (j >= 0 && j < width * height) {
+
+      float absw = (float) (fmaxf(minErrVal, cost[i]));
+      absw += (float) (fmaxf(minErrVal, cost[i + 1]));
+      absw += (float) (fmaxf(minErrVal, cost[i + 2]));
+      absw = 1.0 / absw;
+
+      // Weight contribution RGB
+      atomicAdd(&weights[j], absw);
+
+      atomicAdd(&flow[2 * j], flowXs[patchId] * absw);
+      atomicAdd(&flow[2 * j + 1], flowYs[patchId] * absw);
+    }
+
+  }
+
+}
+
+
 __global__ void kernelNormalizeFlow(
     float* pDeviceFlowOut, float* pDeviceWeights, int N) {
 
@@ -89,4 +130,22 @@ namespace cu {
     kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N);
   }
 
+  void densifyPatches(
+      float** costs, float* flow, float* weights,
+      float* flowXs, float* flowYs, bool* valid,
+      float* midpointX, float* midpointY, int n_patches,
+      const opt_params* op, const img_params* i_params) {
+
+    int nBlocks = n_patches;
+    int nThreadsPerBlock = op->patch_size;
+
+    kernelDensifyPatches<<<nBlocks, nThreadsPerBlock>>>(
+        costs, flow, weights,
+        flowXs, flowYs, valid,
+        midpointX,  midpointY,
+        i_params->width, i_params->height,
+        op->patch_size, op->min_errval);
+
+  }
+
 }
diff --git a/src/kernels/densify.h b/src/kernels/densify.h
index 8b705cf..b755050 100644
--- a/src/kernels/densify.h
+++ b/src/kernels/densify.h
@@ -19,6 +19,9 @@
 #include "../common/Exceptions.h"
 #include "../common/timer.h"
 #include "../sandbox/process.h"
+#include "../patch.h"
+
+using namespace OFC;
 
 namespace cu {
 
@@ -29,6 +32,14 @@ namespace cu {
       int width, int height,
       int patchSize, float minErrVal);
 
+
+  void densifyPatches(
+      float** costs, float* flow, float* weights,
+      float* flowXs, float* flowYs, bool* valid,
+      float* midpointX, float* midpointY, int n_patches,
+      const opt_params* op, const img_params* i_params);
+
+
   void normalizeFlow(
       float* pDeviceFlowOut, float* pDeviceWeights, int N);
 
diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp
index 47b0363..ad2c216 100644
--- a/src/patchgrid.cpp
+++ b/src/patchgrid.cpp
@@ -263,14 +263,55 @@ namespace OFC {
 
   void PatGridClass::AggregateFlowDense(float *flowout) {
 
+    bool isValid[n_patches];
+    float flowXs[n_patches];
+    float flowYs[n_patches];
+    float* costs[n_patches];
+
+    for (int i = 0; i < n_patches; i++) {
+      isValid[i] = patches[i]->IsValid();
+      flowXs[i] = (*(patches[i]->GetCurP()))[0];
+      flowYs[i] = (*(patches[i]->GetCurP()))[1];
+      costs[i] = patches[i]->GetDeviceCostDiffPtr();
+    }
+
+    bool *deviceIsValid;
+    float* deviceFlowXs, * deviceFlowYs;
+    float** deviceCosts;
+
+    checkCudaErrors(
+          cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) );
+    checkCudaErrors(
+          cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) );
+    checkCudaErrors(
+          cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) );
+    checkCudaErrors(
+          cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) );
+
+    checkCudaErrors( cudaMemcpy(deviceIsValid, isValid,
+          n_patches * sizeof(bool), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs,
+          n_patches * sizeof(float), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs,
+          n_patches * sizeof(float), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(deviceCosts, costs,
+          n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
+
+
     gettimeofday(&tv_start, nullptr);
+
     // Device mem
     checkCudaErrors(
         cudaMemset (pDeviceWeights, 0.0, i_params->width * i_params->height * sizeof(float)) );
     checkCudaErrors(
         cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) );
 
-    for (int ip = 0; ip < n_patches; ++ip) {
+    cu::densifyPatches(
+        deviceCosts, pDeviceFlowOut, pDeviceWeights,
+        deviceFlowXs, deviceFlowYs, deviceIsValid,
+        pDeviceMidpointX, pDeviceMidpointY, n_patches,
+        op, i_params);
+    /*for (int ip = 0; ip < n_patches; ++ip) {
       if (patches[ip]->IsValid()) {
 
         const Eigen::Vector2f* fl = patches[ip]->GetCurP(); // flow displacement of this patch
@@ -285,7 +326,7 @@ namespace OFC {
             op->patch_size, op->min_errval);
 
       }
-    }
+    }*/
 
     gettimeofday(&tv_end, nullptr);
     aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
diff --git a/src/timing/batch_agg.time b/src/timing/batch_agg.time
new file mode 100644
index 0000000..c322d90
--- /dev/null
+++ b/src/timing/batch_agg.time
@@ -0,0 +1,103 @@
+[start] warmup: processing 720x480 image
+[time] 647.278 (ms) : cudaMalloc
+[time] 0.135 (ms) : cudaMemcpy H->D
+[time] 227.372 (ms) : warmup
+[time] 0.108 (ms) : cudaMemcpy H<-D
+[done] warmup: primary compute time: 227.372 (ms)
+[time] 0.419 (ms) : I0, I1 cudaMalloc
+[time] 1.004 (ms) : cudaMemcpy I0, I1 H->D
+[start] pad: processing 1024x436 image
+[time] 0.226 (ms) : cudaMalloc
+[time] 0.025 (ms) : pad
+[done] pad: primary compute time: 0.025 (ms)
+[start] pad: processing 1024x436 image
+[time] 0.205 (ms) : cudaMalloc
+[time] 0.012 (ms) : pad
+[done] pad: primary compute time: 0.012 (ms)
+TIME (Image loading     ) (ms): 24.776
+TIME (Grid Memo. Alloc. ) (ms): 47.66
+TIME (Setup) (ms): 208.829
+I0 448x1024
+Constructing pyramids
+[start] constructImgPyramids: processing 1024x448 image
+[time] 1.463 (ms) : sobel: Ixs[0]
+[time] 0.308 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.275 (ms) : sobel: Ixs[i]
+[time] 0.273 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.281 (ms) : sobel: Ixs[i]
+[time] 0.277 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.273 (ms) : sobel: Ixs[i]
+[time] 0.276 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.286 (ms) : sobel: Ixs[i]
+[time] 0.274 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.276 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+[time] 4.772 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 4.541
+[start] constructImgPyramids: processing 1024x448 image
+[time] 0.279 (ms) : sobel: Ixs[0]
+[time] 0.274 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.28 (ms) : sobel: Ixs[i]
+[time] 0.275 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.275 (ms) : sobel: Ixs[i]
+[time] 0.274 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.285 (ms) : sobel: Ixs[i]
+[time] 0.277 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.276 (ms) : sobel: Ixs[i]
+[time] 0.277 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.307 (ms) : sobel: Ixs[i]
+[time] 0.273 (ms) : sobel: Iys[i]
+[time] 3.534 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 3.352
+TIME (Pyramids+Gradients) (ms): 8.32
+, cflow 
+TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     0.09     0.00    31.70     0.10     0.25 ->    32.13 ms.
+TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     0.07     0.00   110.32     0.09     0.65 ->   111.12 ms.
+TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):     0.11     0.00   425.12     0.12     2.16 ->   427.51 ms.
+TIME (O.Flow Run-Time   ) (ms): 570.787
+
+===============Timings (ms)===============
+Avg grad descent iterations:        12.9978
+[hessian]      0  tot => -nan avg
+[project]      149.146  tot => 0.0277481 avg
+[cost]         119.47  tot => 0.0205169 avg
+[interpolate]  37.362  tot => 0.00641628 avg
+[mean norm]    105.888  tot => 0.0181844 avg
+[extract]      0.074
+[aggregate]    0.011
+[flow norm]    0.04
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      0  tot => -nan avg
+[project]      38.76  tot => 0.0288393 avg
+[cost]         31.095  tot => 0.0213565 avg
+[interpolate]  9.248  tot => 0.00635165 avg
+[mean norm]    27.84  tot => 0.0191209 avg
+[extract]      0.057
+[aggregate]    0.011
+[flow norm]    0.017
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      0  tot => -nan avg
+[project]      11.16  tot => 0.0290625 avg
+[cost]         8.943  tot => 0.0214976 avg
+[interpolate]  2.684  tot => 0.00645192 avg
+[mean norm]    7.954  tot => 0.0191202 avg
+[extract]      0.08
+[aggregate]    0.019
+[flow norm]    0.012
+==========================================
diff --git a/src/timing/batch_hessian_too.time b/src/timing/batch_hessian_too.time
index cda9202..f7f5659 100644
--- a/src/timing/batch_hessian_too.time
+++ b/src/timing/batch_hessian_too.time
@@ -1,103 +1,103 @@
 [start] warmup: processing 720x480 image
-[time] 665.115 (ms) : cudaMalloc
-[time] 0.11 (ms) : cudaMemcpy H->D
-[time] 216.189 (ms) : warmup
-[time] 0.221 (ms) : cudaMemcpy H<-D
-[done] warmup: primary compute time: 216.189 (ms)
-[time] 0.424 (ms) : I0, I1 cudaMalloc
-[time] 0.993 (ms) : cudaMemcpy I0, I1 H->D
+[time] 686.119 (ms) : cudaMalloc
+[time] 0.106 (ms) : cudaMemcpy H->D
+[time] 242.681 (ms) : warmup
+[time] 0.099 (ms) : cudaMemcpy H<-D
+[done] warmup: primary compute time: 242.681 (ms)
+[time] 0.476 (ms) : I0, I1 cudaMalloc
+[time] 1.008 (ms) : cudaMemcpy I0, I1 H->D
 [start] pad: processing 1024x436 image
-[time] 0.219 (ms) : cudaMalloc
-[time] 0.026 (ms) : pad
-[done] pad: primary compute time: 0.026 (ms)
+[time] 0.213 (ms) : cudaMalloc
+[time] 0.021 (ms) : pad
+[done] pad: primary compute time: 0.021 (ms)
 [start] pad: processing 1024x436 image
-[time] 0.202 (ms) : cudaMalloc
-[time] 0.012 (ms) : pad
-[done] pad: primary compute time: 0.012 (ms)
-TIME (Image loading     ) (ms): 25.133
-TIME (Grid Memo. Alloc. ) (ms): 47.809
-TIME (Setup) (ms): 207.694
+[time] 0.204 (ms) : cudaMalloc
+[time] 0.01 (ms) : pad
+[done] pad: primary compute time: 0.01 (ms)
+TIME (Image loading     ) (ms): 27.769
+TIME (Grid Memo. Alloc. ) (ms): 49.605
+TIME (Setup) (ms): 227.645
 I0 448x1024
 Constructing pyramids
 [start] constructImgPyramids: processing 1024x448 image
-[time] 1.728 (ms) : sobel: Ixs[0]
-[time] 0.327 (ms) : sobel: Iys[0]
+[time] 1.73 (ms) : sobel: Ixs[0]
+[time] 0.283 (ms) : sobel: Iys[0]
 constructImgPyramids level 1: 512x224
-[time] 0.279 (ms) : sobel: Ixs[i]
-[time] 0.276 (ms) : sobel: Iys[i]
+[time] 0.294 (ms) : sobel: Ixs[i]
+[time] 0.283 (ms) : sobel: Iys[i]
 constructImgPyramids level 2: 256x112
-[time] 0.281 (ms) : sobel: Ixs[i]
-[time] 0.268 (ms) : sobel: Iys[i]
+[time] 0.275 (ms) : sobel: Ixs[i]
+[time] 0.289 (ms) : sobel: Iys[i]
 constructImgPyramids level 3: 128x56
-[time] 0.268 (ms) : sobel: Ixs[i]
-[time] 0.273 (ms) : sobel: Iys[i]
+[time] 0.274 (ms) : sobel: Ixs[i]
+[time] 0.285 (ms) : sobel: Iys[i]
 constructImgPyramids level 4: 64x28
-[time] 0.267 (ms) : sobel: Ixs[i]
-[time] 0.28 (ms) : sobel: Iys[i]
+[time] 0.276 (ms) : sobel: Ixs[i]
+[time] 0.288 (ms) : sobel: Iys[i]
 constructImgPyramids level 5: 32x14
-[time] 0.268 (ms) : sobel: Ixs[i]
-[time] 0.275 (ms) : sobel: Iys[i]
-[time] 4.991 (ms) : total time
-[done] constructImgPyramids: primmary compute time: 4.79
+[time] 0.274 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+[time] 5.011 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 4.83
 [start] constructImgPyramids: processing 1024x448 image
-[time] 0.265 (ms) : sobel: Ixs[0]
-[time] 0.267 (ms) : sobel: Iys[0]
+[time] 0.282 (ms) : sobel: Ixs[0]
+[time] 0.276 (ms) : sobel: Iys[0]
 constructImgPyramids level 1: 512x224
-[time] 0.272 (ms) : sobel: Ixs[i]
-[time] 0.271 (ms) : sobel: Iys[i]
+[time] 0.278 (ms) : sobel: Ixs[i]
+[time] 0.284 (ms) : sobel: Iys[i]
 constructImgPyramids level 2: 256x112
-[time] 0.266 (ms) : sobel: Ixs[i]
-[time] 0.266 (ms) : sobel: Iys[i]
+[time] 0.277 (ms) : sobel: Ixs[i]
+[time] 0.274 (ms) : sobel: Iys[i]
 constructImgPyramids level 3: 128x56
-[time] 0.272 (ms) : sobel: Ixs[i]
-[time] 0.265 (ms) : sobel: Iys[i]
+[time] 0.279 (ms) : sobel: Ixs[i]
+[time] 0.282 (ms) : sobel: Iys[i]
 constructImgPyramids level 4: 64x28
-[time] 0.266 (ms) : sobel: Ixs[i]
-[time] 0.266 (ms) : sobel: Iys[i]
+[time] 0.277 (ms) : sobel: Ixs[i]
+[time] 0.274 (ms) : sobel: Iys[i]
 constructImgPyramids level 5: 32x14
-[time] 0.274 (ms) : sobel: Ixs[i]
-[time] 0.265 (ms) : sobel: Iys[i]
-[time] 3.341 (ms) : total time
-[done] constructImgPyramids: primmary compute time: 3.215
-TIME (Pyramids+Gradients) (ms): 8.342
+[time] 0.286 (ms) : sobel: Ixs[i]
+[time] 0.277 (ms) : sobel: Iys[i]
+[time] 3.479 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 3.346
+TIME (Pyramids+Gradients) (ms): 8.501
 , cflow 
-TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     0.08     0.00    32.09     0.12     0.26 ->    32.54 ms.
-TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     0.07     0.00   112.14     0.35     0.66 ->   113.21 ms.
-TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):     0.10     0.00   436.45     1.41     2.18 ->   440.14 ms.
-TIME (O.Flow Run-Time   ) (ms): 585.922
+TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     0.08     0.00    34.25     0.14     0.29 ->    34.76 ms.
+TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     0.07     0.00   120.03     0.39     0.74 ->   121.24 ms.
+TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):     0.11     0.00   440.06     1.35     2.11 ->   443.63 ms.
+TIME (O.Flow Run-Time   ) (ms): 599.655
 
 ===============Timings (ms)===============
 Avg grad descent iterations:        12.9933
 [hessian]      0  tot => -nan avg
-[project]      153.341  tot => 0.0285392 avg
-[cost]         122.301  tot => 0.0210103 avg
-[interpolate]  38.586  tot => 0.00662876 avg
-[mean norm]    108.655  tot => 0.018666 avg
-[extract]      0.074
-[aggregate]    1.372
-[flow norm]    0.04
+[project]      154.088  tot => 0.0286782 avg
+[cost]         122.006  tot => 0.0209596 avg
+[interpolate]  40.805  tot => 0.00700997 avg
+[mean norm]    109.308  tot => 0.0187782 avg
+[extract]      0.078
+[aggregate]    1.304
+[flow norm]    0.038
 ==========================================
 
 ===============Timings (ms)===============
 Avg grad descent iterations:        13
 [hessian]      0  tot => -nan avg
-[project]      39.555  tot => 0.0294308 avg
-[cost]         31.504  tot => 0.0216374 avg
-[interpolate]  9.531  tot => 0.00654602 avg
-[mean norm]    28.148  tot => 0.0193324 avg
-[extract]      0.056
-[aggregate]    0.328
-[flow norm]    0.017
+[project]      41.891  tot => 0.0311689 avg
+[cost]         33.433  tot => 0.0229622 avg
+[interpolate]  11.077  tot => 0.00760783 avg
+[mean norm]    30.095  tot => 0.0206696 avg
+[extract]      0.059
+[aggregate]    0.378
+[flow norm]    0.015
 ==========================================
 
 ===============Timings (ms)===============
 Avg grad descent iterations:        13
 [hessian]      0  tot => -nan avg
-[project]      11.309  tot => 0.0294505 avg
-[cost]         8.998  tot => 0.0216298 avg
-[interpolate]  2.746  tot => 0.00660096 avg
-[mean norm]    8.051  tot => 0.0193534 avg
-[extract]      0.073
-[aggregate]    0.108
+[project]      11.963  tot => 0.0311536 avg
+[cost]         9.552  tot => 0.0229615 avg
+[interpolate]  3.159  tot => 0.00759375 avg
+[mean norm]    8.55  tot => 0.0205529 avg
+[extract]      0.075
+[aggregate]    0.121
 [flow norm]    0.013
 ==========================================