From fc67d64ea6d6e5620af7c59085130bd966bd2c48 Mon Sep 17 00:00:00 2001
From: Ashwin Sekar <asekar@andrew.cmu.edu>
Date: Wed, 10 May 2017 02:45:03 -0400
Subject: [PATCH] Batch-fy hessian also, construction time is now very fast <
 10 ms

---
 src/kernels/extract.cu                 |  66 +++++++++++++++-
 src/kernels/extract.h                  |   5 +-
 src/patch.cpp                          |  22 ++++--
 src/patch.h                            |   3 +-
 src/patchgrid.cpp                      |  73 +++++++++++++++++-
 src/patchgrid.h                        |   8 ++
 src/timing/batch_extract.time          | 103 +++++++++++++++++++++++++
 src/timing/batch_extract_and_mean.time | 103 +++++++++++++++++++++++++
 src/timing/batch_hessian_too.time      | 103 +++++++++++++++++++++++++
 9 files changed, 470 insertions(+), 16 deletions(-)
 create mode 100644 src/timing/batch_extract.time
 create mode 100644 src/timing/batch_extract_and_mean.time
 create mode 100644 src/timing/batch_hessian_too.time

diff --git a/src/kernels/extract.cu b/src/kernels/extract.cu
index 34c1cc8..f6d6506 100644
--- a/src/kernels/extract.cu
+++ b/src/kernels/extract.cu
@@ -33,9 +33,11 @@ __global__  void kernelExtractPatch(
 }
 
 
-__global__ void kernelExtractPatches(
+__global__ void kernelExtractPatchesAndHessians(
     float** patches, float** patchxs, float** patchys,
     const float * I0, const float * I0x, const float * I0y,
+    float* H00, float* H01, float* H11,
+    float** tempXX, float** tempXY, float** tempYY,
     float* midpointX, float* midpointY, int padding,
     int patch_size, int width_pad) {
 
@@ -45,6 +47,9 @@ __global__ void kernelExtractPatches(
   float* patch = patches[patchId];
   float* patchX = patchxs[patchId];
   float* patchY = patchys[patchId];
+  float* XX = tempXX[patchId];
+  float* XY = tempXY[patchId];
+  float* YY = tempYY[patchId];
 
   int x = round(midpointX[patchId]) + padding;
   int y = round(midpointY[patchId]) + padding;
@@ -57,8 +62,57 @@ __global__ void kernelExtractPatches(
     patch[i] = I0[j];
     patchX[i] = I0x[j];
     patchY[i] = I0y[j];
+    XX[i] = patchX[i] * patchX[i];
+    XY[i] = patchX[i] * patchY[i];
+    YY[i] = patchY[i] * patchY[i];
   }
 
+  __syncthreads();
+
+  // Mean normalize
+  __shared__ float mean;
+
+  if (tid == 0) {
+
+    mean = 0.0;
+    for (int i = 0; i < patch_size * patch_size * 3; i++) {
+      mean += patch[i];
+    }
+    mean /= patch_size * patch_size * 3;
+
+  }
+
+  __syncthreads();
+
+  for (int i = tid; i < patch_size * patch_size * 3;
+      i+= 3 * patch_size) {
+    patch[i] -= mean;
+  }
+
+  // TODO: can this be done in parallel?
+  if (tid == 0) {
+
+    float h00 = 0.0, h01 = 0.0, h11 = 0.0;
+
+    for (int i = 0; i < patch_size * patch_size * 3; i++) {
+      h00 += XX[i];
+      h01 += XY[i];
+      h11 += YY[i];
+    }
+
+    // If not invertible adjust values
+    if (h00 * h11 - h01 * h01 == 0) {
+      h00 += 1e-10;
+      h11 += 1e-10;
+    }
+
+    H00[patchId] = h00;
+    H01[patchId] = h01;
+    H11[patchId] = h11;
+
+  }
+
+
 }
 
 
@@ -80,17 +134,21 @@ namespace cu {
   }
 
 
-  void extractPatches(float** patches, float** patchxs, float** patchys,
+  void extractPatchesAndHessians(
+      float** patches, float** patchxs, float** patchys,
       const float * I0, const float * I0x, const float * I0y,
+      float* H00, float* H01, float* H11,
+      float** tempXX, float** tempXY, float** tempYY,
       float* midpointX, float* midpointY, int n_patches,
       const opt_params* op, const img_params* i_params) {
 
     int nBlocks = n_patches;
     int nThreadsPerBlock = 3 * op->patch_size;
 
-    kernelExtractPatches<<<nBlocks, nThreadsPerBlock>>>(
+    kernelExtractPatchesAndHessians<<<nBlocks, nThreadsPerBlock>>>(
         patches, patchxs, patchys,
-        I0, I0x, I0y, midpointX, midpointY,
+        I0, I0x, I0y, H00, H01, H11,
+        tempXX, tempXY, tempYY, midpointX, midpointY,
         i_params->padding, op->patch_size, i_params->width_pad);
 
   }
diff --git a/src/kernels/extract.h b/src/kernels/extract.h
index 0ac8e1c..85c1a5a 100644
--- a/src/kernels/extract.h
+++ b/src/kernels/extract.h
@@ -32,8 +32,11 @@ namespace cu {
       const float* I0, const float* I0x, const float* I0y, int patch_offset,
       int patch_size, int width_pad);
 
-  void extractPatches(float** patches, float** patchxs, float** patchys,
+  void extractPatchesAndHessians(
+      float** patches, float** patchxs, float** patchys,
       const float * I0, const float * I0x, const float * I0y,
+      float* H00, float* H01, float* H11,
+      float** tempXX, float** tempXY, float** tempYY,
       float* midpointX, float* midpointY, int n_patches,
       const opt_params* op, const img_params* i_params);
 
diff --git a/src/patch.cpp b/src/patch.cpp
index 0fb67cd..f48da59 100644
--- a/src/patch.cpp
+++ b/src/patch.cpp
@@ -92,7 +92,7 @@ namespace OFC {
   // void PatClass::InitializePatch(const float * _I0,
   //     const float * _I0x, const float * _I0y, const Eigen::Vector2f _midpoint) {
   void PatClass::InitializePatch(float * _patch,
-      float * _patchx, float * _patchy,
+      float * _patchx, float * _patchy, float H00, float H01, float H11,
       const Eigen::Vector2f _midpoint) {
 
     // I0 = _I0;
@@ -106,14 +106,20 @@ namespace OFC {
     midpoint = _midpoint;
 
     ResetPatchState();
-    ExtractPatch();
-    ComputeHessian();
+
+    p_state->hessian(0,0) = H00;
+    p_state->hessian(0,1) = H01;
+    p_state->hessian(1,0) = p_state->hessian(0,1);
+    p_state->hessian(1,1) = H11;
+
+    //ExtractPatch();
+    // ComputeHessian(H00, H01, H11);
 
   }
 
-  void PatClass::ComputeHessian() {
+  void PatClass::ComputeHessian(float H00, float H01, float H11) {
 
-    gettimeofday(&tv_start, nullptr);
+    /*gettimeofday(&tv_start, nullptr);
 
     CUBLAS_CHECK (
         cublasSdot(op->cublasHandle, patch.size(),
@@ -131,8 +137,12 @@ namespace OFC {
 
     hessianTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
       (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
-    hessianCalls++;
+    hessianCalls++;*/
 
+    p_state->hessian(0,0) = H00;
+    p_state->hessian(0,1) = H01;
+    p_state->hessian(1,0) = p_state->hessian(0,1);
+    p_state->hessian(1,1) = H11;
 
     // If not invertible adjust values
     if (p_state->hessian.determinant() == 0) {
diff --git a/src/patch.h b/src/patch.h
index 6442963..269f58d 100644
--- a/src/patch.h
+++ b/src/patch.h
@@ -48,6 +48,7 @@ namespace OFC {
       //     const float * _I0y, const Eigen::Vector2f _midpoint);
       void InitializePatch(float * _patch,
           float * _patchx, float* _patchy,
+          float H00, float H01, float H11,
           const Eigen::Vector2f _midpoint);
       void SetTargetImage(const float * _I1);
 
@@ -75,7 +76,7 @@ namespace OFC {
       void OptimizeComputeErrImg();
       void UpdateMidpoint();
       void ResetPatchState();
-      void ComputeHessian();
+      void ComputeHessian(float H00, float H01, float H11);
       void ComputeCostErr();
 
       // Extract patch on integer position, and gradients, No Bilinear interpolation
diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp
index af9260c..47b0363 100644
--- a/src/patchgrid.cpp
+++ b/src/patchgrid.cpp
@@ -85,7 +85,7 @@ namespace OFC {
       checkCudaErrors(
           cudaMalloc ((void**) &pDeviceFlowOut, i_params->width * i_params->height * 2 * sizeof(float)) );
 
-      // Patches
+      // Patches and Hessians
       checkCudaErrors(
           cudaMalloc((void**) &pDevicePatches, n_patches * sizeof(float*)) );
       checkCudaErrors(
@@ -93,9 +93,21 @@ namespace OFC {
       checkCudaErrors(
           cudaMalloc((void**) &pDevicePatchYs, n_patches * sizeof(float*)) );
 
+      checkCudaErrors(
+          cudaMalloc((void**) &pDeviceTempXX, n_patches * sizeof(float*)) );
+      checkCudaErrors(
+          cudaMalloc((void**) &pDeviceTempXY, n_patches * sizeof(float*)) );
+      checkCudaErrors(
+          cudaMalloc((void**) &pDeviceTempYY, n_patches * sizeof(float*)) );
+
       pHostDevicePatches = new float*[n_patches];
       pHostDevicePatchXs = new float*[n_patches];
       pHostDevicePatchYs = new float*[n_patches];
+
+      float* pHostDeviceTempXX[n_patches];
+      float* pHostDeviceTempXY[n_patches];
+      float* pHostDeviceTempYY[n_patches];
+
       for (int i = 0; i < n_patches; i++) {
         checkCudaErrors(
             cudaMalloc((void**) &pHostDevicePatches[i], op->n_vals * sizeof(float)) );
@@ -103,6 +115,13 @@ namespace OFC {
             cudaMalloc((void**) &pHostDevicePatchXs[i], op->n_vals * sizeof(float)) );
         checkCudaErrors(
             cudaMalloc((void**) &pHostDevicePatchYs[i], op->n_vals * sizeof(float)) );
+
+        checkCudaErrors(
+            cudaMalloc((void**) &pHostDeviceTempXX[i], op->n_vals * sizeof(float)) );
+        checkCudaErrors(
+            cudaMalloc((void**) &pHostDeviceTempXY[i], op->n_vals * sizeof(float)) );
+        checkCudaErrors(
+            cudaMalloc((void**) &pHostDeviceTempYY[i], op->n_vals * sizeof(float)) );
       }
 
       checkCudaErrors( cudaMemcpy(pDevicePatches, pHostDevicePatches,
@@ -112,6 +131,23 @@ namespace OFC {
       checkCudaErrors( cudaMemcpy(pDevicePatchYs, pHostDevicePatchYs,
           n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
 
+
+      checkCudaErrors( cudaMemcpy(pDeviceTempXX, pHostDeviceTempXX,
+          n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
+      checkCudaErrors( cudaMemcpy(pDeviceTempXY, pHostDeviceTempXY,
+          n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
+      checkCudaErrors( cudaMemcpy(pDeviceTempYY, pHostDeviceTempYY,
+          n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
+
+      // Hessian
+      H00 = new float[n_patches];
+      H01 = new float[n_patches];
+      H11 = new float[n_patches];
+
+      checkCudaErrors( cudaMalloc((void**) &pDeviceH00, n_patches * sizeof(float)) );
+      checkCudaErrors( cudaMalloc((void**) &pDeviceH01, n_patches * sizeof(float)) );
+      checkCudaErrors( cudaMalloc((void**) &pDeviceH11, n_patches * sizeof(float)) );
+
       aggregateTime = 0.0;
       meanTime = 0.0;
       extractTime = 0.0;
@@ -123,6 +159,11 @@ namespace OFC {
       cudaFree(pDevicePatches[i]);
       cudaFree(pDevicePatchXs[i]);
       cudaFree(pDevicePatchYs[i]);
+
+      cudaFree(pDeviceTempXX[i]);
+      cudaFree(pDeviceTempXY[i]);
+      cudaFree(pDeviceTempYY[i]);
+
       delete patches[i];
     }
 
@@ -139,6 +180,18 @@ namespace OFC {
     cudaFree(pDeviceMidpointX);
     cudaFree(pDeviceMidpointY);
 
+    cudaFree(pDeviceH00);
+    cudaFree(pDeviceH01);
+    cudaFree(pDeviceH11);
+
+    delete[] H00;  // H00/H01/H11 come from new float[n_patches]; array delete required
+    delete[] H01;
+    delete[] H11;
+
+    cudaFree(pDeviceTempXX);
+    cudaFree(pDeviceTempXY);
+    cudaFree(pDeviceTempYY);
+
   }
 
   void PatGridClass::InitializeGrid(const float * _I0, const float * _I0x, const float * _I0y) {
@@ -148,15 +201,27 @@ namespace OFC {
     I0y = _I0y;
 
     gettimeofday(&tv_start, nullptr);
-    cu::extractPatches(pDevicePatches, pDevicePatchXs, pDevicePatchYs,
-        I0, I0x, I0y, pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params);
+
+    cu::extractPatchesAndHessians(pDevicePatches, pDevicePatchXs, pDevicePatchYs,
+        I0, I0x, I0y, pDeviceH00, pDeviceH01, pDeviceH11,
+        pDeviceTempXX, pDeviceTempXY, pDeviceTempYY,
+        pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params);
+
+    checkCudaErrors(
+        cudaMemcpy(H00, pDeviceH00, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );
+    checkCudaErrors(
+        cudaMemcpy(H01, pDeviceH01, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );
+    checkCudaErrors(
+        cudaMemcpy(H11, pDeviceH11, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );
+
     gettimeofday(&tv_end, nullptr);
     extractTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
       (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
 
     for (int i = 0; i < n_patches; ++i) {
       patches[i]->InitializePatch(pHostDevicePatches[i],
-          pHostDevicePatchXs[i], pHostDevicePatchYs[i], midpoints_ref[i]);
+          pHostDevicePatchXs[i], pHostDevicePatchYs[i],
+          H00[i], H01[i], H11[i], midpoints_ref[i]);
       p_init[i].setZero();
     }
 
diff --git a/src/patchgrid.h b/src/patchgrid.h
index 1767740..36881f8 100644
--- a/src/patchgrid.h
+++ b/src/patchgrid.h
@@ -45,10 +45,18 @@ namespace OFC {
 
       float* pDeviceWeights, *pDeviceFlowOut;
 
+      // Patches
       float** pDevicePatches, ** pDevicePatchXs, ** pDevicePatchYs;
       float** pHostDevicePatches, **pHostDevicePatchXs, **pHostDevicePatchYs;
       float* pDeviceMidpointX, * pDeviceMidpointY;
 
+      // Hessian
+      // TODO: Can we use shared memory?
+      float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY;
+      float* pDeviceH00, * pDeviceH01, * pDeviceH11;
+      float* H00, * H01, * H11;
+
+
       const img_params* i_params;
       const opt_params* op;
 
diff --git a/src/timing/batch_extract.time b/src/timing/batch_extract.time
new file mode 100644
index 0000000..5646e82
--- /dev/null
+++ b/src/timing/batch_extract.time
@@ -0,0 +1,103 @@
+[start] warmup: processing 720x480 image
+[time] 659.243 (ms) : cudaMalloc
+[time] 0.162 (ms) : cudaMemcpy H->D
+[time] 263.46 (ms) : warmup
+[time] 0.099 (ms) : cudaMemcpy H<-D
+[done] warmup: primary compute time: 263.46 (ms)
+[time] 0.476 (ms) : I0, I1 cudaMalloc
+[time] 0.991 (ms) : cudaMemcpy I0, I1 H->D
+[start] pad: processing 1024x436 image
+[time] 0.212 (ms) : cudaMalloc
+[time] 0.023 (ms) : pad
+[done] pad: primary compute time: 0.023 (ms)
+[start] pad: processing 1024x436 image
+[time] 0.211 (ms) : cudaMalloc
+[time] 0.01 (ms) : pad
+[done] pad: primary compute time: 0.01 (ms)
+TIME (Image loading     ) (ms): 27.635
+TIME (Grid Memo. Alloc. ) (ms): 39.898
+TIME (Setup) (ms): 219.219
+I0 448x1024
+Constructing pyramids
+[start] constructImgPyramids: processing 1024x448 image
+[time] 1.73 (ms) : sobel: Ixs[0]
+[time] 0.283 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.278 (ms) : sobel: Ixs[i]
+[time] 0.289 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.275 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.28 (ms) : sobel: Ixs[i]
+[time] 0.28 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.276 (ms) : sobel: Ixs[i]
+[time] 0.277 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.289 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+[time] 4.987 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 4.815
+[start] constructImgPyramids: processing 1024x448 image
+[time] 0.273 (ms) : sobel: Ixs[0]
+[time] 0.284 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.277 (ms) : sobel: Ixs[i]
+[time] 0.278 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.289 (ms) : sobel: Ixs[i]
+[time] 0.302 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.296 (ms) : sobel: Ixs[i]
+[time] 0.297 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.297 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.275 (ms) : sobel: Ixs[i]
+[time] 0.285 (ms) : sobel: Iys[i]
+[time] 3.561 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 3.432
+TIME (Pyramids+Gradients) (ms): 8.56
+, cflow 
+TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     2.21     0.00    33.41     0.14     0.29 ->    36.05 ms.
+TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     7.45     0.00   116.82     0.39     0.74 ->   125.40 ms.
+TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):    29.80     0.00   444.32     1.42     2.14 ->   477.67 ms.
+TIME (O.Flow Run-Time   ) (ms): 639.141
+
+===============Timings (ms)===============
+Avg grad descent iterations:        12.9933
+[hessian]      20  tot => 0.0446429 avg
+[project]      155.165  tot => 0.0288787 avg
+[cost]         123.778  tot => 0.021264 avg
+[interpolate]  41.283  tot => 0.00709208 avg
+[mean norm]    119.738  tot => 0.0191 avg
+[extract]      0.007
+[aggregate]    1.382
+[flow norm]    0.037
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      4.982  tot => 0.0444821 avg
+[project]      40.511  tot => 0.0301421 avg
+[cost]         32.868  tot => 0.0225742 avg
+[interpolate]  10.571  tot => 0.0072603 avg
+[mean norm]    31.73  tot => 0.020236 avg
+[extract]      0.007
+[aggregate]    0.378
+[flow norm]    0.015
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      1.44  tot => 0.045 avg
+[project]      11.583  tot => 0.0301641 avg
+[cost]         9.426  tot => 0.0226587 avg
+[interpolate]  3.026  tot => 0.00727404 avg
+[mean norm]    9.069  tot => 0.0202433 avg
+[extract]      0.023
+[aggregate]    0.119
+[flow norm]    0.014
+==========================================
diff --git a/src/timing/batch_extract_and_mean.time b/src/timing/batch_extract_and_mean.time
new file mode 100644
index 0000000..496accb
--- /dev/null
+++ b/src/timing/batch_extract_and_mean.time
@@ -0,0 +1,103 @@
+[start] warmup: processing 720x480 image
+[time] 655.99 (ms) : cudaMalloc
+[time] 0.106 (ms) : cudaMemcpy H->D
+[time] 213.393 (ms) : warmup
+[time] 0.084 (ms) : cudaMemcpy H<-D
+[done] warmup: primary compute time: 213.393 (ms)
+[time] 0.433 (ms) : I0, I1 cudaMalloc
+[time] 1.008 (ms) : cudaMemcpy I0, I1 H->D
+[start] pad: processing 1024x436 image
+[time] 0.199 (ms) : cudaMalloc
+[time] 0.019 (ms) : pad
+[done] pad: primary compute time: 0.019 (ms)
+[start] pad: processing 1024x436 image
+[time] 0.195 (ms) : cudaMalloc
+[time] 0.009 (ms) : pad
+[done] pad: primary compute time: 0.009 (ms)
+TIME (Image loading     ) (ms): 24.99
+TIME (Grid Memo. Alloc. ) (ms): 36.493
+TIME (Setup) (ms): 195.469
+I0 448x1024
+Constructing pyramids
+[start] constructImgPyramids: processing 1024x448 image
+[time] 1.675 (ms) : sobel: Ixs[0]
+[time] 0.296 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.274 (ms) : sobel: Ixs[i]
+[time] 0.266 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.277 (ms) : sobel: Ixs[i]
+[time] 0.267 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.265 (ms) : sobel: Ixs[i]
+[time] 0.271 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.272 (ms) : sobel: Ixs[i]
+[time] 0.27 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.268 (ms) : sobel: Ixs[i]
+[time] 0.269 (ms) : sobel: Iys[i]
+[time] 4.837 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 4.67
+[start] constructImgPyramids: processing 1024x448 image
+[time] 0.264 (ms) : sobel: Ixs[0]
+[time] 0.264 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.265 (ms) : sobel: Ixs[i]
+[time] 0.279 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.27 (ms) : sobel: Ixs[i]
+[time] 0.274 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.27 (ms) : sobel: Ixs[i]
+[time] 0.267 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.265 (ms) : sobel: Ixs[i]
+[time] 0.265 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.271 (ms) : sobel: Ixs[i]
+[time] 0.264 (ms) : sobel: Iys[i]
+[time] 3.344 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 3.218
+TIME (Pyramids+Gradients) (ms): 8.191
+, cflow 
+TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     1.53     0.00    31.91     0.13     0.25 ->    33.81 ms.
+TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     4.96     0.00   110.87     0.35     0.65 ->   116.84 ms.
+TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):    18.77     0.00   424.14     1.33     2.10 ->   446.35 ms.
+TIME (O.Flow Run-Time   ) (ms): 597.018
+
+===============Timings (ms)===============
+Avg grad descent iterations:        12.9933
+[hessian]      18.47  tot => 0.0412277 avg
+[project]      149.086  tot => 0.0277473 avg
+[cost]         118.145  tot => 0.0202963 avg
+[interpolate]  37.841  tot => 0.00650077 avg
+[mean norm]    105.728  tot => 0.0181632 avg
+[extract]      0.007
+[aggregate]    1.288
+[flow norm]    0.038
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      4.879  tot => 0.0435625 avg
+[project]      39.264  tot => 0.0292143 avg
+[cost]         30.855  tot => 0.0211916 avg
+[interpolate]  9.481  tot => 0.00651168 avg
+[mean norm]    27.972  tot => 0.0192115 avg
+[extract]      0.007
+[aggregate]    0.327
+[flow norm]    0.02
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      1.487  tot => 0.0464688 avg
+[project]      11.26  tot => 0.0293229 avg
+[cost]         8.915  tot => 0.0214303 avg
+[interpolate]  2.723  tot => 0.00654567 avg
+[mean norm]    8.038  tot => 0.0193221 avg
+[extract]      0.02
+[aggregate]    0.111
+[flow norm]    0.014
+==========================================
diff --git a/src/timing/batch_hessian_too.time b/src/timing/batch_hessian_too.time
new file mode 100644
index 0000000..cda9202
--- /dev/null
+++ b/src/timing/batch_hessian_too.time
@@ -0,0 +1,103 @@
+[start] warmup: processing 720x480 image
+[time] 665.115 (ms) : cudaMalloc
+[time] 0.11 (ms) : cudaMemcpy H->D
+[time] 216.189 (ms) : warmup
+[time] 0.221 (ms) : cudaMemcpy H<-D
+[done] warmup: primary compute time: 216.189 (ms)
+[time] 0.424 (ms) : I0, I1 cudaMalloc
+[time] 0.993 (ms) : cudaMemcpy I0, I1 H->D
+[start] pad: processing 1024x436 image
+[time] 0.219 (ms) : cudaMalloc
+[time] 0.026 (ms) : pad
+[done] pad: primary compute time: 0.026 (ms)
+[start] pad: processing 1024x436 image
+[time] 0.202 (ms) : cudaMalloc
+[time] 0.012 (ms) : pad
+[done] pad: primary compute time: 0.012 (ms)
+TIME (Image loading     ) (ms): 25.133
+TIME (Grid Memo. Alloc. ) (ms): 47.809
+TIME (Setup) (ms): 207.694
+I0 448x1024
+Constructing pyramids
+[start] constructImgPyramids: processing 1024x448 image
+[time] 1.728 (ms) : sobel: Ixs[0]
+[time] 0.327 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.279 (ms) : sobel: Ixs[i]
+[time] 0.276 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.281 (ms) : sobel: Ixs[i]
+[time] 0.268 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.268 (ms) : sobel: Ixs[i]
+[time] 0.273 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.267 (ms) : sobel: Ixs[i]
+[time] 0.28 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.268 (ms) : sobel: Ixs[i]
+[time] 0.275 (ms) : sobel: Iys[i]
+[time] 4.991 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 4.79
+[start] constructImgPyramids: processing 1024x448 image
+[time] 0.265 (ms) : sobel: Ixs[0]
+[time] 0.267 (ms) : sobel: Iys[0]
+constructImgPyramids level 1: 512x224
+[time] 0.272 (ms) : sobel: Ixs[i]
+[time] 0.271 (ms) : sobel: Iys[i]
+constructImgPyramids level 2: 256x112
+[time] 0.266 (ms) : sobel: Ixs[i]
+[time] 0.266 (ms) : sobel: Iys[i]
+constructImgPyramids level 3: 128x56
+[time] 0.272 (ms) : sobel: Ixs[i]
+[time] 0.265 (ms) : sobel: Iys[i]
+constructImgPyramids level 4: 64x28
+[time] 0.266 (ms) : sobel: Ixs[i]
+[time] 0.266 (ms) : sobel: Iys[i]
+constructImgPyramids level 5: 32x14
+[time] 0.274 (ms) : sobel: Ixs[i]
+[time] 0.265 (ms) : sobel: Iys[i]
+[time] 3.341 (ms) : total time
+[done] constructImgPyramids: primmary compute time: 3.215
+TIME (Pyramids+Gradients) (ms): 8.342
+, cflow 
+TIME (Sc: 5, #p:    32, pconst, pinit, poptim, cflow, tvopt, total):     0.08     0.00    32.09     0.12     0.26 ->    32.54 ms.
+TIME (Sc: 4, #p:   112, pconst, pinit, poptim, cflow, tvopt, total):     0.07     0.00   112.14     0.35     0.66 ->   113.21 ms.
+TIME (Sc: 3, #p:   448, pconst, pinit, poptim, cflow, tvopt, total):     0.10     0.00   436.45     1.41     2.18 ->   440.14 ms.
+TIME (O.Flow Run-Time   ) (ms): 585.922
+
+===============Timings (ms)===============
+Avg grad descent iterations:        12.9933
+[hessian]      0  tot => -nan avg
+[project]      153.341  tot => 0.0285392 avg
+[cost]         122.301  tot => 0.0210103 avg
+[interpolate]  38.586  tot => 0.00662876 avg
+[mean norm]    108.655  tot => 0.018666 avg
+[extract]      0.074
+[aggregate]    1.372
+[flow norm]    0.04
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      0  tot => -nan avg
+[project]      39.555  tot => 0.0294308 avg
+[cost]         31.504  tot => 0.0216374 avg
+[interpolate]  9.531  tot => 0.00654602 avg
+[mean norm]    28.148  tot => 0.0193324 avg
+[extract]      0.056
+[aggregate]    0.328
+[flow norm]    0.017
+==========================================
+
+===============Timings (ms)===============
+Avg grad descent iterations:        13
+[hessian]      0  tot => -nan avg
+[project]      11.309  tot => 0.0294505 avg
+[cost]         8.998  tot => 0.0216298 avg
+[interpolate]  2.746  tot => 0.00660096 avg
+[mean norm]    8.051  tot => 0.0193534 avg
+[extract]      0.073
+[aggregate]    0.108
+[flow norm]    0.013
+==========================================