diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c1b4731..1557bcc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -76,6 +76,7 @@ if (ARCH STREQUAL "ARM") endif() set(KERNELS + kernels/flowUtil.cu kernels/densify.cu kernels/interpolate.cu kernels/extract.cu @@ -98,16 +99,17 @@ set(CODEFILES # patch.cpp patchgrid.cpp refine_variational.cpp - FDF1.0.1/image.c - FDF1.0.1/opticalflow_aux.c - FDF1.0.1/solver.c + # FDF1.0.1/image.c + FDF1.0.1/image.cpp + FDF1.0.1/opticalflow_aux.cpp + FDF1.0.1/solver.cpp ) # GrayScale, Optical Flow cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS}) set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1") -set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") # 8 wide SIMD (4 floats) +set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "UNIFIED_MEM=1") # Use zero copy memory # set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD target_link_libraries(flow ${OpenCV_LIBS}) diff --git a/src/FDF1.0.1/image.c b/src/FDF1.0.1/image.cpp similarity index 97% rename from src/FDF1.0.1/image.c rename to src/FDF1.0.1/image.cpp index d8f044e..146caeb 100644 --- a/src/FDF1.0.1/image.c +++ b/src/FDF1.0.1/image.cpp @@ -6,6 +6,10 @@ #include "image.h" +#include +#include +#include "../common/cuda_helper.h" + // #include // typedef __v4sf v4sf; @@ -29,7 +33,11 @@ image_t *image_new(const int width, const int height){ image->width = width; image->height = height; image->stride = ( (width+3) / 4 ) * 4; +#if (UNIFIED_MEM) + checkCudaErrors( cudaHostAlloc((void**) &image->c1, image->stride*height*sizeof(float), cudaHostAllocMapped) ); +#else image->c1 = (float*) memalign(16, image->stride*height*sizeof(float)); +#endif if(image->c1== NULL){ fprintf(stderr, "Error: image_new() - not enough memory !\n"); exit(1); @@ -70,7 +78,11 @@ void image_delete(image_t *image){ if(image == NULL){ //fprintf(stderr, "Warning: Delete image --> Ignore action (image not allocated)\n"); }else{ +#if (UNIFIED_MEM) + cudaFree(image->c1); +#else free(image->c1); +#endif free(image); } } @@ -86,7 +98,11 @@ color_image_t *color_image_new(const int width, const int height){ image->width = width; image->height = height; image->stride = ( (width+VECTOR_WIDTH-1) / VECTOR_WIDTH ) * VECTOR_WIDTH; +#if (UNIFIED_MEM) + checkCudaErrors( cudaHostAlloc((void**) &image->c1, 3*image->stride*height*sizeof(float), cudaHostAllocMapped) ); +#else image->c1 = (float*) memalign(16, 3*image->stride*height*sizeof(float)); +#endif if(image->c1 == NULL){ fprintf(stderr, "Error: color_image_new() - not enough memory !\n"); exit(1); @@ -111,7 +127,11 @@ void color_image_erase(color_image_t *image){ /* free memory of a color image */ void color_image_delete(color_image_t *image){ if(image){ +#if (UNIFIED_MEM) + cudaFree(image->c1); +#else free(image->c1); // c2 and c3 was allocated at the same moment +#endif free(image); } } @@ -665,7 +685,7 @@ void color_image_convolve_hv(color_image_t *dst, const color_image_t *src, const dst_red = {width,height,stride,dst->c1}, dst_green = {width,height,stride,dst->c2}, dst_blue = {width,height,stride,dst->c3}; // horizontal and vertical if(horiz_conv != NULL && vert_conv != NULL){ - float *tmp_data = malloc(sizeof(float)*stride*height); + float *tmp_data = (float*) malloc(sizeof(float)*stride*height); if(tmp_data == NULL){ fprintf(stderr,"error color_image_convolve_hv(): not enough 
memory\n"); exit(1); @@ -699,7 +719,7 @@ void image_convolve_hv(image_t *dst, const image_t *src, const convolution_t *ho dst_red = {width,height,stride,dst->c1}; // horizontal and vertical if(horiz_conv != NULL && vert_conv != NULL){ - float *tmp_data = malloc(sizeof(float)*stride*height); + float *tmp_data = (float*) malloc(sizeof(float)*stride*height); if(tmp_data == NULL){ fprintf(stderr,"error image_convolve_hv(): not enough memory\n"); exit(1); diff --git a/src/FDF1.0.1/opticalflow_aux.c b/src/FDF1.0.1/opticalflow_aux.c deleted file mode 100644 index 9b6648e..0000000 --- a/src/FDF1.0.1/opticalflow_aux.c +++ /dev/null @@ -1,384 +0,0 @@ -#include -#include -#include -#include -#include "opticalflow_aux.h" - -#include - -#if (VECTOR_WIDTH == 4) -typedef float32x4_t v4sf; -#else -typedef float v4sf; -#endif - -#define datanorm 0.1f*0.1f//0.01f // square of the normalization factor -#define epsilon_color (0.001f*0.001f)//0.000001f -#define epsilon_grad (0.001f*0.001f)//0.000001f -#define epsilon_desc (0.001f*0.001f)//0.000001f -#define epsilon_smooth (0.001f*0.001f)//0.000001f - -/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void image_warp(image_t *dst, image_t *mask, const image_t *src, const image_t *wx, const image_t *wy) -#else -void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) -#endif -{ - int i, j, offset, incr_line = mask->stride-mask->width, x, y, x1, x2, y1, y2; - float xx, yy, dx, dy; - for(j=0,offset=0 ; jheight ; j++) - { - for(i=0 ; iwidth ; i++,offset++) - { - xx = i+wx->c1[offset]; - yy = j+wy->c1[offset]; - x = floor(xx); - y = floor(yy); - dx = xx-x; - dy = yy-y; - mask->c1[offset] = (xx>=0 && xx<=src->width-1 && yy>=0 && yy<=src->height-1); - x1 = MINMAX_TA(x,src->width); - x2 = MINMAX_TA(x+1,src->width); - y1 = MINMAX_TA(y,src->height); - y2 = MINMAX_TA(y+1,src->height); - dst->c1[offset] = - src->c1[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c1[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c1[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c1[y2*src->stride+x2]*dx*dy; -#if (SELECTCHANNEL==3) - dst->c2[offset] = - src->c2[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c2[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c2[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c2[y2*src->stride+x2]*dx*dy; - dst->c3[offset] = - src->c3[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c3[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c3[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c3[y2*src->stride+x2]*dx*dy; -#endif - } - offset += incr_line; - } -} - - -/* compute image first and second order spatio-temporal derivatives of a color image */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void get_derivatives(const image_t *im1, const image_t *im2, const convolution_t *deriv, - image_t *dx, image_t *dy, image_t *dt, - image_t *dxx, image_t *dxy, image_t *dyy, image_t *dxt, image_t *dyt) -#else -void get_derivatives(const color_image_t *im1, const color_image_t *im2, const convolution_t *deriv, - color_image_t *dx, color_image_t *dy, color_image_t *dt, - color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt) -#endif -{ - // derivatives are computed on the mean of the first image and the warped second image -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) - 
image_t *tmp_im2 = image_new(im2->width,im2->height); - v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; -#if (VECTOR_WIDTH == 4) - const v4sf half = {0.5f,0.5f,0.5f,0.5f}; -#else - const v4sf half = 0.5f; -#endif - int i=0; - for(i=0 ; iheight*im1->stride/VECTOR_WIDTH ; i++){ - *tmp_im2p = half * ( (*im2p) + (*im1p) ); - *dtp = (*im2p)-(*im1p); - dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; - } - // compute all other derivatives - image_convolve_hv(dx, tmp_im2, deriv, NULL); - image_convolve_hv(dy, tmp_im2, NULL, deriv); - image_convolve_hv(dxx, dx, deriv, NULL); - image_convolve_hv(dxy, dx, NULL, deriv); - image_convolve_hv(dyy, dy, NULL, deriv); - image_convolve_hv(dxt, dt, deriv, NULL); - image_convolve_hv(dyt, dt, NULL, deriv); - // free memory - image_delete(tmp_im2); -#else - color_image_t *tmp_im2 = color_image_new(im2->width,im2->height); - v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; -#if (VECTOR_WIDTH == 4) - const v4sf half = {0.5f,0.5f,0.5f,0.5f}; -#else - const v4sf half = 0.5f; -#endif - int i=0; - for(i=0 ; i<3*im1->height*im1->stride/VECTOR_WIDTH; i++){ - *tmp_im2p = half * ( (*im2p) + (*im1p) ); - *dtp = (*im2p)-(*im1p); - dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; - } - // compute all other derivatives - color_image_convolve_hv(dx, tmp_im2, deriv, NULL); - color_image_convolve_hv(dy, tmp_im2, NULL, deriv); - color_image_convolve_hv(dxx, dx, deriv, NULL); - color_image_convolve_hv(dxy, dx, NULL, deriv); - color_image_convolve_hv(dyy, dy, NULL, deriv); - color_image_convolve_hv(dxt, dt, deriv, NULL); - color_image_convolve_hv(dyt, dt, NULL, deriv); - // free memory - color_image_delete(tmp_im2); -#endif -} - - -/* compute the smoothness term */ -/* It is represented as two images, the first one for horizontal smoothness, the second for vertical - in dst_horiz, the pixel i,j represents the smoothness weight between pixel i,j and i,j+1 - in dst_vert, the pixel i,j represents the smoothness weight between pixel i,j and i+1,j */ -void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, const convolution_t *deriv_flow, const float quarter_alpha){ - const int width = uu->width, height = vv->height, stride = uu->stride; - int j; - image_t *ux = image_new(width,height), *vx = image_new(width,height), *uy = image_new(width,height), *vy = image_new(width,height), *smoothness = image_new(width,height); - // compute derivatives [-0.5 0 0.5] - convolve_horiz(ux, uu, deriv_flow); - convolve_horiz(vx, vv, deriv_flow); - convolve_vert(uy, uu, deriv_flow); - convolve_vert(vy, vv, deriv_flow); - // compute smoothness - v4sf *uxp = (v4sf*) ux->c1, *vxp = (v4sf*) vx->c1, *uyp = (v4sf*) uy->c1, *vyp = (v4sf*) vy->c1, *sp = (v4sf*) smoothness->c1; -#if (VECTOR_WIDTH == 4) - const v4sf qa = {quarter_alpha,quarter_alpha,quarter_alpha,quarter_alpha}; - const v4sf epsmooth = {epsilon_smooth,epsilon_smooth,epsilon_smooth,epsilon_smooth}; -#else - const v4sf qa = quarter_alpha; - const v4sf epsmooth = epsilon_smooth; -#endif - for(j=0 ; j< height*stride/VECTOR_WIDTH ; j++){ -#if (VECTOR_WIDTH == 4) - *sp = qa / vsqrtq_f32( - (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); -#else - *sp = qa / sqrtf( - (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); -#endif - sp+=1;uxp+=1; uyp+=1; vxp+=1; vyp+=1; - } - image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); - // compute 
dst_horiz - v4sf *dsthp = (v4sf*) dst_horiz->c1; sp = (v4sf*) smoothness->c1; - float *sp_shift = (float*) memalign(16, stride*sizeof(float)); // aligned shifted copy of the current line - for(j=0;jc1[j*stride+width-1], 0, sizeof(float)*(stride-width+1)); - } - free(sp_shift); - // compute dst_vert - v4sf *dstvp = (v4sf*) dst_vert->c1, *sp_bottom = (v4sf*) (smoothness->c1+stride); sp = (v4sf*) smoothness->c1; - for(j=0 ; j<(height-1)*stride/VECTOR_WIDTH ; j++){ - *dstvp = (*sp) + (*sp_bottom); - dstvp+=1; sp+=1; sp_bottom+=1; - } - memset( &dst_vert->c1[(height-1)*stride], 0, sizeof(float)*stride); - image_delete(smoothness); -} - - - - - -/* sub the laplacian (smoothness term) to the right-hand term */ -void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert){ - int j; - const int offsetline = src->stride-src->width; - float *src_ptr = src->c1, *dst_ptr = dst->c1, *weight_horiz_ptr = weight_horiz->c1; - // horizontal filtering - for(j=src->height+1;--j;){ // faster than for(j=0;jheight;j++) - int i; - for(i=src->width;--i;){ - const float tmp = (*weight_horiz_ptr)*((*(src_ptr+1))-(*src_ptr)); - *dst_ptr += tmp; - *(dst_ptr+1) -= tmp; - dst_ptr++; - src_ptr++; - weight_horiz_ptr++; - } - dst_ptr += offsetline+1; - src_ptr += offsetline+1; - weight_horiz_ptr += offsetline+1; - } - - v4sf *wvp = (v4sf*) weight_vert->c1, *srcp = (v4sf*) src->c1, *srcp_s = (v4sf*) (src->c1+src->stride), *dstp = (v4sf*) dst->c1, *dstp_s = (v4sf*) (dst->c1+src->stride); - for(j=1+(src->height-1)*src->stride/VECTOR_WIDTH ; --j ;){ - const v4sf tmp = (*wvp) * ((*srcp_s)-(*srcp)); - *dstp += tmp; - *dstp_s -= tmp; - wvp+=1; srcp+=1; srcp_s+=1; dstp+=1; dstp_s+=1; - } -} - -/* compute the dataterm // REMOVED MATCHING TERM - a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side - other (color) images are input */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#else -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#endif -{ -#if (VECTOR_WIDTH == 4) - const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm}; - const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3}; - const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color}; - const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3}; - const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad}; - //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta}; - //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc}; -#else - const v4sf dnorm = datanorm; - const v4sf hdover3 = half_delta_over3; - const v4sf epscolor = epsilon_color; - const v4sf hgover3 = half_gamma_over3; - const v4sf epsgrad 
= epsilon_grad; -#endif - - v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1, - *maskp = (v4sf*) mask->c1, - *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, - *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, - *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, -#if (SELECTCHANNEL==3) - *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, - *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, -#endif - *uup = (v4sf*) uu->c1, *vvp = (v4sf*)vv->c1, *wxp = (v4sf*)wx->c1, *wyp = (v4sf*)wy->c1; - - - memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); - memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); - - int i; - for(i = 0 ; iheight*uu->stride/VECTOR_WIDTH ; i++){ - v4sf tmp, tmp2, n1, n2; -#if (SELECTCHANNEL==3) - v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; -#endif - // dpsi color - if(half_delta_over3){ - tmp = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp); - n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; -#if (SELECTCHANNEL==3) - tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp); - n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; - tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); - n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hdover3 / vsqrtq_f32(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); -#else - tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); -#endif - tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; -#else -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hdover3 / vsqrtq_f32(3 * tmp*tmp/n1 + epscolor); -#else - tmp = (*maskp) * hdover3 / sqrtf(3 * tmp*tmp/n1 + epscolor); -#endif - tmp /= n1; -#endif - *a11p += tmp * (*ix1p) * (*ix1p); - *a12p += tmp * (*ix1p) * (*iy1p); - *a22p += tmp * (*iy1p) * (*iy1p); - *b1p -= tmp * (*iz1p) * (*ix1p); - *b2p -= tmp * (*iz1p) * (*iy1p); -#if (SELECTCHANNEL==3) - *a11p += tmp2 * (*ix2p) * (*ix2p); - *a12p += tmp2 * (*ix2p) * (*iy2p); - *a22p += tmp2 * (*iy2p) * (*iy2p); - *b1p -= tmp2 * (*iz2p) * (*ix2p); - *b2p -= tmp2 * (*iz2p) * (*iy2p); - *a11p += tmp3 * (*ix3p) * (*ix3p); - *a12p += tmp3 * (*ix3p) * (*iy3p); - *a22p += tmp3 * (*iy3p) * (*iy3p); - *b1p -= tmp3 * (*iz3p) * (*ix3p); - *b2p -= tmp3 * (*iz3p) * (*iy3p); -#endif - } - - // dpsi gradient - n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; - n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; - tmp = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp); - tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp); -#if (SELECTCHANNEL==3) - n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; - n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; - tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp); - tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp); - n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; - n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; - tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); - tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); -#if (VECTOR_WIDTH == 4) - tmp = 
(*maskp) * hgover3 / vsqrtq_f32( - tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); -#else - tmp = (*maskp) * hgover3 / sqrtf( - tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); -#endif - tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; -#else -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hgover3 / vsqrtq_f32(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); -#else - tmp = (*maskp) * hgover3 / sqrtf(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); -#endif - tmp2 = tmp/n2; tmp /= n1; -#endif - *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); - *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p); - *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p); - *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); - *b2p -= tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p); -#if (SELECTCHANNEL==3) - *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); - *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p); - *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p); - *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); - *b2p -= tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p); - *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); - *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p); - *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p); - *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); - *b2p -= tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p); -#endif - - -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // multiply system to make smoothing parameters same for RGB and single-channel image - *a11p *= 3; - *a12p *= 3; - *a22p *= 3; - *b1p *= 3; - *b2p *= 3; - -#endif - - dup+=1; dvp+=1; maskp+=1; a11p+=1; a12p+=1; a22p+=1; b1p+=1; b2p+=1; - ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1; -#if (SELECTCHANNEL==3) - ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1; - ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1; -#endif - uup+=1;vvp+=1;wxp+=1; wyp+=1; - - } -} - diff --git a/src/FDF1.0.1/opticalflow_aux.cpp b/src/FDF1.0.1/opticalflow_aux.cpp new file mode 100644 index 0000000..bc7f956 --- /dev/null +++ b/src/FDF1.0.1/opticalflow_aux.cpp @@ -0,0 +1,115 @@ +#include +#include +#include +#include + +#include "opticalflow_aux.h" +#include "../kernels/flowUtil.h" +#include "../common/timer.h" + +#include + +using namespace timer; + +#if (VECTOR_WIDTH == 4) +typedef float32x4_t v4sf; +#else +typedef float v4sf; +#endif + +#define datanorm 0.1f*0.1f//0.01f // square of the normalization factor +#define epsilon_color (0.001f*0.001f)//0.000001f +#define epsilon_grad (0.001f*0.001f)//0.000001f +#define epsilon_desc (0.001f*0.001f)//0.000001f +#define epsilon_smooth (0.001f*0.001f)//0.000001f + +/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. 
dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ +void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) +{ + + cu::warpImage(dst, mask, src, wx, wy); + +} + + +/* compute image first and second order spatio-temporal derivatives of a color image */ +void get_derivatives( + const color_image_t *im1, const color_image_t *im2, float *pDeviceKernel, + color_image_t *dx, color_image_t *dy, color_image_t *dt, + color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt) +{ + // derivatives are computed on the mean of the first image and the warped second image + color_image_t *tmp_im2 = color_image_new(im2->width,im2->height); + + int height = im2->height; + int width = im2->width; + int stride = im2->stride; + + cu::getMeanImageAndDiff(im1->c1, im2->c1, tmp_im2->c1, dt->c1, im1->height, im1->stride); + + // compute all other derivatives + cu::colorImageDerivative(dx->c1, tmp_im2->c1, pDeviceKernel, height, width, stride, true); // horizontal + cu::colorImageDerivative(dy->c1, tmp_im2->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dxx->c1, dx->c1, pDeviceKernel, height, width, stride, true); + cu::colorImageDerivative(dxy->c1, dx->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dyy->c1, dy->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dxt->c1, dt->c1, pDeviceKernel, height, width, stride, true); + cu::colorImageDerivative(dyt->c1, dt->c1, pDeviceKernel, height, width, stride, false); + + // free memory + color_image_delete(tmp_im2); +} + + +/* compute the smoothness term */ +/* It is represented as two images, the first one for horizontal smoothness, the second for vertical + in dst_horiz, the pixel i,j represents the smoothness weight between pixel i,j and i,j+1 + in dst_vert, the pixel i,j represents the smoothness weight between pixel i,j and i+1,j */ +void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, float *deriv_flow, const float quarter_alpha){ + const int width = uu->width, height = vv->height, stride = uu->stride; + int j; + image_t *ux = image_new(width,height), *vx = image_new(width,height), *uy = image_new(width,height), *vy = image_new(width,height), *smoothness = image_new(width,height); + + // compute derivatives [-0.5 0 0.5] + cu::imageDerivative(ux->c1, uu->c1, deriv_flow, height, width, stride, true); + cu::imageDerivative(vx->c1, vv->c1, deriv_flow, height, width, stride, true); + cu::imageDerivative(uy->c1, uu->c1, deriv_flow, height, width, stride, false); + cu::imageDerivative(vy->c1, vv->c1, deriv_flow, height, width, stride, false); + + cu::smoothnessTerm( + dst_horiz->c1, dst_vert->c1, smoothness->c1, + ux->c1, uy->c1, vx->c1, vy->c1, + quarter_alpha, epsilon_smooth, + height, width, stride); + + // Cleanup extra columns + for(j=0;jc1[j*stride+width-1], 0, sizeof(float)*(stride-width+1)); + } + // Cleanup last row + memset( &dst_vert->c1[(height-1)*stride], 0, sizeof(float)*stride); + + image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); + image_delete(smoothness); +} + + + + + +/* sub the laplacian (smoothness term) to the right-hand term */ +void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert){ + + cu::subLaplacianHoriz(src->c1, dst->c1, weight_horiz->c1, src->height, src->width, src->stride); + + 
cu::subLaplacianVert(src->c1, dst->c1, weight_vert->c1, src->height, src->stride); +} + +/* compute the dataterm // REMOVED MATCHING TERM + a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side + other (color) images are input */ +void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) +{ + return cu::dataTerm(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, half_delta_over3, half_beta, half_gamma_over3); +} + diff --git a/src/FDF1.0.1/opticalflow_aux.h b/src/FDF1.0.1/opticalflow_aux.h index ebca1f1..f78443d 100644 --- a/src/FDF1.0.1/opticalflow_aux.h +++ b/src/FDF1.0.1/opticalflow_aux.h @@ -9,25 +9,14 @@ extern "C" { #include "image.h" -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image -/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ -void image_warp(image_t *dst, image_t *mask, const image_t *src, const image_t *wx, const image_t *wy); - -/* compute image first and second order spatio-temporal derivatives of a color image */ -void get_derivatives(const image_t *im1, const image_t *im2, const convolution_t *deriv, image_t *dx, image_t *dy, image_t *dt, image_t *dxx, image_t *dxy, image_t *dyy, image_t *dxt, image_t *dyt); - -#else // use RGB image_new /* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy); /* compute image first and second order spatio-temporal derivatives of a color image */ -void get_derivatives(const color_image_t *im1, const color_image_t *im2, const convolution_t *deriv, color_image_t *dx, color_image_t *dy, color_image_t *dt, color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt); +void get_derivatives(const color_image_t *im1, const color_image_t *im2, float *deriv, color_image_t *dx, color_image_t *dy, color_image_t *dt, color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt); -#endif - - /* compute the smoothness term */ -void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, const convolution_t *deriv_flow, const float quarter_alpha); +void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, float *kernel, const float quarter_alpha); // void compute_smoothness_SF(image_t *dst_horiz, image_t *dst_vert, const image_t *xx1, const image_t *xx2, const image_t *yy, const image_t *xx3, const convolution_t *deriv_flow, const float quarter_alpha); /* sub the laplacian (smoothness term) to the right-hand term */ @@ -36,12 +25,7 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz /* compute the dataterm and ... 
REMOVED THE MATCHING TERM a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side other (color) images are input */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#else void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#endif - #ifdef __cplusplus } diff --git a/src/FDF1.0.1/solver.c b/src/FDF1.0.1/solver.c deleted file mode 100644 index 162b3a0..0000000 --- a/src/FDF1.0.1/solver.c +++ /dev/null @@ -1,426 +0,0 @@ -#include -#include -#include -#include -#include - -#include - -#include "image.h" -#include "solver.h" - -// #include -// #if (VECTOR_WIDTH == 4) -// typedef float32x4_t v4sf; -// #else -// typedef float v4sf; -// #endif - -//THIS IS A SLOW VERSION BUT READABLE -//Perform n iterations of the sor_coupled algorithm -//du and dv are used as initial guesses -//The system form is the same as in opticalflow.c -void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega) -{ - int i,j,iter; - for(iter = 0 ; iterheight ; j++) - { - float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2;//,det; - for(i=0 ; iwidth ; i++) - { - sigma_u = 0.0f; - sigma_v = 0.0f; - sum_dpsis = 0.0f; - if(j>0) - { - sigma_u -= dpsis_vert->c1[(j-1)*du->stride+i]*du->c1[(j-1)*du->stride+i]; - sigma_v -= dpsis_vert->c1[(j-1)*du->stride+i]*dv->c1[(j-1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[(j-1)*du->stride+i]; - } - if(i>0) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i-1]*du->c1[j*du->stride+i-1]; - sigma_v -= dpsis_horiz->c1[j*du->stride+i-1]*dv->c1[j*du->stride+i-1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i-1]; - } - if(jheight-1) - { - sigma_u -= dpsis_vert->c1[j*du->stride+i]*du->c1[(j+1)*du->stride+i]; - sigma_v -= dpsis_vert->c1[j*du->stride+i]*dv->c1[(j+1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[j*du->stride+i]; - } - if(iwidth-1) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i]*du->c1[j*du->stride+i+1]; - sigma_v -= dpsis_horiz->c1[j*du->stride+i]*dv->c1[j*du->stride+i+1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i]; - } - A11 = a11->c1[j*du->stride+i]+sum_dpsis; - A12 = a12->c1[j*du->stride+i]; - A22 = a22->c1[j*du->stride+i]+sum_dpsis; - //det = A11*A22-A12*A12; - B1 = b1->c1[j*du->stride+i]-sigma_u; - B2 = b2->c1[j*du->stride+i]-sigma_v; -// du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] +omega*( A22*B1-A12*B2)/det; -// dv->c1[j*du->stride+i] = (1.0f-omega)*dv->c1[j*du->stride+i] +omega*(-A12*B1+A11*B2)/det; - du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] + omega/A11 *(B1 - A12* dv->c1[j*du->stride+i] ); - dv->c1[j*du->stride+i] = (1.0f-omega)*dv->c1[j*du->stride+i] + omega/A22 *(B2 - A12* 
du->c1[j*du->stride+i] ); - - - } - } - } -} - - // THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION - // the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal - // each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line -// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ -// //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); -// -// if(du->width<2 || du->height<2 || iterations < 1){ -// sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); -// return; -// } -// -// const int stride = du->stride, width = du->width; -// const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); -// int j,iter,i,k; -// float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); -// if(floatarray==NULL){ -// fprintf(stderr, "error in sor_coupled(): not enough memory\n"); -// exit(1); -// } -// float *f1 = floatarray; -// float *f2 = f1+stride; -// float *f3 = f2+stride; -// f1[0] = 0.0f; -// memset(&f1[width], 0, sizeof(float)*(stride-width)); -// memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); -// memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); -// -// { // first iteration -// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; -// float *du_ptr = du->c1, *dv_ptr = dv->c1; -// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); -// -// { // first iteration - first line -// -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) 
+ (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// v4sf *vpt = (v4sf*) dpsis_vert->c1; -// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; -// -// for(j=iterheight;--j;){ // first iteration - middle lines -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// { // first iteration - last line -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); 
-// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// } -// -// -// -// for(iter=iterations;--iter;) // other iterations -// { -// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; -// float *du_ptr = du->c1, *dv_ptr = dv->c1; -// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); -// -// { // other iteration - first line -// -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// v4sf *vpt = (v4sf*) dpsis_vert->c1; -// v4sf *dut = 
(v4sf*) du->c1, *dvt = (v4sf*) dv->c1; -// -// for(j=iterheight;--j;) // other iteration - middle lines -// { -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++) -// { -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// for(i=iterline; --i;) -// { -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++) -// { -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// { // other iteration - last line -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// } -// -// -// -// free(floatarray); -// -// } - diff --git 
a/src/kernels/flowUtil.cu b/src/kernels/flowUtil.cu new file mode 100644 index 0000000..fd55be5 --- /dev/null +++ b/src/kernels/flowUtil.cu @@ -0,0 +1,892 @@ +// System +#include +#include +#include +#include + +// CUDA +#include +#include + +// NVIDIA Perf Primitives +#include +#include + +#include "../common/timer.h" +#include "../FDF1.0.1/image.h" +#include "flowUtil.h" + +using namespace timer; + +#define datanorm 0.1f*0.1f //0.01f // square of the normalization factor +#define epsilon_color (0.001f*0.001f) //0.000001f +#define epsilon_grad (0.001f*0.001f) //0.000001f +#define epsilon_desc (0.001f*0.001f) //0.000001f +#define epsilon_smooth (0.001f*0.001f) //0.000001f + +__global__ void kernelDataTerm( + float *a11c1, float *a12c1, float *a22c1, + float *b1c1, float *b2c1, + float *maskc1, + float *wxc1, float *wyc1, + float *duc1, float *dvc1, + float *uuc1, float *vvc1, + float *Ixc1, float *Ixc2, float *Ixc3, + float *Iyc1, float *Iyc2, float *Iyc3, + float *Izc1, float *Izc2, float *Izc3, + float *Ixxc1, float *Ixxc2, float *Ixxc3, + float *Ixyc1, float *Ixyc2, float *Ixyc3, + float *Iyyc1, float *Iyyc2, float *Iyyc3, + float *Ixzc1, float *Ixzc2, float *Ixzc3, + float *Iyzc1, float *Iyzc2, float *Iyzc3, + const float half_delta_over3, const float half_beta, const float half_gamma_over3, int N) { + + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + + if (tidx < N) { + + const float dnorm = datanorm; + const float hdover3 = half_delta_over3; + const float epscolor = epsilon_color; + const float hgover3 = half_gamma_over3; + const float epsgrad = epsilon_grad; + + float *dup = (float*) duc1 + tidx, + *dvp = (float*) dvc1 + tidx, + *maskp = (float*) maskc1 + tidx, + *a11p = (float*) a11c1 + tidx, + *a12p = (float*) a12c1 + tidx, + *a22p = (float*) a22c1 + tidx, + *b1p = (float*) b1c1 + tidx, + *b2p = (float*) b2c1 + tidx, + *ix1p = (float*) Ixc1 + tidx, + *iy1p=(float*)Iyc1 + tidx, + *iz1p=(float*)Izc1 + tidx, + *ixx1p=(float*)Ixxc1 + tidx, + *ixy1p=(float*)Ixyc1 + tidx, + *iyy1p=(float*)Iyyc1 + tidx, + *ixz1p=(float*)Ixzc1 + tidx, + *iyz1p=(float*) Iyzc1 + tidx, + *ix2p = (float*) Ixc2 + tidx, + *iy2p=(float*)Iyc2 + tidx, + *iz2p=(float*)Izc2 + tidx, + *ixx2p=(float*)Ixxc2 + tidx, + *ixy2p=(float*)Ixyc2 + tidx, + *iyy2p=(float*)Iyyc2 + tidx, + *ixz2p=(float*)Ixzc2 + tidx, + *iyz2p=(float*) Iyzc2 + tidx, + *ix3p = (float*) Ixc3 + tidx, + *iy3p=(float*)Iyc3 + tidx, + *iz3p=(float*)Izc3 + tidx, + *ixx3p=(float*)Ixxc3 + tidx, + *ixy3p=(float*)Ixyc3 + tidx, + *iyy3p=(float*)Iyyc3 + tidx, + *ixz3p=(float*)Ixzc3 + tidx, + *iyz3p=(float*) Iyzc3 + tidx; + + + float tmp, tmp2, n1, n2; + float tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; + + // dpsi color + if(half_delta_over3){ + tmp = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp); + n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; + tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp); + n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; + tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); + n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; + tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); + tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; + *a11p += tmp * (*ix1p) * (*ix1p); + *a12p += tmp * (*ix1p) * (*iy1p); + *a22p += tmp * (*iy1p) * (*iy1p); + *b1p -= tmp * (*iz1p) * (*ix1p); + *b2p -= tmp * (*iz1p) * (*iy1p); + *a11p += tmp2 * (*ix2p) * (*ix2p); + *a12p += tmp2 * (*ix2p) * (*iy2p); + *a22p += tmp2 * (*iy2p) * (*iy2p); + *b1p -= tmp2 * (*iz2p) * (*ix2p); + *b2p -= tmp2 * (*iz2p) * (*iy2p); + *a11p += tmp3 * (*ix3p) * 
(*ix3p); + *a12p += tmp3 * (*ix3p) * (*iy3p); + *a22p += tmp3 * (*iy3p) * (*iy3p); + *b1p -= tmp3 * (*iz3p) * (*ix3p); + *b2p -= tmp3 * (*iz3p) * (*iy3p); + } + + // dpsi gradient + n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; + n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; + tmp = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp); + tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp); + n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; + n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; + tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp); + tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp); + n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; + n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; + tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); + tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); + tmp = (*maskp) * hgover3 / sqrtf( + tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); + tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; + *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); + *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p); + *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p); + *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); + *b2p -= tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p); + *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); + *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p); + *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p); + *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); + *b2p -= tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p); + *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); + *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p); + *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p); + *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); + *b2p -= tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p); + } + +} + +__global__ void kernelSubLaplacianVert( + float *src, float *nextSrc, + float *dst, float *nextDst, + float *weights, int height, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + + if (tidx < stride) { + float *wvp = weights + tidx, + *srcp = src + tidx, + *srcp_s = nextSrc + tidx, + *dstp = dst + tidx, + *dstp_s = nextDst + tidx; + + for (int j = 0; j < height - 1; j++) { + float tmp = (*wvp) * ((*srcp_s)-(*srcp)); + *dstp += tmp; + *dstp_s -= tmp; + wvp += stride; srcp += stride; srcp_s += stride; dstp += stride; dstp_s += stride; + } + } + +} + +__global__ void kernelSubLaplacianHoriz( + float *src, float *dst, float *weights, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int col = tidx % width; + + const int BLOCK_HEIGHT = 1; + + if (tidx < width) { + float *pSrc = src + tidx, + *pDst = dst + tidx, + *pWeight = weights + tidx, + *pCoeffCalc = coeffs + tidx, + *pCoeffUpdate = pCoeffCalc; + + int nBlocks = (height + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT; + int jCalc = 0; + int jUpdate = 0; + + // Block calculation and update so coeffs fit in cache + + for (int iBlock = 0; iBlock < nBlocks; iBlock++) { + + // Calc coeffs + for (int j = 0; j < BLOCK_HEIGHT && jCalc < height; j++, jCalc++) { + // Do not calculate the last column + if (col != width - 1) + *pCoeffCalc = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + + pSrc += stride; pWeight += stride; pCoeffCalc += stride; + } + + // Update dst + for (int j = 0; j < BLOCK_HEIGHT && jUpdate < height; j++, 
jUpdate++) { + float update = 0.0; + + if (col != 0) + update -= *(pCoeffUpdate - 1); + if (col != width - 1) + update += *pCoeffUpdate; + + *pDst += update; + + pDst += stride; pCoeffUpdate += stride; + } + } + } +} + +__global__ void kernelSubLaplacianHorizFillCoeffs( + float *src, float *weights, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int row = tidx / stride; + int col = tidx % stride; + + // Do not calculate the last column + if (tidx < width && col != width - 1) { + float *pSrc = src + tidx, + *pWeight = weights + tidx, + *pCoeff = coeffs + tidx; + + for (int j = 0; j < height; j++) { + *pCoeff = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + + pSrc += stride; pWeight += stride; pCoeff += stride; + } + } + + // // Do not calculate the last column + // if (col < width - 1) { + // float *pSrc = src + tidx, + // *pWeight = weights + tidx, + // *pCoeff = coeffs + tidx; + + // *pCoeff = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + // } +} + +__global__ void kernelSubLaplacianHorizApplyCoeffs( + float *dst, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int row = tidx / stride; + int col = tidx % stride; + + if (tidx < width) { + + float *pDst = dst + tidx, + *pCoeff = coeffs + tidx; + + for (int j = 0; j < height; j++) { + float update = 0.0; + + if (col != 0) + update -= *(pCoeff - 1); + if (col != width - 1) + update += *pCoeff; + + *pDst += update; + + pDst += stride; pCoeff += stride; + } + } + + // if (col < width) { + + // float *pDst = dst + tidx, + // *pCoeff = coeffs + tidx; + + // float update = 0.0; + + // if (col != 0) + // update -= *(pCoeff - 1); + // if (col != width - 1) + // update += *pCoeff; + + // *pDst += update; + // } +} + +__global__ void kernelSorStep( + float *du, float *dv, + float *a11, float *a12, float *a22, + const float *b1, const float *b2, + const float *horiz, const float *vert, + const int iterations, const float omega, + int height, int width, int stride, bool odd) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int j = tidx / width; + int i = tidx % width; + + bool shouldRun = (odd) + ? 
((i + j) % 2 == 1) + : ((i + j) % 2 == 0); + + if (tidx < width * height && shouldRun) { + + float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2; + sigma_u = 0.0f; + sigma_v = 0.0f; + sum_dpsis = 0.0f; + + int here = j * stride + i; + int left = j * stride + i - 1; + int right = j * stride + i + 1; + int up = (j-1) * stride + i; + int down = (j+1) * stride + i; + + if(j>0) + { + sigma_u -= vert[up] * du[up]; + sigma_v -= vert[up] * dv[up]; + sum_dpsis += vert[up]; + } + if(i>0) + { + sigma_u -= horiz[left] * du[left]; + sigma_v -= horiz[left] * dv[left]; + sum_dpsis += horiz[left]; + } + if(j= 0 && xx < width && yy >= 0 && yy < height); + + int x1 = MINMAX_TA(x, width); + int x2 = MINMAX_TA(x + 1, width); + int y1 = MINMAX_TA(y, height); + int y2 = MINMAX_TA(y + 1, height); + + dst1[offset] = + src1[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src1[y1 * stride + x2] * dx * (1.0f-dy) + + src1[y2 * stride + x1] * (1.0f-dx) * dy + + src1[y2 * stride + x2] * dx * dy; + dst2[offset] = + src2[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src2[y1 * stride + x2] * dx * (1.0f-dy) + + src2[y2 * stride + x1] * (1.0f-dx) * dy + + src2[y2 * stride + x2] * dx * dy; + dst3[offset] = + src3[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src3[y1 * stride + x2] * dx * (1.0f-dy) + + src3[y2 * stride + x1] * (1.0f-dx) * dy + + src3[y2 * stride + x2] * dx * dy; + + } +} + + +namespace cu { + + void dataTerm( + image_t *a11, image_t *a12, image_t *a22, + image_t *b1, image_t *b2, + image_t *mask, + image_t *wx, image_t *wy, + image_t *du, image_t *dv, + image_t *uu, image_t *vv, + color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, + color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, + color_image_t *Ixz, color_image_t *Iyz, + const float half_delta_over3, const float half_beta, const float half_gamma_over3) { + + memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); + memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); + + // Set up device pointers + float *a11c1, + *a12c1, *a22c1, + *b1c1, *b2c1, + *maskc1, + *wxc1, *wyc1, + *duc1, *dvc1, + *uuc1, *vvc1, + *Ixc1, *Ixc2, *Ixc3, + *Iyc1, *Iyc2, *Iyc3, + *Izc1, *Izc2, *Izc3, + *Ixxc1, *Ixxc2, *Ixxc3, + *Ixyc1, *Ixyc2, *Ixyc3, + *Iyyc1, *Iyyc2, *Iyyc3, + *Ixzc1, *Ixzc2, *Ixzc3, + *Iyzc1, *Iyzc2, *Iyzc3; + + checkCudaErrors( cudaHostGetDevicePointer(&a11c1, a11->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&a12c1, a12->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&a22c1, a22->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&b1c1, b1->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&b2c1, b2->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&maskc1, mask->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&wxc1, wx->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&wyc1, wy->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&duc1, du->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&dvc1, dv->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&uuc1, uu->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&vvc1, vv->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc1, Ix->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc2, Ix->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc3, Ix->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Iyc1, Iy->c1, 0) ); + checkCudaErrors( 
cudaHostGetDevicePointer(&Iyc2, Iy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyc3, Iy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc1, Iz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc2, Iz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc3, Iz->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc1, Ixx->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc2, Ixx->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc3, Ixx->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc1, Ixy->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc2, Ixy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc3, Ixy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc1, Iyy->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc2, Iyy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc3, Iyy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc1, Ixz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc2, Ixz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc3, Ixz->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc1, Iyz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc2, Iyz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc3, Iyz->c3, 0) );
+
+    int N = uu->height*uu->stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelDataTerm<<<nBlocks, nThreadsPerBlock>>>(
+        a11c1, a12c1, a22c1,
+        b1c1, b2c1,
+        maskc1,
+        wxc1, wyc1,
+        duc1, dvc1,
+        uuc1, vvc1,
+        Ixc1, Ixc2, Ixc3,
+        Iyc1, Iyc2, Iyc3,
+        Izc1, Izc2, Izc3,
+        Ixxc1, Ixxc2, Ixxc3,
+        Ixyc1, Ixyc2, Ixyc3,
+        Iyyc1, Iyyc2, Iyyc3,
+        Ixzc1, Ixzc2, Ixzc3,
+        Iyzc1, Iyzc2, Iyzc3,
+        half_delta_over3, half_beta, half_gamma_over3, N);
+
+  };
+
+
+  void subLaplacianHoriz(
+      float *src, float *dst, float *weights, int height, int width, int stride) {
+
+    float *pDeviceCoeffs;
+    checkCudaErrors( cudaMalloc((void**) &pDeviceCoeffs, height * stride * sizeof(float)) );
+
+    // Setup device pointers
+    float *pDeviceSrc, *pDeviceDst, *pDeviceWeights;
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceSrc, src, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceDst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceWeights, weights, 0) );
+
+    int N = width;
+    // int N = height * stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    auto start_horiz = now();
+
+    kernelSubLaplacianHorizFillCoeffs<<<nBlocks, nThreadsPerBlock>>>(
+        pDeviceSrc, pDeviceWeights, pDeviceCoeffs, height, width, stride);
+
+    kernelSubLaplacianHorizApplyCoeffs<<<nBlocks, nThreadsPerBlock>>>(
+        pDeviceDst, pDeviceCoeffs, height, width, stride);
+
+    // kernelSubLaplacianHoriz<<<nBlocks, nThreadsPerBlock>>>(
+    //     pDeviceSrc, pDeviceDst, pDeviceWeights, pDeviceCoeffs, height, width, stride);
+    cudaDeviceSynchronize();
+    calc_print_elapsed("laplacian horiz", start_horiz);
+
+    cudaFree(pDeviceCoeffs);
+  }
+
+  void subLaplacianVert(
+      float *src, float *dst, float *weights, int height, int stride) {
+
+    float *d_src, *d_dst, *d_weights;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_weights, weights, 0) );
+
+    int N = stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    auto start_vert = now();
+    kernelSubLaplacianVert<<<nBlocks, nThreadsPerBlock>>>(
+        d_src, d_src + stride, d_dst, d_dst + stride, d_weights, height, stride);
+    calc_print_elapsed("laplacian vert", start_vert);
+
+  }
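The cu:: wrappers above never stage the image buffers with cudaMemcpy: they pass host pointers to cudaHostGetDevicePointer and launch kernels on the returned aliases, which assumes those buffers were allocated elsewhere as mapped (zero-copy) pinned host memory. As an aside, and not part of the patch, here is a minimal self-contained sketch of that zero-copy pattern; the kernel, buffer size, and launch configuration are made up for illustration.

// Illustrative sketch only (not part of the patch): zero-copy mapped memory,
// the pattern the cu:: wrappers above assume for their host pointers.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1024;
  float *hostBuf, *devAlias;

  // Allocate pinned, mapped host memory (hypothetical stand-in for the image buffers).
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaHostAlloc((void**) &hostBuf, n * sizeof(float), cudaHostAllocMapped);
  for (int i = 0; i < n; i++) hostBuf[i] = (float) i;

  // Device-visible alias of the same allocation -- no explicit cudaMemcpy needed.
  cudaHostGetDevicePointer((void**) &devAlias, hostBuf, 0);

  int nThreadsPerBlock = 64;
  int nBlocks = (n + nThreadsPerBlock - 1) / nThreadsPerBlock;
  scaleKernel<<<nBlocks, nThreadsPerBlock>>>(devAlias, 2.0f, n);
  cudaDeviceSynchronize();  // after the sync, the kernel's writes are visible through hostBuf

  printf("hostBuf[10] = %f\n", hostBuf[10]);
  cudaFreeHost(hostBuf);
  return 0;
}

The synchronize before reading on the host mirrors the cudaDeviceSynchronize() calls the wrappers issue before timing or handing results back to the CPU-side solver.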
+
+  void sor(
+      float *du, float *dv,
+      float *a11, float *a12, float *a22,
+      float *b1, float *b2,
+      float *horiz, float *vert,
+      int iterations, float omega,
+      int height, int width, int stride) {
+
+    // Device setup
+    float
+      *d_du,
+      *d_dv,
+      *d_a11,
+      *d_a12,
+      *d_a22,
+      *d_b1,
+      *d_b2,
+      *d_horiz,
+      *d_vert;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_du, du, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dv, dv, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a11, a11, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a12, a12, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a22, a22, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_b1, b1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_b2, b2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_horiz, horiz, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vert, vert, 0) );
+
+    int N = width * height;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    for(int iter = 0 ; iter < iterations ; iter++) {
+      kernelSorStep<<<nBlocks, nThreadsPerBlock>>>(
+          d_du, d_dv,
+          d_a11, d_a12, d_a22,
+          d_b1, d_b2,
+          d_horiz, d_vert,
+          iterations, omega,
+          height, width, stride, true);
+
+      cudaDeviceSynchronize();
+
+      kernelSorStep<<<nBlocks, nThreadsPerBlock>>>(
+          d_du, d_dv,
+          d_a11, d_a12, d_a22,
+          d_b1, d_b2,
+          d_horiz, d_vert,
+          iterations, omega,
+          height, width, stride, false);
+    }
+  }
+
+  void getMeanImageAndDiff(
+      float *img1, float *img2, float *avgImg, float *diff,
+      int height, int stride) {
+
+    float
+      *d_img1,
+      *d_img2,
+      *d_avgImg,
+      *d_diff;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_img1, img1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_img2, img2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_avgImg, avgImg, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_diff, diff, 0) );
+
+    int N = 3 * stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelGetMeanImageAndDiff<<<nBlocks, nThreadsPerBlock>>>(
+        d_img1, d_img2, d_avgImg, d_diff,
+        height, stride);
+
+  }
+
+  void colorImageDerivative(
+      float *dst, float *src, float *pDeviceColorDerivativeKernel,
+      int height, int width, int stride, bool horiz) {
+
+    float *d_dst, *d_src;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+
+    Npp32f *pDeviceSrc = d_src;
+    Npp32f *pDeviceDst = d_dst;
+
+    size_t elemSize = sizeof(float);
+    unsigned int nSrcStep = stride * elemSize;
+    unsigned int nDstStep = nSrcStep;
+
+    NppiSize oSrcSize = { width, height };
+    NppiPoint oSrcOffset = { 0, 0 };
+    NppiSize oSizeROI = { width, height };
+    NppiBorderType eBorderType = NPP_BORDER_REPLICATE;
+
+    NPP_CHECK_NPP(
+      (horiz)
+      ? nppiFilterRowBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceColorDerivativeKernel, 5, 2, eBorderType)
+      : nppiFilterColumnBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceColorDerivativeKernel, 5, 2, eBorderType)
+    );
+  }
+
+  // Expects filter kernel of the form
+  // { -0.5, 0.0, 0.5 }
+  void imageDerivative(
+      float *dst, float *src, float *pDeviceDerivativeKernel,
+      int height, int width, int stride, bool horiz) {
+
+    float *d_dst, *d_src;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+
+    Npp32f *pDeviceSrc = d_src;
+    Npp32f *pDeviceDst = d_dst;
+
+    size_t elemSize = sizeof(float);
+    unsigned int nSrcStep = stride * elemSize;
+    unsigned int nDstStep = nSrcStep;
+
+    NppiSize oSrcSize = { width, height };
+    NppiPoint oSrcOffset = { 0, 0 };
+    NppiSize oSizeROI = { width, height };
+    NppiBorderType eBorderType = NPP_BORDER_REPLICATE;
+
+    NPP_CHECK_NPP(
+      (horiz)
+      ? nppiFilterRowBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceDerivativeKernel, 3, 1, eBorderType)
+      : nppiFilterColumnBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceDerivativeKernel, 3, 1, eBorderType)
+    );
+  }
+
+  void smoothnessTerm(
+      float *dst_horiz, float *dst_vert, float *smoothness,
+      float *ux, float *uy, float *vx, float *vy,
+      float qa, float epsmooth,
+      int height, int width, int stride) {
+
+    float *d_dst_horiz, *d_dst_vert, *d_smoothness, *d_ux, *d_uy, *d_vx, *d_vy;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst_horiz, dst_horiz, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst_vert, dst_vert, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_smoothness, smoothness, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_ux, ux, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_uy, uy, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vx, vx, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vy, vy, 0) );
+
+    int N = height * stride;
+    int nThreadsPerBlock = 128;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelFlowMag<<<nBlocks, nThreadsPerBlock>>> (
+        d_smoothness, d_ux, d_uy, d_vx, d_vy,
+        qa, epsmooth, height, width, stride);
+
+    cudaDeviceSynchronize();
+
+    kernelSmoothnessHorizVert<<< nBlocks, nThreadsPerBlock >>> (
+        d_dst_horiz, d_dst_vert, d_smoothness, height, width, stride);
+
+    cudaDeviceSynchronize();
+  }
+
+
+  void flowUpdate(
+      float *uu, float *vv, float *wx, float *wy, float *du, float *dv,
+      int height, int width, int stride) {
+
+    float *d_uu,
+      *d_vv,
+      *d_wx,
+      *d_wy,
+      *d_du,
+      *d_dv;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_uu, uu, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vv, vv, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_wx, wx, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_wy, wy, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_du, du, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dv, dv, 0) );
+
+    int N = height * stride;
+    int nThreadsPerBlock = 128;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelFlowUpdate<<< nBlocks, nThreadsPerBlock >>> (
+        d_uu, d_vv, d_wx, d_wy, d_du, d_dv,
+        height, width, stride);
+
+  }
+
+  /*
+    Warp an image `src` into `dst` using warp vectors `wx`, `wy`.
+    Store `mask[i]` = 0 or 1 if pixel i goes outside or inside image bounds.
+ */ + void warpImage( + color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) { + + float *d_dst1, *d_dst2, *d_dst3; + float *d_src1, *d_src2, *d_src3; + float *d_mask, *d_wx, *d_wy; + + checkCudaErrors( cudaHostGetDevicePointer(&d_dst1, dst->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_dst2, dst->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_dst3, dst->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src1, src->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src2, src->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src3, src->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_mask, mask->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_wx, wx->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_wy, wy->c1, 0) ); + + int N = src->height * src->stride; + int nThreadsPerBlock = 64; + int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock; + + kernelWarpImage<<< nBlocks, nThreadsPerBlock >>> ( + d_dst1, d_dst2, d_dst3, d_mask, + d_src1, d_src2, d_src3, + d_wx, d_wy, src->height, src->width, src->stride); + } + +} diff --git a/src/kernels/flowUtil.h b/src/kernels/flowUtil.h new file mode 100644 index 0000000..1781b03 --- /dev/null +++ b/src/kernels/flowUtil.h @@ -0,0 +1,77 @@ +#ifndef __KERNEL_FLOW_UTIL_H__ +#define __KERNEL_FLOW_UTIL_H__ + +// System +#include +#include +#include +#include + +// CUDA +#include +#include + +// Local +#include "../FDF1.0.1/image.h" +#include "../common/Exceptions.h" +#include "../common/timer.h" +#include "../sandbox/process.h" +#include "../patch.h" + +using namespace OFC; + +namespace cu { + + void dataTerm( + image_t *a11, image_t *a12, image_t *a22, + image_t *b1, image_t *b2, + image_t *mask, + image_t *wx, image_t *wy, + image_t *du, image_t *dv, + image_t *uu, image_t *vv, + color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, + color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, + color_image_t *Ixz, color_image_t *Iyz, + const float half_delta_over3, const float half_beta, const float half_gamma_over3); + + void subLaplacianHoriz( + float *src, float *dst, float *weights, int height, int width, int stride); + + void subLaplacianVert( + float *src, float *dst, float *weights, int height, int stride); + + void sor( + float *du, float *dv, + float *a11, float *a12, float *a22, + float *b1, float *b2, + float *horiz, float *vert, + int iterations, float omega, + int height, int width, int stride); + + void getMeanImageAndDiff( + float *img1, float *img2, float *avgImg, float *diff, + int height, int stride); + + void colorImageDerivative( + float *dst, float *src, float *pDeviceKernel, int height, int width, int stride, bool horiz); + + void imageDerivative( + float *dst, float *src, float *pDeviceKernel, int height, int width, int stride, bool horiz); + + void smoothnessTerm( + float *dst_horiz, float *dst_vert, float *smoothness, + float *ux, float *uy, float *vx, float *vy, + float quarter_alpha, float epsilon_smooth, + int height, int width, int stride); + + void flowUpdate( + float *uu, float *vv, float *wx, float *wy, float *du, float *dv, + int height, int width, int stride); + + void warpImage( + color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy); + +} + +#endif // end __KERNEL_FLOW_UTIL_H__ + diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp index 6bf988d..41d7b46 100644 --- a/src/refine_variational.cpp +++ b/src/refine_variational.cpp @@ -5,6 +5,12 @@ 
#include +// CUDA +#include +#include +#include "common/cuda_helper.h" +#include "kernels/flowUtil.h" + #include #include #include @@ -46,6 +52,18 @@ namespace OFC { float deriv_filter_flow[2] = {0.0f, -0.5f}; deriv_flow = convolution_new(1, deriv_filter_flow, 0); + float pHostColorDerivativeKernel[5] = { 1.0 / 12.0, 2.0 / 3.0, 0.0, -2.0 / 3.0, 1.0 / 12.0 }; + checkCudaErrors( cudaMalloc((void**) &pDeviceColorDerivativeKernel, 5 * sizeof(float)) ); + checkCudaErrors( + cudaMemcpy(pDeviceColorDerivativeKernel, pHostColorDerivativeKernel, + 5 * sizeof(float), cudaMemcpyHostToDevice) ); + + float pHostDerivativeKernel[3] = { 0.5, 0.0, -0.5 }; + checkCudaErrors( cudaMalloc((void**) &pDeviceDerivativeKernel, 3 * sizeof(float)) ); + checkCudaErrors( + cudaMemcpy(pDeviceDerivativeKernel, pHostDerivativeKernel, + 3 * sizeof(float), cudaMemcpyHostToDevice) ); + // copy flow initialization into FV structs static int noparam = 2; // Optical flow @@ -74,12 +92,12 @@ namespace OFC { copyimage(_I0, I0); copyimage(_I1, I1); - // calc_print_elapsed("refine: flow_sep", start_flow_sep); + calc_print_elapsed("refine: flow_sep", start_flow_sep); // Call solver auto start_solver = now(); RefLevelOF(flow_sep[0], flow_sep[1], I0, I1); - // calc_print_elapsed("RefLevelOF [total]", start_solver); + calc_print_elapsed("RefLevelOF [total]", start_solver); // Copy flow result back auto start_copy = now(); @@ -93,7 +111,7 @@ namespace OFC { } } - // calc_print_elapsed("refine: copy back", start_copy); + calc_print_elapsed("refine: copy back", start_copy); // free FV structs for (int i = 0; i < noparam; ++i ) @@ -152,23 +170,24 @@ namespace OFC { *Ix = color_image_new(width,height), *Iy = color_image_new(width,height), *Iz = color_image_new(width,height), // first order derivatives *Ixx = color_image_new(width,height), *Ixy = color_image_new(width,height), *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives - // calc_print_elapsed("RefLevelOF setup", start_setup); + calc_print_elapsed("RefLevelOF setup", start_setup); // warp second image auto start_image_warp = now(); - image_warp(w_im2, mask, im2, wx, wy); - // calc_print_elapsed("RefLevelOF image_warp", start_image_warp); + // image_warp(w_im2, mask, im2, wx, wy); + cu::warpImage(w_im2, mask, im2, wx, wy); + calc_print_elapsed("RefLevelOF image_warp", start_image_warp); // compute derivatives auto start_get_derivs = now(); - get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); - // calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); + get_derivatives(im1, w_im2, pDeviceColorDerivativeKernel, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); + calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); // erase du and dv auto start_image_erase = now(); image_erase(du); image_erase(dv); - // calc_print_elapsed("RefLevelOF image_erase", start_image_erase); + calc_print_elapsed("RefLevelOF image_erase", start_image_erase); // initialize uu and vv memcpy(uu->c1,wx->c1,wx->stride*wx->height*sizeof(float)); @@ -180,46 +199,43 @@ namespace OFC { // compute robust function and system auto start_smooth = now(); - compute_smoothness(smooth_horiz, smooth_vert, uu, vv, deriv_flow, vr.tmp_quarter_alpha ); - // calc_print_elapsed(("RefLevelOF " + iterStr + " smoothness").c_str(), start_smooth); + compute_smoothness(smooth_horiz, smooth_vert, uu, vv, pDeviceDerivativeKernel, vr.tmp_quarter_alpha ); + calc_print_elapsed(("RefLevelOF " + iterStr + " 
smoothness").c_str(), start_smooth); auto start_data = now(); - compute_data(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); - // calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); + // compute_data(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); + cu::dataTerm(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); + calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); auto start_lapalcian = now(); sub_laplacian(b1, wx, smooth_horiz, smooth_vert); sub_laplacian(b2, wy, smooth_horiz, smooth_vert); - // calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); + calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); // solve system // #ifdef WITH_OPENMP auto start_sor = now(); sor_coupled_slow_but_readable(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // slower but parallelized - // calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); + calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); // #else // sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // #endif // update flow plus flow increment auto start_flow_update = now(); - int i; - v4sf *uup = (v4sf*) uu->c1, *vvp = (v4sf*) vv->c1, *wxp = (v4sf*) wx->c1, *wyp = (v4sf*) wy->c1, *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1; - for( i=0 ; ic1, vv->c1, wx->c1, wy->c1, du->c1, dv->c1, + height, width, stride); + calc_print_elapsed(("RefLevelOF " + iterStr + " flow update").c_str(), start_flow_update); - // calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); + calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); } // add flow increment to current flow auto start_increment_flow = now(); memcpy(wx->c1,uu->c1,uu->stride*uu->height*sizeof(float)); memcpy(wy->c1,vv->c1,vv->stride*vv->height*sizeof(float)); - // calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); + calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); // free memory auto start_cleanup = now(); @@ -233,10 +249,13 @@ namespace OFC { color_image_delete(w_im2); color_image_delete(Ix); color_image_delete(Iy); color_image_delete(Iz); color_image_delete(Ixx); color_image_delete(Ixy); color_image_delete(Iyy); color_image_delete(Ixz); color_image_delete(Iyz); - // calc_print_elapsed("RefLevelOF cleanup", start_cleanup); + calc_print_elapsed("RefLevelOF cleanup", start_cleanup); } - VarRefClass::~VarRefClass() { } + VarRefClass::~VarRefClass() { + cudaFree(pDeviceColorDerivativeKernel); + cudaFree(pDeviceDerivativeKernel); + } } diff --git a/src/refine_variational.h b/src/refine_variational.h index 9f98d99..8495e55 100644 --- a/src/refine_variational.h +++ b/src/refine_variational.h @@ -52,6 +52,9 @@ namespace OFC { const img_params* i_params; const opt_params* op; + float *pDeviceColorDerivativeKernel; + float *pDeviceDerivativeKernel; + }; }