diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4ffc473..d43564f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,27 +1,46 @@
 cmake_minimum_required (VERSION 2.8)
 project (flow)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -msse4 -fopenmp")  #-Wall
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -msse4 -fopenmp")  #-Wall
+message(STATUS "Architecture: ${ARCH}")
+
+# On ghc machines
+if (ARCH STREQUAL "x86")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -msse4 -fopenmp")  #-Wall
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -msse4 -fopenmp")  #-Wall
+
+  # For ghc machines
+  set(OpenCV_DIR "/afs/cs/academic/class/15418-s17/public/sw/opencv/build")
+  set(Eigen3_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/build")
+
+# On NVIDIA Jetson
+elseif (ARCH STREQUAL "ARM")
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -fopenmp")  #-Wall
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -fopenmp")  #-Wall
+
+else()
+  message(STATUS "specify using -DARCH=x86 or -DARCH=ARM")
+  message(FATAL_ERROR "aborting: missing required architecture")
+endif ()
 
-# For ghc machines
-set(OpenCV_DIR "/afs/cs/academic/class/15418-s17/public/sw/opencv/build")
-set(Eigen3_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/build")
 
 find_package(OpenCV REQUIRED)
 find_package(Eigen3 REQUIRED)
 find_package(CUDA REQUIRED)
 
 # On ghc, include DIR from FIND_PACKAGE is set wrong
-set(EIGEN3_INCLUDE_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/include/eigen3")
+if (ARCH STREQUAL "x86")
+  set(EIGEN3_INCLUDE_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/include/eigen3")
+endif()
 include_directories(${EIGEN3_INCLUDE_DIR})
 
 # Add extra cuda libraries
 set(CUDA_CUSOLVER_LIBRARIES
-  "/usr/local/depot/cuda-8.0/lib64/libcusolver.so"
-  "/usr/local/depot/cuda-8.0/lib64/libcublas.so.8.0")
+  "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver.so"
+  "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas.so.8.0")
+set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUSOLVER_LIBRARIES})
 
-set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_nppi_LIBRARY} ${CUDA_CUSOLVER_LIBRARIES})
+set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_nppi_LIBRARY})
 
 message(STATUS "OpenCV library status:")
 message(STATUS "    version: ${OpenCV_VERSION}")
@@ -34,6 +53,7 @@ message(STATUS "    include path: ${EIGEN3_INCLUDE_DIR} ${EIGEN3_INCLUDE_DIRS}")
 
 message(STATUS "CUDA library status:")
 message(STATUS "    version: ${CUDA_VERSION}")
+message(STATUS "    toolkit path: ${CUDA_TOOLKIT_ROOT_DIR}")
 message(STATUS "    libraries: ${CUDA_LIBRARIES}")
 message(STATUS "    include path: ${CUDA_INCLUDE_DIRS}")
 message(STATUS "    toolkit path: ${CUDA_TOOLKIT_ROOT_DIR}")
@@ -46,7 +66,14 @@ message(STATUS "    cuSOLVER path: ${CUDA_CUSOLVER_LIBRARIES}")
 ################################################################################
 
 set(CUDA_VERBOSE_BUILD ON)
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -std=c++11 -arch=compute_61 -code=sm_61)
+
+if (ARCH STREQUAL "x86")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -arch=compute_61 -code=sm_61)
+endif()
+
+if (ARCH STREQUAL "ARM")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -arch=compute_62 -code=sm_62)
+endif()
 
 set(KERNELS
   kernels/densify.cu
@@ -59,6 +86,7 @@ set(KERNELS
   kernels/resizeGrad.cpp
   kernels/resize.cpp)
 
+
 set(COMMON
   common/RgbMat.cpp)
 
@@ -70,20 +98,23 @@ set(CODEFILES
   refine_variational.cpp
   FDF1.0.1/image.c
   FDF1.0.1/opticalflow_aux.c
-  FDF1.0.1/solver.c)
+  FDF1.0.1/solver.c
+  )
 
-# RGB, Optical Flow
-cuda_add_executable(flow ${COMMON} ${KERNELS} ${CODEFILES})
+# GrayScale, Optical Flow
+cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS})
 set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1")
 set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image
+set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") # 8 wide SIMD (4 floats)
+# set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD
 target_link_libraries(flow ${OpenCV_LIBS})
 
-# CUDA sandbox
+# # CUDA sandbox
 # set(SANDBOX_FILES
 #   # sandbox/process_sobel.cpp
-#   # sandbox/process_resize.cpp
+#   sandbox/process_resize.cpp
 #   # sandbox/process_resizeGrad.cpp
-#   sandbox/process_pad.cpp
+#   # sandbox/process_pad.cpp
 #   # sandbox/RgbMatTest.cpp
 #   sandbox/sandbox.cpp)
 # cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
diff --git a/src/FDF1.0.1/image.c b/src/FDF1.0.1/image.c
index 460e274..d8f044e 100644
--- a/src/FDF1.0.1/image.c
+++ b/src/FDF1.0.1/image.c
@@ -6,8 +6,16 @@
 
 #include "image.h"
 
-#include <xmmintrin.h>
-typedef __v4sf v4sf;
+// #include <xmmintrin.h>
+// typedef __v4sf v4sf;
+
+#include <arm_neon.h>
+
+#if (VECTOR_WIDTH == 4)
+typedef float32x4_t v4sf;
+#else
+typedef float v4sf;
+#endif
 
 /********** Create/Delete **********/
 
@@ -46,8 +54,12 @@ void image_erase(image_t *image){
 void image_mul_scalar(image_t *image, const float scalar){
     int i;
     v4sf* imp = (v4sf*) image->c1;
+#if (VECTOR_WIDTH == 4)
     const v4sf scalarp = {scalar,scalar,scalar,scalar};
-    for( i=0 ; i<image->stride/4*image->height ; i++){
+#else
+    const v4sf scalarp = scalar;
+#endif
+    for( i=0 ; i<image->stride/VECTOR_WIDTH*image->height ; i++){
         (*imp) *= scalarp;
         imp+=1;
     }
@@ -73,7 +85,7 @@ color_image_t *color_image_new(const int width, const int height){
     }
     image->width = width;
     image->height = height;  
-    image->stride = ( (width+3) / 4 ) * 4;
+    image->stride = ( (width+VECTOR_WIDTH-1) / VECTOR_WIDTH ) * VECTOR_WIDTH;
     image->c1 = (float*) memalign(16, 3*image->stride*height*sizeof(float));
     if(image->c1 == NULL){
         fprintf(stderr, "Error: color_image_new() - not enough memory !\n");
@@ -374,7 +386,11 @@ convolution_t *convolution_new(const int order, const float *half_coeffs, const
 }
 
 static void convolve_vert_fast_3(image_t *dst, const image_t *src, const convolution_t *conv){
+#if (VECTOR_WIDTH == 4)
     const int iterline = (src->stride>>2)+1;
+#else
+    const int iterline = (src->stride)+1;
+#endif
     const float *coeff = conv->coeffs;
     //const float *coeff_accu = conv->coeffs_accu;
     v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1;
@@ -399,7 +415,11 @@ static void convolve_vert_fast_3(image_t *dst, const image_t *src, const convolu
 }
 
 static void convolve_vert_fast_5(image_t *dst, const image_t *src, const convolution_t *conv){
+#if (VECTOR_WIDTH == 4)
     const int iterline = (src->stride>>2)+1;
+#else
+    const int iterline = (src->stride)+1;
+#endif
     const float *coeff = conv->coeffs;
     //const float *coeff_accu = conv->coeffs_accu;
     v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1;
@@ -435,7 +455,11 @@ static void convolve_vert_fast_5(image_t *dst, const image_t *src, const convolu
 
 static void convolve_horiz_fast_3(image_t *dst, const image_t *src, const convolution_t *conv){
     const int stride_minus_1 = src->stride-1;
+#if (VECTOR_WIDTH == 4)
     const int iterline = (src->stride>>2);
+#else
+    const int iterline = (src->stride);
+#endif
     const float *coeff = conv->coeffs;
     v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1;
     // create shifted version of src
@@ -466,7 +490,11 @@ static void convolve_horiz_fast_3(image_t *dst, const image_t *src, const convol
 static void convolve_horiz_fast_5(image_t *dst, const image_t *src, const convolution_t *conv){
     const int stride_minus_1 = src->stride-1;
     const int stride_minus_2 = src->stride-2;
+#if (VECTOR_WIDTH == 4)
     const int iterline = (src->stride>>2);
+#else
+    const int iterline = (src->stride);
+#endif
     const float *coeff = conv->coeffs;
     v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1;
     float *src_p1 = (float*) malloc(sizeof(float)*src->stride*4);
@@ -506,7 +534,8 @@ void convolve_horiz(image_t *dest, const image_t *src, const convolution_t *conv
     if(conv->order==1){
         convolve_horiz_fast_3(dest,src,conv);
         return;
-    }else if(conv->order==2){
+    }
+    else if(conv->order==2){
         convolve_horiz_fast_5(dest,src,conv);
         return;    
     }
@@ -556,7 +585,8 @@ void convolve_vert(image_t *dest, const image_t *src, const convolution_t *conv)
     if(conv->order==1){
         convolve_vert_fast_3(dest,src,conv);
         return;
-    }else if(conv->order==2){
+    }
+    else if(conv->order==2){
         convolve_vert_fast_5(dest,src,conv);
         return;    
     }
diff --git a/src/FDF1.0.1/opticalflow_aux.c b/src/FDF1.0.1/opticalflow_aux.c
index 957a946..3076063 100644
--- a/src/FDF1.0.1/opticalflow_aux.c
+++ b/src/FDF1.0.1/opticalflow_aux.c
@@ -4,8 +4,13 @@
 #include <string.h>
 #include "opticalflow_aux.h"
 
-#include <xmmintrin.h>
-typedef __v4sf v4sf;
+#include <arm_neon.h>
+
+#if (VECTOR_WIDTH == 4)
+typedef float32x4_t v4sf;
+#else
+typedef float v4sf;
+#endif
 
 #define datanorm 0.1f*0.1f//0.01f // square of the normalization factor
 #define epsilon_color (0.001f*0.001f)//0.000001f
@@ -75,9 +80,13 @@ void get_derivatives(const color_image_t *im1, const color_image_t *im2, const c
 #if (SELECTCHANNEL==1 | SELECTCHANNEL==2)
     image_t *tmp_im2 = image_new(im2->width,im2->height);    
     v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1;
+#if (VECTOR_WIDTH == 4)
     const v4sf half = {0.5f,0.5f,0.5f,0.5f};
+#else
+    const v4sf half = 0.5f;
+#endif
     int i=0;
-    for(i=0 ; i<im1->height*im1->stride/4 ; i++){
+    for(i=0 ; i<im1->height*im1->stride/VECTOR_WIDTH ; i++){
         *tmp_im2p = half * ( (*im2p) + (*im1p) );
         *dtp = (*im2p)-(*im1p);
         dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1;
@@ -95,9 +104,13 @@ void get_derivatives(const color_image_t *im1, const color_image_t *im2, const c
 #else
     color_image_t *tmp_im2 = color_image_new(im2->width,im2->height);    
     v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1;
+#if (VECTOR_WIDTH == 4)
     const v4sf half = {0.5f,0.5f,0.5f,0.5f};
+#else
+    const v4sf half = 0.5f;
+#endif
     int i=0;
-    for(i=0 ; i<3*im1->height*im1->stride/4 ; i++){
+    for(i=0 ; i<3*im1->height*im1->stride/VECTOR_WIDTH; i++){
         *tmp_im2p = half * ( (*im2p) + (*im1p) );
         *dtp = (*im2p)-(*im1p);
         dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1;
@@ -131,10 +144,21 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu
     convolve_vert(vy, vv, deriv_flow);
     // compute smoothness
     v4sf *uxp = (v4sf*) ux->c1, *vxp = (v4sf*) vx->c1, *uyp = (v4sf*) uy->c1, *vyp = (v4sf*) vy->c1, *sp = (v4sf*) smoothness->c1;
+#if (VECTOR_WIDTH == 4)
     const v4sf qa = {quarter_alpha,quarter_alpha,quarter_alpha,quarter_alpha};
     const v4sf epsmooth = {epsilon_smooth,epsilon_smooth,epsilon_smooth,epsilon_smooth};
-    for(j=0 ; j< height*stride/4 ; j++){
-        *sp = qa / __builtin_ia32_sqrtps( (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth );
+#else
+    const v4sf qa = quarter_alpha;
+    const v4sf epsmooth = epsilon_smooth;
+#endif
+    for(j=0 ; j< height*stride/VECTOR_WIDTH ; j++){
+#if (VECTOR_WIDTH == 4)
+        *sp = qa / vsqrtq_f32(
+            (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth );
+#else
+        *sp = qa / sqrtf(
+            (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth );
+#endif
         sp+=1;uxp+=1; uyp+=1; vxp+=1; vyp+=1;
     }
     image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); 
@@ -147,7 +171,7 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu
         memcpy(sp_shift, spf+1, sizeof(float)*(stride-1));
         v4sf *sps = (v4sf*) sp_shift;
         int i;
-        for(i=0;i<stride/4;i++){
+        for(i=0;i<stride/VECTOR_WIDTH;i++){
             *dsthp = (*sp) + (*sps);
             dsthp+=1; sp+=1; sps+=1;
         }
@@ -156,7 +180,7 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu
     free(sp_shift);
     // compute dst_vert
     v4sf *dstvp = (v4sf*) dst_vert->c1, *sp_bottom = (v4sf*) (smoothness->c1+stride); sp = (v4sf*) smoothness->c1;
-    for(j=0 ; j<(height-1)*stride/4 ; j++){
+    for(j=0 ; j<(height-1)*stride/VECTOR_WIDTH ; j++){
         *dstvp = (*sp) + (*sp_bottom);
         dstvp+=1; sp+=1; sp_bottom+=1;
     }
@@ -190,7 +214,7 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz
     }
   
     v4sf *wvp = (v4sf*) weight_vert->c1, *srcp = (v4sf*) src->c1, *srcp_s = (v4sf*) (src->c1+src->stride), *dstp = (v4sf*) dst->c1, *dstp_s = (v4sf*) (dst->c1+src->stride);
-    for(j=1+(src->height-1)*src->stride/4 ; --j ;){
+    for(j=1+(src->height-1)*src->stride/VECTOR_WIDTH ; --j ;){
         const v4sf tmp = (*wvp) * ((*srcp_s)-(*srcp));
         *dstp += tmp;
         *dstp_s -= tmp;
@@ -198,111 +222,6 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz
     }
 }
 
-/* compute the dataterm and the matching term
-   a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side
-   other (color) images are input */
-void compute_data_and_match(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, image_t *desc_weight, image_t *desc_flow_x, image_t *desc_flow_y, const float half_delta_over3, const float half_beta, const float half_gamma_over3){
- 
-    const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm};
-    const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3};
-    const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color};
-    const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3};
-    const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad};
-    const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta};
-    const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc};
-    
-    v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1,
-        *maskp = (v4sf*) mask->c1,
-        *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, 
-        *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, 
-        *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, 
-        *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, 
-        *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, 
-        *uup = (v4sf*) uu->c1, *vvp = (v4sf*)vv->c1, *wxp = (v4sf*)wx->c1, *wyp = (v4sf*)wy->c1,
-        *descflowxp = (v4sf*)desc_flow_x->c1, *descflowyp = (v4sf*)desc_flow_y->c1, *descweightp = (v4sf*)desc_weight->c1;
-            
-    memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride);
-    memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride);
-    memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride);
-    memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride);
-    memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride);
-              
-    int i;
-    for(i = 0 ; i<uu->height*uu->stride/4 ; i++){
-        v4sf tmp, tmp2, tmp3, tmp4, tmp5, tmp6, n1, n2, n3, n4, n5, n6;
-        // dpsi color
-        if(half_delta_over3){
-            tmp  = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp);
-            n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm;
-            tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp);
-            n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm;
-            tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp);
-            n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm;
-            tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor);
-            tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;
-            *a11p += tmp  * (*ix1p) * (*ix1p);
-            *a12p += tmp  * (*ix1p) * (*iy1p);
-            *a22p += tmp  * (*iy1p) * (*iy1p);
-            *b1p -=  tmp  * (*iz1p) * (*ix1p);
-            *b2p -=  tmp  * (*iz1p) * (*iy1p);
-            *a11p += tmp2 * (*ix2p) * (*ix2p);
-            *a12p += tmp2 * (*ix2p) * (*iy2p);
-            *a22p += tmp2 * (*iy2p) * (*iy2p);
-            *b1p -=  tmp2 * (*iz2p) * (*ix2p);
-            *b2p -=  tmp2 * (*iz2p) * (*iy2p);
-            *a11p += tmp3 * (*ix3p) * (*ix3p);
-            *a12p += tmp3 * (*ix3p) * (*iy3p);
-            *a22p += tmp3 * (*iy3p) * (*iy3p);
-            *b1p -=  tmp3 * (*iz3p) * (*ix3p);
-            *b2p -=  tmp3 * (*iz3p) * (*iy3p);
-        }
-        // dpsi gradient
-        n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm;
-        n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm;
-        tmp  = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp);
-        tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp);
-        n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm;
-        n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm;
-        tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp);
-        tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp);
-        n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm;
-        n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm;
-        tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp);
-        tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp);
-        tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad);
-        tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;      
-        *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p);
-        *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p);
-        *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p);
-        *b1p -=  tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p);
-        *b2p -=  tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p);
-        *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p);
-        *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p);
-        *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p);
-        *b1p -=  tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p);
-        *b2p -=  tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p);
-        *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p);
-        *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p);
-        *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p);
-        *b1p -=  tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p);
-        *b2p -=  tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p);  
-        if(half_beta){ // dpsi_match
-            tmp  = *uup - (*descflowxp);
-            tmp2 = *vvp - (*descflowyp);
-            tmp = hbeta*(*descweightp)/__builtin_ia32_sqrtps(tmp*tmp+tmp2*tmp2+epsdesc);
-            *a11p += tmp;
-            *a22p += tmp;
-            *b1p -= tmp*((*wxp)-(*descflowxp));
-            *b2p -= tmp*((*wyp)-(*descflowyp));
-        }
-        dup+=1; dvp+=1; maskp+=1; a11p+=1; a12p+=1; a22p+=1; b1p+=1; b2p+=1; 
-        ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1;
-        ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1;
-        ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1;
-        uup+=1;vvp+=1;wxp+=1; wyp+=1;descflowxp+=1;descflowyp+=1;descweightp+=1;
-    }
-}
-
 /* compute the dataterm // REMOVED MATCHING TERM
    a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side
    other (color) images are input */
@@ -312,6 +231,7 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
 void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3)
 #endif
 {
+#if (VECTOR_WIDTH == 4)
     const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm};
     const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3};
     const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color};
@@ -319,6 +239,13 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
     const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad};
     //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta};
     //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc};
+#else
+    const v4sf dnorm = datanorm;
+    const v4sf hdover3 = half_delta_over3;
+    const v4sf epscolor = epsilon_color;
+    const v4sf hgover3 = half_gamma_over3;
+    const v4sf epsgrad = epsilon_grad;
+#endif
     
     v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1,
         *maskp = (v4sf*) mask->c1,
@@ -339,7 +266,7 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
     memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride);
               
     int i;
-    for(i = 0 ; i<uu->height*uu->stride/4 ; i++){
+    for(i = 0 ; i<uu->height*uu->stride/VECTOR_WIDTH ; i++){
         v4sf tmp, tmp2, n1, n2;
 	#if (SELECTCHANNEL==3)
 	v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6;
@@ -353,10 +280,18 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
             n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm;
             tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp);
             n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm;
-            tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor);
+#if (VECTOR_WIDTH == 4)
+            tmp = (*maskp) * hdover3 / vsqrtq_f32(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor);
+#else
+            tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor);
+#endif
             tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;
             #else
-            tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(3 * tmp*tmp/n1 + epscolor);
+#if (VECTOR_WIDTH == 4)
+            tmp = (*maskp) * hdover3 / vsqrtq_f32(3 * tmp*tmp/n1 + epscolor);
+#else
+            tmp = (*maskp) * hdover3 / sqrtf(3 * tmp*tmp/n1 + epscolor);
+#endif
             tmp /= n1;
             #endif
             *a11p += tmp  * (*ix1p) * (*ix1p);
@@ -392,10 +327,20 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
         n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm;
         tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp);
         tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp);
-        tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad);
+#if (VECTOR_WIDTH == 4)
+        tmp = (*maskp) * hgover3 / vsqrtq_f32(
+            tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad);
+#else
+        tmp = (*maskp) * hgover3 / sqrtf(
+            tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad);
+#endif
         tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;      
         #else
-        tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad);
+#if (VECTOR_WIDTH == 4)
+        tmp = (*maskp) * hgover3 / vsqrtq_f32(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad);
+#else
+        tmp = (*maskp) * hgover3 / sqrtf(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad);
+#endif
         tmp2 = tmp/n2; tmp /= n1;      
         #endif
         *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p);
@@ -437,193 +382,3 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
     }
 }
 
-
-
-/* compute the dataterm // REMOVED MATCHING TERM
-   a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side
-   other (color) images are input */
-#if (SELECTCHANNEL==1 | SELECTCHANNEL==2)  // use single band image_delete
-void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3)
-#else
-void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3)
-#endif
-{
-    const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm};
-    const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3};
-    const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color};
-    const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3};
-    const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad};
-    //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta};
-    //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc};
-    
-    v4sf *dup = (v4sf*) du->c1,
-        *maskp = (v4sf*) mask->c1,
-        *a11p = (v4sf*) a11->c1,  
-        *b1p = (v4sf*) b1->c1, 
-        *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, 
-        #if (SELECTCHANNEL==3)
-        *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, 
-        *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, 
-        #endif
-        *uup = (v4sf*) uu->c1, *wxp = (v4sf*)wx->c1;
-        
-            
-    memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride);
-    memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride);
-              
-    int i;
-    for(i = 0 ; i<uu->height*uu->stride/4 ; i++){
-        v4sf tmp, tmp2, n1, n2;
-	#if (SELECTCHANNEL==3)
-	v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6;
-	#endif
-        // dpsi color
-        if(half_delta_over3){
-            tmp  = *iz1p + (*ix1p)*(*dup);
-            n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm;
-            #if (SELECTCHANNEL==3)
-            tmp2 = *iz2p + (*ix2p)*(*dup);
-            n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm;
-            tmp3 = *iz3p + (*ix3p)*(*dup);
-            n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm;
-            tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor);
-            tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;
-            #else
-            tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(3 * tmp*tmp/n1 + epscolor);
-            tmp /= n1;
-            #endif
-            *a11p += tmp  * (*ix1p) * (*ix1p);
-            *b1p -=  tmp  * (*iz1p) * (*ix1p);
-            #if (SELECTCHANNEL==3)
-            *a11p += tmp2 * (*ix2p) * (*ix2p);
-            *b1p -=  tmp2 * (*iz2p) * (*ix2p);
-            *a11p += tmp3 * (*ix3p) * (*ix3p);
-            *b1p -=  tmp3 * (*iz3p) * (*ix3p);
-            #endif
-        }
-        // dpsi gradient
-        n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm;
-        n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm;
-        tmp  = *ixz1p + (*ixx1p) * (*dup);
-        tmp2 = *iyz1p + (*ixy1p) * (*dup);
-        #if (SELECTCHANNEL==3)
-        n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm;
-        n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm;
-        tmp3 = *ixz2p + (*ixx2p) * (*dup);
-        tmp4 = *iyz2p + (*ixy2p) * (*dup);
-        n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm;
-        n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm;
-        tmp5 = *ixz3p + (*ixx3p) * (*dup);
-        tmp6 = *iyz3p + (*ixy3p) * (*dup);
-        tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad);
-        tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1;      
-        #else
-        tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad);
-        tmp2 = tmp/n2; tmp /= n1;      
-        #endif
-        *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p);
-        *b1p -=  tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p);
-        #if (SELECTCHANNEL==3)
-        *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p);
-        *b1p -=  tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p);
-        *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p);
-        *b1p -=  tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p);
-        #endif
-        
-        
-        #if (SELECTCHANNEL==1 | SELECTCHANNEL==2)  // multiply system to make smoothing parameters same for RGB and single-channel image
-        *a11p *= 3;  
-        *b1p *=  3;  
-        #endif
-
-        dup+=1; maskp+=1; a11p+=1; b1p+=1; 
-        ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1;
-        #if (SELECTCHANNEL==3)
-        ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1;
-        ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1;
-        #endif
-        uup+=1; wxp+=1;
-
-    }
-}
-
-
-
-/* resize the descriptors to the new size using a weighted mean */
-void descflow_resize(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight){
-    const int src_width = src_flow_x->width, src_height = src_flow_x->height, src_stride = src_flow_x->stride,
-                dst_width = dst_flow_x->width, dst_height = dst_flow_x->height, dst_stride = dst_flow_x->stride;
-    const float scale_x = ((float)dst_width-1)/((float)src_width-1), scale_y = ((float)dst_height-1)/((float)src_height-1);
-    image_erase(dst_flow_x); image_erase(dst_flow_y); image_erase(dst_weight);
-    int j;
-    for( j=0 ; j<src_height ; j++){
-        const float yy = ((float)j)*scale_y;
-        const float yyf = floor(yy);
-        const float dy = yy-yyf;
-        const int y1 = MINMAX_TA( (int) yyf   , dst_height);
-        const int y2 = MINMAX_TA( (int) yyf+1 , dst_height);
-        int i;
-        for( i=0 ; i<src_width ; i++ ){
-            const float weight = src_weight->c1[j*src_stride+i];
-	        if( weight<0.0000000001f ) continue;
-            const float xx = ((float)i)*scale_x;
-	        const float xxf = floor(xx);
-	        const float dx = xx-xxf;
-	        const int x1 = MINMAX_TA( (int) xxf   , dst_width);
-	        const int x2 = MINMAX_TA( (int) xxf+1 , dst_width);
-	        float weightxy, newweight;
-	        if( dx ){
-	            if( dy ){
-		            weightxy = weight*dx*dy;
-		            newweight = dst_weight->c1[y2*dst_stride+x2] + weightxy;
-		            dst_flow_x->c1[y2*dst_stride+x2] = (dst_flow_x->c1[y2*dst_stride+x2]*dst_weight->c1[y2*dst_stride+x2] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight;
-		            dst_flow_y->c1[y2*dst_stride+x2] = (dst_flow_y->c1[y2*dst_stride+x2]*dst_weight->c1[y2*dst_stride+x2] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight;
-		            dst_weight->c1[y2*dst_stride+x2] = newweight;
-		        }
-	            weightxy = weight*dx*(1.0f-dy);
-	            newweight = dst_weight->c1[y1*dst_stride+x2] + weightxy;
-	            dst_flow_x->c1[y1*dst_stride+x2] = (dst_flow_x->c1[y1*dst_stride+x2]*dst_weight->c1[y1*dst_stride+x2] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight;
-	            dst_flow_y->c1[y1*dst_stride+x2] = (dst_flow_y->c1[y1*dst_stride+x2]*dst_weight->c1[y1*dst_stride+x2] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight;
-	            dst_weight->c1[y1*dst_stride+x2] = newweight;
-            }
-	        if( dy ) {
-                weightxy = weight*(1.0f-dx)*dy;
-                newweight = dst_weight->c1[y2*dst_stride+x1] + weightxy;
-                dst_flow_x->c1[y2*dst_stride+x1] = (dst_flow_x->c1[y2*dst_stride+x1]*dst_weight->c1[y2*dst_stride+x1] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight;
-                dst_flow_y->c1[y2*dst_stride+x1] = (dst_flow_y->c1[y2*dst_stride+x1]*dst_weight->c1[y2*dst_stride+x1] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight;
-                dst_weight->c1[y2*dst_stride+x1] = newweight;
-	        }
-	        weightxy = weight*(1.0f-dx)*(1.0f-dy);
-	        newweight = dst_weight->c1[y1*dst_stride+x1] + weightxy;
-	        dst_flow_x->c1[y1*dst_stride+x1] = (dst_flow_x->c1[y1*dst_stride+x1]*dst_weight->c1[y1*dst_stride+x1] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight;
-	        dst_flow_y->c1[y1*dst_stride+x1] = (dst_flow_y->c1[y1*dst_stride+x1]*dst_weight->c1[y1*dst_stride+x1] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight;
-	        dst_weight->c1[y1*dst_stride+x1] = newweight;
-	    }
-    }
-}
-
-/* resize the descriptors to the new size using a nearest neighbor method while keeping the descriptor with the higher weight at the end */
-void descflow_resize_nn(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight){
-    const int src_width = src_flow_x->width, src_height = src_flow_x->height, src_stride = src_flow_x->stride,
-                dst_width = dst_flow_x->width, dst_height = dst_flow_x->height, dst_stride = dst_flow_x->stride;
-    const float scale_x = ((float)dst_width-1)/((float)src_width-1), scale_y = ((float)dst_height-1)/((float)src_height-1);
-    image_erase(dst_flow_x); image_erase(dst_flow_y); image_erase(dst_weight);
-    int j;
-    for( j=0 ; j<src_height ; j++){
-        const float yy = ((float)j)*scale_y;
-        const int y = (int) 0.5f+yy; // equivalent to round(yy)
-        int i;
-        for( i=0 ; i<src_width ; i++ ){
-	        const float weight = src_weight->c1[j*src_stride+i];
-	        if( !weight )
-	            continue;
-	        const float xx = ((float)i)*scale_x;
-	        const int x = (int) 0.5f+xx; // equivalent to round(xx)
-	        if( dst_weight->c1[y*dst_stride+x] < weight ){
-	            dst_weight->c1[y*dst_stride+x] = weight;
-	            dst_flow_x->c1[y*dst_stride+x] = src_flow_x->c1[j*src_stride+i]*scale_x;
-	            dst_flow_y->c1[y*dst_stride+x] = src_flow_y->c1[j*src_stride+i]*scale_y;
-	        }
-	    }
-    }
-}
diff --git a/src/FDF1.0.1/opticalflow_aux.h b/src/FDF1.0.1/opticalflow_aux.h
index 063ac23..ebca1f1 100644
--- a/src/FDF1.0.1/opticalflow_aux.h
+++ b/src/FDF1.0.1/opticalflow_aux.h
@@ -33,11 +33,6 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu
 /* sub the laplacian (smoothness term) to the right-hand term */
 void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert);
 
-/* compute the dataterm and the matching term
-   a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side
-   other (color) images are input */
-void compute_data_and_match(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, image_t *desc_weight, image_t *desc_flow_x, image_t *desc_flow_y, const float half_delta_over3, const float half_beta, const float half_gamma_over3);
-
 /* compute the dataterm and ... REMOVED THE MATCHING TERM
    a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side
    other (color) images are input */
@@ -47,18 +42,6 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t
 void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3);
 #endif
 
-#if (SELECTCHANNEL==1 | SELECTCHANNEL==2)  // use single band image_delete
-void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3);
-#else
-void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3);
-#endif
-
-
-/* resize the descriptors to the new size using a weighted mean */
-void descflow_resize(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight);
-
-/* resize the descriptors to the new size using a nearest neighbor method while keeping the descriptor with the higher weight at the end */
-void descflow_resize_nn(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight);
 
 #ifdef __cplusplus
 }
diff --git a/src/FDF1.0.1/solver.c b/src/FDF1.0.1/solver.c
index a4402e9..162b3a0 100644
--- a/src/FDF1.0.1/solver.c
+++ b/src/FDF1.0.1/solver.c
@@ -9,8 +9,12 @@
 #include "image.h"
 #include "solver.h"
 
-#include <xmmintrin.h>
-typedef __v4sf v4sf;
+// #include <arm_neon.h>
+// #if (VECTOR_WIDTH == 4)
+// typedef float32x4_t v4sf;
+// #else
+// typedef float v4sf;
+// #endif
 
 //THIS IS A SLOW VERSION BUT READABLE
 //Perform n iterations of the sor_coupled algorithm
@@ -74,396 +78,349 @@ void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image
  // THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION
  // the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal
  // each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line
-void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){
-    //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n");
-  
-    if(du->width<2 || du->height<2 || iterations < 1){
-        sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega);
-        return;
-    }
-    
-    const int stride = du->stride, width = du->width;
-    const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1);
-    int j,iter,i,k;
-    float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); 
-    if(floatarray==NULL){
-        fprintf(stderr, "error in sor_coupled(): not enough memory\n");
-        exit(1);
-    }   
-    float *f1 = floatarray;
-    float *f2 = f1+stride;
-    float *f3 = f2+stride;
-    f1[0] = 0.0f;
-    memset(&f1[width], 0, sizeof(float)*(stride-width));
-    memset(&f2[width-1], 0, sizeof(float)*(stride-width+1));
-    memset(&f3[width-1], 0, sizeof(float)*(stride-width+1));   	  
-    	  
-    { // first iteration
-        v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1;
-        float *du_ptr = du->c1, *dv_ptr = dv->c1;
-        v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride);
-        
-        { // first iteration - first line
-        
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-            
-            { // left block
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vp);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;        
-            }
-            for(i=iterline;--i;){
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vp);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
-                for(k=0;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;
-            }
-          
-        }
-        
-        v4sf *vpt = (v4sf*) dpsis_vert->c1;
-        v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1;
-        
-        for(j=iterheight;--j;){ // first iteration - middle lines
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-                 
-            { // left block
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;           
-            }
-            for(i=iterline;--i;){
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
-                for(k=0;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;
-            }
-                
-        }
-        
-        { // first iteration - last line
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-
-            { // left block
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vpt);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;           
-            }
-            for(i=iterline;--i;){
-                // reverse 2x2 diagonal block
-                const v4sf dpsis = (*hpl) + (*hp) + (*vpt);
-                const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
-                const v4sf det = A11*A22 - (*a12p)*(*a12p);
-                *a11p = A11/det;
-                *a22p = A22/det;
-                *a12p /= -det;
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
-                for(k=0;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;
-            }
-
-        }
-    }
-
-    
-
-   for(iter=iterations;--iter;)   // other iterations
-   {
-        v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1;
-        float *du_ptr = du->c1, *dv_ptr = dv->c1;
-        v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride);
-        
-        { // other iteration - first line
-        
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-            
-            { // left block
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;        
-            }
-            for(i=iterline;--i;){
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
-                for(k=0;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;
-            }
-          
-        }
-        
-        v4sf *vpt = (v4sf*) dpsis_vert->c1;
-        v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1;
-
-        for(j=iterheight;--j;)  // other iteration - middle lines
-	{ 
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-                 
-            { // left block
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-		dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++)
-		{
-		  const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-		  const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-		  du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-		  dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;           
-            }
-            
-            for(i=iterline; --i;)
-	    {
-	      // do one iteration
-	      const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
-	      const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
-	      for(k=0;k<4;k++)
-	      {
-		  const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-		  const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-		  du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-		      dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-	      }
-	      // increment pointer
-	      hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
-	      dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
-	      du_ptr += 4; dv_ptr += 4;
-            }
-                
-        }
-        
-        { // other iteration - last line
-            memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
-            memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
-            memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
-            v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
-
-            { // left block
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
-                du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
-	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
-                for(k=1;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;           
-            }
-            for(i=iterline;--i;){
-                // do one iteration
-                const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
-                const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
-                for(k=0;k<4;k++){
-                    const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
-                    const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
-                    du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
-	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
-                }
-                // increment pointer
-                hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
-                dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
-                du_ptr += 4; dv_ptr += 4;
-            }
-
-        }
-    }
-
-
-
-    free(floatarray);
-
-}
-
-
-//THIS IS A SLOW VERSION BUT READABLE
-//Perform n iterations of the sor_coupled algorithm
-//du is used as initial guesses
-//The system form is the same as in opticalflow.c
-void sor_coupled_slow_but_readable_DE(image_t *du, const image_t *a11, const image_t *b1, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega)
-{
-    int i,j,iter;
-    for(iter = 0 ; iter<iterations ; iter++)
-    {
-	#pragma omp parallel for
-        for(j=0 ; j<du->height ; j++)
-	{
-	  float sigma_u,sum_dpsis,A11,B1;
-	        for(i=0 ; i<du->width ; i++){
-	            sigma_u = 0.0f;
-	            sum_dpsis = 0.0f;
-	            if(j>0)
-		    {
-		      sigma_u -= dpsis_vert->c1[(j-1)*du->stride+i]*du->c1[(j-1)*du->stride+i];
-		      sum_dpsis += dpsis_vert->c1[(j-1)*du->stride+i];
-		    }
-	            if(i>0)
-		    {
-		      sigma_u -= dpsis_horiz->c1[j*du->stride+i-1]*du->c1[j*du->stride+i-1];
-		      sum_dpsis += dpsis_horiz->c1[j*du->stride+i-1];
-		    }
-	            if(j<du->height-1)
-		    {
-		      sigma_u -= dpsis_vert->c1[j*du->stride+i]*du->c1[(j+1)*du->stride+i];
-		      sum_dpsis += dpsis_vert->c1[j*du->stride+i];
-		    }
-	            if(i<du->width-1)
-		    {
-		      sigma_u -= dpsis_horiz->c1[j*du->stride+i]*du->c1[j*du->stride+i+1];
-		      sum_dpsis += dpsis_horiz->c1[j*du->stride+i];
-		    }
-                A11 = a11->c1[j*du->stride+i]+sum_dpsis;
-                B1 = b1->c1[j*du->stride+i]-sigma_u;
-                du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] +omega*( B1/A11 );
-	        }
-	    }
-    }
-}
-
-
+// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){
+//     //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n");
+//   
+//     if(du->width<2 || du->height<2 || iterations < 1){
+//         sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega);
+//         return;
+//     }
+//     
+//     const int stride = du->stride, width = du->width;
+//     const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1);
+//     int j,iter,i,k;
+//     float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); 
+//     if(floatarray==NULL){
+//         fprintf(stderr, "error in sor_coupled(): not enough memory\n");
+//         exit(1);
+//     }   
+//     float *f1 = floatarray;
+//     float *f2 = f1+stride;
+//     float *f3 = f2+stride;
+//     f1[0] = 0.0f;
+//     memset(&f1[width], 0, sizeof(float)*(stride-width));
+//     memset(&f2[width-1], 0, sizeof(float)*(stride-width+1));
+//     memset(&f3[width-1], 0, sizeof(float)*(stride-width+1));   	  
+//     	  
+//     { // first iteration
+//         v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1;
+//         float *du_ptr = du->c1, *dv_ptr = dv->c1;
+//         v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride);
+//         
+//         { // first iteration - first line
+//         
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+//             
+//             { // left block
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vp);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;        
+//             }
+//             for(i=iterline;--i;){
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vp);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
+//                 for(k=0;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;
+//             }
+//           
+//         }
+//         
+//         v4sf *vpt = (v4sf*) dpsis_vert->c1;
+//         v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1;
+//         
+//         for(j=iterheight;--j;){ // first iteration - middle lines
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+//                  
+//             { // left block
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;           
+//             }
+//             for(i=iterline;--i;){
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
+//                 for(k=0;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;
+//             }
+//                 
+//         }
+//         
+//         { // first iteration - last line
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+// 
+//             { // left block
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vpt);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;           
+//             }
+//             for(i=iterline;--i;){
+//                 // reverse 2x2 diagonal block
+//                 const v4sf dpsis = (*hpl) + (*hp) + (*vpt);
+//                 const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis;
+//                 const v4sf det = A11*A22 - (*a12p)*(*a12p);
+//                 *a11p = A11/det;
+//                 *a22p = A22/det;
+//                 *a12p /= -det;
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
+//                 for(k=0;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;
+//             }
+// 
+//         }
+//     }
+// 
+//     
+// 
+//    for(iter=iterations;--iter;)   // other iterations
+//    {
+//         v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1;
+//         float *du_ptr = du->c1, *dv_ptr = dv->c1;
+//         v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride);
+//         
+//         { // other iteration - first line
+//         
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+//             
+//             { // left block
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;        
+//             }
+//             for(i=iterline;--i;){
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p);
+//                 for(k=0;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;
+//             }
+//           
+//         }
+//         
+//         v4sf *vpt = (v4sf*) dpsis_vert->c1;
+//         v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1;
+// 
+//         for(j=iterheight;--j;)  // other iteration - middle lines
+// 	{ 
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+//                  
+//             { // left block
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 		dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++)
+// 		{
+// 		  const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+// 		  const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+// 		  du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 		  dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;           
+//             }
+//             
+//             for(i=iterline; --i;)
+// 	    {
+// 	      // do one iteration
+// 	      const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p);
+// 	      const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p);
+// 	      for(k=0;k<4;k++)
+// 	      {
+// 		  const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+// 		  const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+// 		  du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 		      dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+// 	      }
+// 	      // increment pointer
+// 	      hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1;
+// 	      dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1;
+// 	      du_ptr += 4; dv_ptr += 4;
+//             }
+//                 
+//         }
+//         
+//         { // other iteration - last line
+//             memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat);   
+//             memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat);
+//             memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat);
+//             v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3;
+// 
+//             { // left block
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
+//                 du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] );
+// 	            dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] );             
+//                 for(k=1;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;           
+//             }
+//             for(i=iterline;--i;){
+//                 // do one iteration
+//                 const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p);
+//                 const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p);
+//                 for(k=0;k<4;k++){
+//                     const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k];
+//                     const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k];
+//                     du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] );
+// 	                dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] );
+//                 }
+//                 // increment pointer
+//                 hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1;
+//                 dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1;
+//                 du_ptr += 4; dv_ptr += 4;
+//             }
+// 
+//         }
+//     }
+// 
+// 
+// 
+//     free(floatarray);
+// 
+// }
 
diff --git a/src/FDF1.0.1/solver.h b/src/FDF1.0.1/solver.h
index 6064349..ee75e7a 100644
--- a/src/FDF1.0.1/solver.h
+++ b/src/FDF1.0.1/solver.h
@@ -12,8 +12,6 @@ void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *
 
 void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega);
 
-void sor_coupled_slow_but_readable_DE(image_t *du, const image_t *a11, const image_t *b1, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/oflow.cpp b/src/oflow.cpp
index d38e709..f8f67ae 100644
--- a/src/oflow.cpp
+++ b/src/oflow.cpp
@@ -47,9 +47,9 @@ namespace OFC {
     op.n_vals = 3 * pow(op.patch_size, 2);
     op.n_scales = op.coarsest_scale - op.finest_scale + 1;
     float norm_outlier2 = pow(op.norm_outlier, 2);
-    op.norm_outlier_tmpbsq = (v4sf) {norm_outlier2, norm_outlier2, norm_outlier2, norm_outlier2};
-    op.norm_outlier_tmp2bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.twos);
-    op.norm_outlier_tmp4bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.fours);
+    // op.norm_outlier_tmpbsq = (v4sf) {norm_outlier2, norm_outlier2, norm_outlier2, norm_outlier2};
+    // op.norm_outlier_tmp2bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.twos);
+    // op.norm_outlier_tmp4bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.fours);
     op.dp_thresh = 0.05 * 0.05;
     op.dr_thresh = 0.95;
     op.res_thresh = 0.0;
diff --git a/src/params.h b/src/params.h
index bee846c..a1bae87 100644
--- a/src/params.h
+++ b/src/params.h
@@ -5,8 +5,6 @@
 
 namespace OFC {
 
-  typedef __v4sf v4sf;
-
 
   typedef struct {
     int width;      // image width, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height
@@ -55,15 +53,15 @@ namespace OFC {
     cublasHandle_t cublasHandle;
 
     // Helper variables
-    v4sf zero     = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f};
-    v4sf negzero  = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f};
-    v4sf half     = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f};
-    v4sf ones     = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f};
-    v4sf twos     = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f};
-    v4sf fours    = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f};
-    v4sf norm_outlier_tmpbsq;
-    v4sf norm_outlier_tmp2bsq;
-    v4sf norm_outlier_tmp4bsq;
+    // v4sf zero     = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f};
+    // v4sf negzero  = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f};
+    // v4sf half     = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f};
+    // v4sf ones     = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f};
+    // v4sf twos     = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f};
+    // v4sf fours    = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f};
+    // v4sf norm_outlier_tmpbsq;
+    // v4sf norm_outlier_tmp2bsq;
+    // v4sf norm_outlier_tmp4bsq;
   } opt_params;
 
 }
diff --git a/src/patch.cpp b/src/patch.cpp
index f48da59..dbffe0d 100644
--- a/src/patch.cpp
+++ b/src/patch.cpp
@@ -28,7 +28,6 @@ using std::vector;
 
 namespace OFC {
 
-  typedef __v4sf v4sf;
 
   PatClass::PatClass(
       const img_params* _i_params,
diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp
index 8086abb..c40f25d 100644
--- a/src/refine_variational.cpp
+++ b/src/refine_variational.cpp
@@ -10,6 +10,7 @@
 #include <Eigen/Dense>
 
 #include <stdio.h>
+#include <arm_neon.h>
 
 #include "refine_variational.h"
 
@@ -144,7 +145,6 @@ namespace OFC {
                   *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives
 
     // warp second image
-
     image_warp(w_im2, mask, im2, wx, wy);
     // compute derivatives
     get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz);
@@ -165,16 +165,16 @@ namespace OFC {
       sub_laplacian(b2, wy, smooth_horiz, smooth_vert);
 
       // solve system
-#ifdef WITH_OPENMP
+// #ifdef WITH_OPENMP
       sor_coupled_slow_but_readable(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // slower but parallelized
-#else
-      sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega);
-#endif
+// #else
+//      sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega);
+// #endif
 
       // update flow plus flow increment
       int i;
       v4sf *uup = (v4sf*) uu->c1, *vvp = (v4sf*) vv->c1, *wxp = (v4sf*) wx->c1, *wyp = (v4sf*) wy->c1, *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1;
-      for( i=0 ; i<height*stride/4 ; i++) {
+      for( i=0 ; i<height*stride/VECTOR_WIDTH ; i++) {
         (*uup) = (*wxp) + (*dup);
         (*vvp) = (*wyp) + (*dvp);
         uup+=1; vvp+=1; wxp+=1; wyp+=1;dup+=1;dvp+=1;
@@ -200,77 +200,6 @@ namespace OFC {
   }
 
 
-  void VarRefClass::RefLevelDE(image_t *wx, const color_image_t *im1, const color_image_t *im2) {
-
-    int i_inner_iteration;
-    int width  = wx->width;
-    int height = wx->height;
-    int stride = wx->stride;
-
-    image_t *du = image_new(width,height), *wy_dummy = image_new(width,height), // the flow increment
-            *mask = image_new(width,height), // mask containing 0 if a point goes outside image boundary, 1 otherwise
-            *smooth_horiz = image_new(width,height), *smooth_vert = image_new(width,height), // horiz: (i,j) contains the diffusivity coeff. from (i,j) to (i+1,j)
-            *uu = image_new(width,height), // flow plus flow increment
-            *a11 = image_new(width,height), // system matrix A of Ax=b for each pixel
-            *b1 = image_new(width,height); // system matrix b of Ax=b for each pixel
-
-    image_erase(wy_dummy);
-
-    color_image_t *w_im2 = color_image_new(width,height), // warped second image
-                  *Ix = color_image_new(width,height), *Iy = color_image_new(width,height), *Iz = color_image_new(width,height), // first order derivatives
-                  *Ixx = color_image_new(width,height), *Ixy = color_image_new(width,height), *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives
-
-    // warp second image
-    image_warp(w_im2, mask, im2, wx, wy_dummy);
-    // compute derivatives
-    get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz);
-    // erase du and dv
-    image_erase(du);
-
-    // initialize uu and vv
-    memcpy(uu->c1,wx->c1,wx->stride*wx->height*sizeof(float));
-
-    // inner fixed point iterations
-    for(i_inner_iteration = 0 ; i_inner_iteration < vr.inner_iter ; i_inner_iteration++) {
-
-      //  compute robust function and system
-      compute_smoothness(smooth_horiz, smooth_vert, uu, wy_dummy, deriv_flow, vr.tmp_quarter_alpha );
-      compute_data_DE(a11, b1, mask, wx, du, uu, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3);
-      sub_laplacian(b1, wx, smooth_horiz, smooth_vert);
-
-      // solve system
-      sor_coupled_slow_but_readable_DE(du, a11, b1, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega);
-
-      // update flow plus flow increment
-      int i;
-      v4sf *uup = (v4sf*) uu->c1, *wxp = (v4sf*) wx->c1, *dup = (v4sf*) du->c1;
-
-      for( i=0 ; i<height*stride/4 ; i++)
-      {
-        (*uup) = __builtin_ia32_minps(   (*wxp) + (*dup)   ,  op->zero);
-        uup+=1; wxp+=1; dup+=1;
-      }
-
-    }
-
-    // add flow increment to current flow
-    memcpy(wx->c1,uu->c1,uu->stride*uu->height*sizeof(float));
-
-    // free memory
-    image_delete(du); image_delete(wy_dummy);
-    image_delete(mask);
-    image_delete(smooth_horiz); image_delete(smooth_vert);
-    image_delete(uu);
-    image_delete(a11);
-    image_delete(b1);
-
-    color_image_delete(w_im2);
-    color_image_delete(Ix); color_image_delete(Iy); color_image_delete(Iz);
-    color_image_delete(Ixx); color_image_delete(Ixy); color_image_delete(Iyy); color_image_delete(Ixz); color_image_delete(Iyz);
-
-  }
-
-
   VarRefClass::~VarRefClass() { }
 
 }
diff --git a/src/refine_variational.h b/src/refine_variational.h
index ca469f1..9f98d99 100644
--- a/src/refine_variational.h
+++ b/src/refine_variational.h
@@ -9,7 +9,12 @@
 
 namespace OFC {
 
-  typedef __v4sf v4sf;
+  // typedef __v4sf v4sf;
+#if (VECTOR_WIDTH == 4)
+  typedef float32x4_t v4sf;
+#else
+  typedef float v4sf;
+#endif
 
   typedef struct {
 
@@ -42,7 +47,6 @@ namespace OFC {
 
       void copyimage(const float* img, color_image_t * img_t);
       void RefLevelOF(image_t *wx, image_t *wy, const color_image_t *im1, const color_image_t *im2);
-      void RefLevelDE(image_t *wx, const color_image_t *im1, const color_image_t *im2);
 
       VR_params vr;
       const img_params* i_params;